From a6018e3ca0cd96c004cb17e3659cc39b664ff0a1 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 24 Oct 2022 10:29:23 +0200 Subject: [PATCH 001/117] alpine files moved to sub-directory --- alpine/{ => ElectrostaticPIC}/BumponTailInstability.cpp | 0 alpine/{ => ElectrostaticPIC}/CMakeLists.txt | 0 alpine/{ => ElectrostaticPIC}/ChargedParticles.hpp | 0 alpine/{ => ElectrostaticPIC}/LandauDamping.cpp | 0 alpine/{ => ElectrostaticPIC}/PenningTrap.cpp | 0 alpine/{ => ElectrostaticPIC}/UniformPlasmaTest.cpp | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename alpine/{ => ElectrostaticPIC}/BumponTailInstability.cpp (100%) rename alpine/{ => ElectrostaticPIC}/CMakeLists.txt (100%) rename alpine/{ => ElectrostaticPIC}/ChargedParticles.hpp (100%) rename alpine/{ => ElectrostaticPIC}/LandauDamping.cpp (100%) rename alpine/{ => ElectrostaticPIC}/PenningTrap.cpp (100%) rename alpine/{ => ElectrostaticPIC}/UniformPlasmaTest.cpp (100%) diff --git a/alpine/BumponTailInstability.cpp b/alpine/ElectrostaticPIC/BumponTailInstability.cpp similarity index 100% rename from alpine/BumponTailInstability.cpp rename to alpine/ElectrostaticPIC/BumponTailInstability.cpp diff --git a/alpine/CMakeLists.txt b/alpine/ElectrostaticPIC/CMakeLists.txt similarity index 100% rename from alpine/CMakeLists.txt rename to alpine/ElectrostaticPIC/CMakeLists.txt diff --git a/alpine/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp similarity index 100% rename from alpine/ChargedParticles.hpp rename to alpine/ElectrostaticPIC/ChargedParticles.hpp diff --git a/alpine/LandauDamping.cpp b/alpine/ElectrostaticPIC/LandauDamping.cpp similarity index 100% rename from alpine/LandauDamping.cpp rename to alpine/ElectrostaticPIC/LandauDamping.cpp diff --git a/alpine/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp similarity index 100% rename from alpine/PenningTrap.cpp rename to alpine/ElectrostaticPIC/PenningTrap.cpp diff --git a/alpine/UniformPlasmaTest.cpp b/alpine/ElectrostaticPIC/UniformPlasmaTest.cpp similarity index 100% rename from alpine/UniformPlasmaTest.cpp rename to alpine/ElectrostaticPIC/UniformPlasmaTest.cpp From 6e2186b75ca1a49e9a7648a6e0ea860a11e8e44a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 24 Oct 2022 10:35:34 +0200 Subject: [PATCH 002/117] Files copied from PIC directory to PIF for modifying --- alpine/ElectrostaticPIF/CMakeLists.txt | 26 + .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 624 ++++++++++++++++++ alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 410 ++++++++++++ 3 files changed, 1060 insertions(+) create mode 100644 alpine/ElectrostaticPIF/CMakeLists.txt create mode 100644 alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp create mode 100644 alpine/ElectrostaticPIF/LandauDampingPIF.cpp diff --git a/alpine/ElectrostaticPIF/CMakeLists.txt b/alpine/ElectrostaticPIF/CMakeLists.txt new file mode 100644 index 000000000..60fa9678b --- /dev/null +++ b/alpine/ElectrostaticPIF/CMakeLists.txt @@ -0,0 +1,26 @@ +file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") +message (STATUS "Adding index test found in ${_relPath}") + +include_directories ( + ${CMAKE_SOURCE_DIR}/src +) + +link_directories ( + ${CMAKE_CURRENT_SOURCE_DIR} + ${Kokkos_DIR}/.. 
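# An illustrative build and run of the target added below (assuming an existing
# IPPL/Kokkos/MPI build tree and a Makefile generator; the command-line arguments
# are the ones documented at the top of LandauDampingPIF.cpp):
#
#   make LandauDampingPIF
#   srun ./LandauDampingPIF 128 128 128 10000 10 FFT 0.01 2.0 --info 10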
+) + +set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES}) +set (COMPILE_FLAGS ${OPAL_CXX_FLAGS}) + +add_executable (LandauDampingPIF LandauDampingPIF.cpp) +target_link_libraries (LandauDampingPIF ${IPPL_LIBS}) + +# vi: set et ts=4 sw=4 sts=4: + +# Local Variables: +# mode: cmake +# cmake-tab-width: 4 +# indent-tabs-mode: nil +# require-final-newline: nil +# End: diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp new file mode 100644 index 000000000..e64417e19 --- /dev/null +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -0,0 +1,624 @@ +// ChargedParticles header file +// Defines a particle attribute for charged particles to be used in +// test programs +// +// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . +// + +#include "Ippl.h" +#include "Solver/FFTPeriodicPoissonSolver.h" + +// dimension of our positions +constexpr unsigned Dim = 3; + +// some typedefs +typedef ippl::ParticleSpatialLayout PLayout_t; +typedef ippl::UniformCartesian Mesh_t; +typedef ippl::FieldLayout FieldLayout_t; +typedef ippl::OrthogonalRecursiveBisection ORB; + +using size_type = ippl::detail::size_type; + +template +using Vector = ippl::Vector; + +template +using Field = ippl::Field; + +template +using ParticleAttrib = ippl::ParticleAttrib; + +typedef Vector Vector_t; +typedef Field Field_t; +typedef Field VField_t; +typedef ippl::FFTPeriodicPoissonSolver Solver_t; + +const double pi = std::acos(-1.0); + +// Test programs have to define this variable for VTK dump purposes +extern const char* TestName; + +void dumpVTK(VField_t& E, int nx, int ny, int nz, int iteration, + double dx, double dy, double dz) { + + + typename VField_t::view_type::host_mirror_type host_view = E.getHostMirror(); + + std::stringstream fname; + fname << "data/ef_"; + fname << std::setw(4) << std::setfill('0') << iteration; + fname << ".vtk"; + + Kokkos::deep_copy(host_view, E.getView()); + + Inform vtkout(NULL, fname.str().c_str(), Inform::OVERWRITE); + vtkout.precision(10); + vtkout.setf(std::ios::scientific, std::ios::floatfield); + + // start with header + vtkout << "# vtk DataFile Version 2.0" << endl; + vtkout << TestName << endl; + vtkout << "ASCII" << endl; + vtkout << "DATASET STRUCTURED_POINTS" << endl; + vtkout << "DIMENSIONS " << nx+3 << " " << ny+3 << " " << nz+3 << endl; + vtkout << "ORIGIN " << -dx << " " << -dy << " " << -dz << endl; + vtkout << "SPACING " << dx << " " << dy << " " << dz << endl; + vtkout << "CELL_DATA " << (nx+2)*(ny+2)*(nz+2) << endl; + + vtkout << "VECTORS E-Field float" << endl; + for (int z=0; z +class ChargedParticles : public ippl::ParticleBase { +public: + VField_t E_m; + Field_t rho_m; + + // ORB + ORB orb; + + Vector nr_m; + + ippl::e_dim_tag decomp_m[Dim]; + + Vector_t hr_m; + Vector_t rmin_m; + Vector_t rmax_m; + + double Q_m; + + std::string stype_m; + + std::shared_ptr solver_mp; + + double time_m; + + double rhoNorm_m; + + unsigned int loadbalancefreq_m; + + double loadbalancethreshold_m; + + +public: + ParticleAttrib q; // charge + typename ippl::ParticleBase::particle_position_type P; // particle 
velocity + typename ippl::ParticleBase::particle_position_type E; // electric field at particle position + + + /* + This constructor is mandatory for all derived classes from + ParticleBase as the bunch buffer uses this + */ + ChargedParticles(PLayout& pl) + : ippl::ParticleBase(pl) + { + // register the particle attributes + this->addAttribute(q); + this->addAttribute(P); + this->addAttribute(E); + } + + ChargedParticles(PLayout& pl, + Vector_t hr, + Vector_t rmin, + Vector_t rmax, + ippl::e_dim_tag decomp[Dim], + double Q) + : ippl::ParticleBase(pl) + , hr_m(hr) + , rmin_m(rmin) + , rmax_m(rmax) + , Q_m(Q) + { + // register the particle attributes + this->addAttribute(q); + this->addAttribute(P); + this->addAttribute(E); + setupBCs(); + for (unsigned int i = 0; i < Dim; i++) + decomp_m[i]=decomp[i]; + } + + ~ChargedParticles(){ } + + void setupBCs() { + setBCAllPeriodic(); + } + + void updateLayout(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticles& buffer, + bool& isFirstRepartition) { + // Update local fields + static IpplTimings::TimerRef tupdateLayout = IpplTimings::getTimer("updateLayout"); + IpplTimings::startTimer(tupdateLayout); + this->E_m.updateLayout(fl); + this->rho_m.updateLayout(fl); + + // Update layout with new FieldLayout + PLayout& layout = this->getLayout(); + layout.updateLayout(fl, mesh); + IpplTimings::stopTimer(tupdateLayout); + static IpplTimings::TimerRef tupdatePLayout = IpplTimings::getTimer("updatePB"); + IpplTimings::startTimer(tupdatePLayout); + if(!isFirstRepartition) { + layout.update(*this, buffer); + } + IpplTimings::stopTimer(tupdatePLayout); + } + + void initializeORB(FieldLayout_t& fl, Mesh_t& mesh) { + orb.initialize(fl, mesh, rho_m); + } + + void repartition(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticles& buffer, + bool& isFirstRepartition) { + // Repartition the domains + bool res = orb.binaryRepartition(this->R, fl, isFirstRepartition); + + if (res != true) { + std::cout << "Could not repartition!" 
<< std::endl; + return; + } + // Update + this->updateLayout(fl, mesh, buffer, isFirstRepartition); + this->solver_mp->setRhs(rho_m); + } + + bool balance(size_type totalP, const unsigned int nstep){ + if(std::strcmp(TestName,"UniformPlasmaTest") == 0) { + return (nstep % loadbalancefreq_m == 0); + } + else { + int local = 0; + std::vector res(Ippl::Comm->size()); + double equalPart = (double) totalP / Ippl::Comm->size(); + double dev = std::abs((double)this->getLocalNum() - equalPart) / totalP; + if (dev > loadbalancethreshold_m) + local = 1; + MPI_Allgather(&local, 1, MPI_INT, res.data(), 1, MPI_INT, Ippl::getComm()); + + for (unsigned int i = 0; i < res.size(); i++) { + if (res[i] == 1) + return true; + } + return false; + } + } + + void gatherStatistics(size_type totalP) { + std::vector imb(Ippl::Comm->size()); + double equalPart = (double) totalP / Ippl::Comm->size(); + double dev = (std::abs((double)this->getLocalNum() - equalPart) + / totalP) * 100.0; + MPI_Gather(&dev, 1, MPI_DOUBLE, imb.data(), 1, MPI_DOUBLE, 0, + Ippl::getComm()); + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/LoadBalance_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(5); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, rank, imbalance percentage" << endl; + } + + for(int r=0; r < Ippl::Comm->size(); ++r) { + csvout << time_m << " " + << r << " " + << imb[r] << endl; + } + } + + Ippl::Comm->barrier(); + + } + + void gatherCIC() { + + gather(this->E, E_m, this->R); + + } + + void scatterCIC(size_type totalP, unsigned int iteration, Vector_t& hrField) { + + + Inform m("scatter "); + + rho_m = 0.0; + scatter(q, rho_m, this->R); + + static IpplTimings::TimerRef sumTimer = IpplTimings::getTimer("Check"); + IpplTimings::startTimer(sumTimer); + double Q_grid = rho_m.sum(); + + size_type Total_particles = 0; + size_type local_particles = this->getLocalNum(); + + MPI_Reduce(&local_particles, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, 0, Ippl::getComm()); + + double rel_error = std::fabs((Q_m-Q_grid)/Q_m); + m << "Rel. error in charge conservation = " << rel_error << endl; + + if(Ippl::Comm->rank() == 0) { + if(Total_particles != totalP || rel_error > 1e-10) { + m << "Time step: " << iteration << endl; + m << "Total particles in the sim. " << totalP + << " " << "after update: " + << Total_particles << endl; + m << "Rel. 
error in charge conservation: " + << rel_error << endl; + std::abort(); + } + } + + rho_m = rho_m / (hrField[0] * hrField[1] * hrField[2]); + + rhoNorm_m = norm(rho_m); + IpplTimings::stopTimer(sumTimer); + + //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + + //rho = rho_e - rho_i + rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); + } + + void initSolver() { + + Inform m("solver "); + if(stype_m == "FFT") + initFFTSolver(); + else + m << "No solver matches the argument" << endl; + + } + + void initFFTSolver() { + ippl::ParameterList sp; + sp.add("output_type", Solver_t::GRAD); + sp.add("use_heffte_defaults", false); + sp.add("use_pencils", true); + sp.add("use_reorder", false); + sp.add("use_gpu_aware", true); + sp.add("comm", ippl::p2p_pl); + sp.add("r2c_direction", 0); + + solver_mp = std::make_shared(); + + solver_mp->mergeParameters(sp); + + solver_mp->setRhs(rho_m); + + solver_mp->setLhs(E_m); + } + + + + void dumpData() { + + auto Pview = P.getView(); + + double Energy = 0.0; + + Kokkos::parallel_reduce("Particle Energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Pview(i), Pview(i)).apply(); + valL += myVal; + }, Kokkos::Sum(Energy)); + + Energy *= 0.5; + double gEnergy = 0.0; + + MPI_Reduce(&Energy, &gEnergy, 1, + MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + + const int nghostE = E_m.getNghost(); + auto Eview = E_m.getView(); + Vector_t normE; + using mdrange_type = Kokkos::MDRangePolicy>; + + for (unsigned d=0; d(temp)); + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + normE[d] = std::sqrt(globaltemp); + } + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/ParticleField_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Kinetic energy, Rho_norm2, Ex_norm2, Ey_norm2, Ez_norm2" << endl; + } + + csvout << time_m << " " + << gEnergy << " " + << rhoNorm_m << " " + << normE[0] << " " + << normE[1] << " " + << normE[2] << endl; + } + + Ippl::Comm->barrier(); + } + + void dumpLandau() { + + const int nghostE = E_m.getNghost(); + auto Eview = E_m.getView(); + double fieldEnergy, ExAmp; + using mdrange_type = Kokkos::MDRangePolicy>; + + double temp = 0.0; + Kokkos::parallel_reduce("Ex inner product", + mdrange_type({nghostE, nghostE, nghostE}, + {Eview.extent(0) - nghostE, + Eview.extent(1) - nghostE, + Eview.extent(2) - nghostE}), + KOKKOS_LAMBDA(const size_t i, const size_t j, + const size_t k, double& valL) + { + double myVal = std::pow(Eview(i, j, k)[0], 2); + valL += myVal; + }, Kokkos::Sum(temp)); + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; + + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", + mdrange_type({nghostE, nghostE, nghostE}, + {Eview.extent(0) - nghostE, + Eview.extent(1) - nghostE, + Eview.extent(2) - nghostE}), + KOKKOS_LAMBDA(const size_t i, const size_t j, + const size_t k, double& valL) + { + double myVal = std::fabs(Eview(i, j, k)[0]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + ExAmp = 0.0; + MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + if (Ippl::Comm->rank() == 0) { + 
std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } + + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; + + } + + Ippl::Comm->barrier(); + } + + void dumpBumponTail() { + + const int nghostE = E_m.getNghost(); + auto Eview = E_m.getView(); + double fieldEnergy, EzAmp; + using mdrange_type = Kokkos::MDRangePolicy>; + + double temp = 0.0; + Kokkos::parallel_reduce("Ex inner product", + mdrange_type({nghostE, nghostE, nghostE}, + {Eview.extent(0) - nghostE, + Eview.extent(1) - nghostE, + Eview.extent(2) - nghostE}), + KOKKOS_LAMBDA(const size_t i, const size_t j, + const size_t k, double& valL) + { + double myVal = std::pow(Eview(i, j, k)[2], 2); + valL += myVal; + }, Kokkos::Sum(temp)); + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; + + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", + mdrange_type({nghostE, nghostE, nghostE}, + {Eview.extent(0) - nghostE, + Eview.extent(1) - nghostE, + Eview.extent(2) - nghostE}), + KOKKOS_LAMBDA(const size_t i, const size_t j, + const size_t k, double& valL) + { + double myVal = std::fabs(Eview(i, j, k)[2]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + EzAmp = 0.0; + MPI_Reduce(&tempMax, &EzAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldBumponTail_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Ez_field_energy, Ez_max_norm" << endl; + } + + csvout << time_m << " " + << fieldEnergy << " " + << EzAmp << endl; + + } + + Ippl::Comm->barrier(); + } + + void dumpParticleData() { + + typename ParticleAttrib::HostMirror R_host = this->R.getHostMirror(); + typename ParticleAttrib::HostMirror P_host = this->P.getHostMirror(); + Kokkos::deep_copy(R_host, this->R.getView()); + Kokkos::deep_copy(P_host, P.getView()); + std::stringstream pname; + pname << "data/ParticleIC_"; + pname << Ippl::Comm->rank(); + pname << ".csv"; + Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout.precision(10); + pcsvout.setf(std::ios::scientific, std::ios::floatfield); + pcsvout << "R_x, R_y, R_z, V_x, V_y, V_z" << endl; + for (size_type i = 0; i< this->getLocalNum(); i++) { + pcsvout << R_host(i)[0] << " " + << R_host(i)[1] << " " + << R_host(i)[2] << " " + << P_host(i)[0] << " " + << P_host(i)[1] << " " + << P_host(i)[2] << endl; + } + Ippl::Comm->barrier(); + } + + void dumpLocalDomains(const FieldLayout_t& fl, const unsigned int step) { + + if (Ippl::Comm->rank() == 0) { + const typename FieldLayout_t::host_mirror_type domains = fl.getHostLocalDomains(); + std::ofstream myfile; + myfile.open("data/domains" + std::to_string(step) + ".txt"); + for (unsigned int i = 0; i < domains.size(); ++i) { + myfile << domains[i][0].first() << " " << domains[i][1].first() << " " << domains[i][2].first() << " " + << domains[i][0].first() << " " << domains[i][1].last() << " " << domains[i][2].first() << " " + << domains[i][0].last() << 
" " << domains[i][1].first() << " " << domains[i][2].first() << " " + << domains[i][0].first() << " " << domains[i][1].first() << " " << domains[i][2].last() + << "\n"; + } + myfile.close(); + } + Ippl::Comm->barrier(); + } + +private: + void setBCAllPeriodic() { + + this->setParticleBC(ippl::BC::PERIODIC); + } + +}; diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp new file mode 100644 index 000000000..e78dd91bf --- /dev/null +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -0,0 +1,410 @@ +// Landau Damping Test +// Usage: +// srun ./LandauDamping --info 10 +// nx = No. cell-centered points in the x-direction +// ny = No. cell-centered points in the y-direction +// nz = No. cell-centered points in the z-direction +// Np = Total no. of macro-particles in the simulation +// Nt = Number of time steps +// stype = Field solver type e.g., FFT +// lbthres = Load balancing threshold i.e., lbthres*100 is the maximum load imbalance +// percentage which can be tolerated and beyond which +// particle load balancing occurs. A value of 0.01 is good for many typical +// simulations. +// ovfactor = Over-allocation factor for the buffers used in the communication. Typical +// values are 1.0, 2.0. Value 1.0 means no over-allocation. +// Example: +// srun ./LandauDamping 128 128 128 10000 10 FFT 0.01 2.0 --info 10 +// +// Copyright (c) 2021, Sriramkrishnan Muralikrishnan, +// Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . 
+// + +#include "ChargedParticles.hpp" +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "Utility/IpplTimings.h" + +template +struct Newton1D { + + double tol = 1e-12; + int max_iter = 20; + double pi = std::acos(-1.0); + + T k, alpha, u; + + KOKKOS_INLINE_FUNCTION + Newton1D() {} + + KOKKOS_INLINE_FUNCTION + Newton1D(const T& k_, const T& alpha_, + const T& u_) + : k(k_), alpha(alpha_), u(u_) {} + + KOKKOS_INLINE_FUNCTION + ~Newton1D() {} + + KOKKOS_INLINE_FUNCTION + T f(T& x) { + T F; + F = x + (alpha * (std::sin(k * x) / k)) - u; + return F; + } + + KOKKOS_INLINE_FUNCTION + T fprime(T& x) { + T Fprime; + Fprime = 1 + (alpha * std::cos(k * x)); + return Fprime; + } + + KOKKOS_FUNCTION + void solve(T& x) { + int iterations = 0; + while (iterations < max_iter && std::fabs(f(x)) > tol) { + x = x - (f(x)/fprime(x)); + iterations += 1; + } + } +}; + + +template +struct generate_random { + + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + // Output View for the random numbers + view_type x, v; + + // The GeneratorPool + GeneratorPool rand_pool; + + value_type alpha; + + T k, minU, maxU; + + // Initialize all members + generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, + value_type& alpha_, T& k_, T& minU_, T& maxU_) + : x(x_), v(v_), rand_pool(rand_pool_), + alpha(alpha_), k(k_), minU(minU_), maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + value_type u; + for (unsigned d = 0; d < Dim; ++d) { + + u = rand_gen.drand(minU[d], maxU[d]); + x(i)[d] = u / (1 + alpha); + Newton1D solver(k[d], alpha, u); + solver.solve(x(i)[d]); + v(i)[d] = rand_gen.normal(0.0, 1.0); + } + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +double CDF(const double& x, const double& alpha, const double& k) { + double cdf = x + (alpha / k) * std::sin(k * x); + return cdf; +} + +KOKKOS_FUNCTION +double PDF(const Vector_t& xvec, const double& alpha, + const Vector_t& kw, const unsigned Dim) { + double pdf = 1.0; + + for (unsigned d = 0; d < Dim; ++d) { + pdf *= (1.0 + alpha * std::cos(kw[d] * xvec[d])); + } + return pdf; +} + +const char* TestName = "LandauDamping"; + +int main(int argc, char *argv[]){ + Ippl ippl(argc, argv); + + Inform msg("LandauDamping"); + Inform msg2all("LandauDamping",INFORM_ALL_NODES); + + Ippl::Comm->setDefaultOverallocation(std::atof(argv[8])); + + auto start = std::chrono::high_resolution_clock::now(); + ippl::Vector nr = { + std::atoi(argv[1]), + std::atoi(argv[2]), + std::atoi(argv[3]) + }; + + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); + static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); + static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick"); + static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift"); + static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update"); + static IpplTimings::TimerRef DummySolveTimer = IpplTimings::getTimer("solveWarmup"); + static IpplTimings::TimerRef SolveTimer = IpplTimings::getTimer("solve"); + static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("domainDecomp"); + + 
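Newton1D and generate_random above implement inverse-CDF sampling of the Landau-damping
position distribution f(x) ~ 1 + alpha*cos(kw*x): per dimension, a uniform sample u drawn
between CDF(rmin) and CDF(rmax) is mapped to a position by solving
CDF(x) = x + alpha*sin(kw*x)/kw = u with Newton's method, while velocities are drawn from a
unit normal. A minimal self-contained sketch of that root solve (plain C++ without Kokkos;
the helper name invertCDF is only for the sketch, and the parameter values are just the ones
this test uses later):

#include <cmath>
#include <cstdio>

// Solve CDF(x) = x + alpha*sin(k*x)/k = u, the same update as Newton1D above.
double invertCDF(double u, double alpha, double k) {
    double x = u / (1.0 + alpha);                      // initial guess used by generate_random
    for (int it = 0; it < 20; ++it) {                  // max_iter = 20, tol = 1e-12 as above
        double f = x + alpha * std::sin(k * x) / k - u;
        if (std::fabs(f) < 1e-12) break;
        double fprime = 1.0 + alpha * std::cos(k * x); // CDF'(x) is the position PDF
        x -= f / fprime;
    }
    return x;
}

int main() {
    double alpha = 0.05, kw = 0.5;                     // values used in this test
    double u = 3.1;                                    // a uniform sample in [CDF(0), CDF(4*pi))
    std::printf("sampled position x = %.12f\n", invertCDF(u, alpha, kw));
    return 0;
}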
IpplTimings::startTimer(mainTimer); + + const size_type totalP = std::atoll(argv[4]); + const unsigned int nt = std::atoi(argv[5]); + + msg << "Landau damping" + << endl + << "nt " << nt << " Np= " + << totalP << " grid = " << nr + << endl; + + using bunch_type = ChargedParticles; + + std::unique_ptr P; + + ippl::NDIndex domain; + for (unsigned i = 0; i< Dim; i++) { + domain[i] = ippl::Index(nr[i]); + } + + ippl::e_dim_tag decomp[Dim]; + for (unsigned d = 0; d < Dim; ++d) { + decomp[d] = ippl::PARALLEL; + } + + // create mesh and layout objects for this problem domain + Vector_t kw = {0.5, 0.5, 0.5}; + double alpha = 0.05; + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw ; + double dx = rmax[0] / nr[0]; + double dy = rmax[1] / nr[1]; + double dz = rmax[2] / nr[2]; + + Vector_t hr = {dx, dy, dz}; + Vector_t origin = {rmin[0], rmin[1], rmin[2]}; + const double dt = 0.5*dx; + + const bool isAllPeriodic=true; + Mesh_t mesh(domain, hr, origin); + FieldLayout_t FL(domain, decomp, isAllPeriodic); + PLayout_t PL(FL, mesh); + + //Q = -\int\int f dx dv + double Q = -rmax[0] * rmax[1] * rmax[2]; + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q); + + P->nr_m = nr; + + P->E_m.initialize(mesh, FL); + P->rho_m.initialize(mesh, FL); + + bunch_type bunchBuffer(PL); + + P->stype_m = argv[6]; + P->initSolver(); + P->time_m = 0.0; + P->loadbalancethreshold_m = std::atof(argv[7]); + + bool isFirstRepartition; + + if ((P->loadbalancethreshold_m != 1.0) && (Ippl::Comm->size() > 1)) { + msg << "Starting first repartition" << endl; + IpplTimings::startTimer(domainDecomposition); + isFirstRepartition = true; + const ippl::NDIndex& lDom = FL.getLocalNDIndex(); + const int nghost = P->rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + auto rhoview = P->rho_m.getView(); + + Kokkos::parallel_for("Assign initial rho based on PDF", + mdrange_type({nghost, nghost, nghost}, + {rhoview.extent(0) - nghost, + rhoview.extent(1) - nghost, + rhoview.extent(2) - nghost}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k) + { + //local to global index conversion + const size_t ig = i + lDom[0].first() - nghost; + const size_t jg = j + lDom[1].first() - nghost; + const size_t kg = k + lDom[2].first() - nghost; + double x = (ig + 0.5) * hr[0] + origin[0]; + double y = (jg + 0.5) * hr[1] + origin[1]; + double z = (kg + 0.5) * hr[2] + origin[2]; + + Vector_t xvec = {x, y, z}; + + rhoview(i, j, k) = PDF(xvec, alpha, kw, Dim); + + }); + + Kokkos::fence(); + + P->initializeORB(FL, mesh); + P->repartition(FL, mesh, bunchBuffer, isFirstRepartition); + IpplTimings::stopTimer(domainDecomposition); + } + + msg << "First domain decomposition done" << endl; + IpplTimings::startTimer(particleCreation); + + typedef ippl::detail::RegionLayout RegionLayout_t; + const RegionLayout_t& RLayout = PL.getRegionLayout(); + const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); + Vector_t Nr, Dr, minU, maxU; + int myRank = Ippl::Comm->rank(); + for (unsigned d = 0; d rank() < rest ) + ++nloc; + + P->create(nloc); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + P->R.getView(), P->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + + Kokkos::fence(); + Ippl::Comm->barrier(); + IpplTimings::stopTimer(particleCreation); + + P->q = P->Q_m/totalP; + msg << "particles created and initial conditions assigned " << endl; + isFirstRepartition = false; + //The update after the particle creation is not needed as 
the + //particles are generated locally + + IpplTimings::startTimer(DummySolveTimer); + P->rho_m = 0.0; + P->solver_mp->solve(); + IpplTimings::stopTimer(DummySolveTimer); + + P->scatterCIC(totalP, 0, hr); + + IpplTimings::startTimer(SolveTimer); + P->solver_mp->solve(); + IpplTimings::stopTimer(SolveTimer); + + P->gatherCIC(); + + IpplTimings::startTimer(dumpDataTimer); + P->dumpLandau(); + P->gatherStatistics(totalP); + //P->dumpLocalDomains(FL, 0); + IpplTimings::stopTimer(dumpDataTimer); + + // begin main timestep loop + msg << "Starting iterations ..." << endl; + for (unsigned int it=0; itP = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); + + //drift + IpplTimings::startTimer(RTimer); + P->R = P->R + dt * P->P; + IpplTimings::stopTimer(RTimer); + + //Since the particles have moved spatially update them to correct processors + IpplTimings::startTimer(updateTimer); + PL.update(*P, bunchBuffer); + IpplTimings::stopTimer(updateTimer); + + // Domain Decomposition + if (P->balance(totalP, it+1)) { + msg << "Starting repartition" << endl; + IpplTimings::startTimer(domainDecomposition); + P->repartition(FL, mesh, bunchBuffer, isFirstRepartition); + IpplTimings::stopTimer(domainDecomposition); + //IpplTimings::startTimer(dumpDataTimer); + //P->dumpLocalDomains(FL, it+1); + //IpplTimings::stopTimer(dumpDataTimer); + } + + + //scatter the charge onto the underlying grid + P->scatterCIC(totalP, it+1, hr); + + //Field solve + IpplTimings::startTimer(SolveTimer); + P->solver_mp->solve(); + IpplTimings::stopTimer(SolveTimer); + + // gather E field + P->gatherCIC(); + + //kick + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); + + P->time_m += dt; + IpplTimings::startTimer(dumpDataTimer); + P->dumpLandau(); + P->gatherStatistics(totalP); + IpplTimings::stopTimer(dumpDataTimer); + msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; + } + + msg << "LandauDamping: End." 
<< endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration time_chrono = std::chrono::duration_cast>(end - start); + std::cout << "Elapsed time: " << time_chrono.count() << std::endl; + + + return 0; +} From 8c282c9470bcbda6f94ed360ea42a9d17dca2818 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 24 Oct 2022 10:37:34 +0200 Subject: [PATCH 003/117] class name changed for ChargedParticles --- alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index e64417e19..c2f12fe21 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -1,4 +1,4 @@ -// ChargedParticles header file +// ChargedParticlesPIF header file // Defines a particle attribute for charged particles to be used in // test programs // @@ -128,7 +128,7 @@ void dumpVTK(Field_t& rho, int nx, int ny, int nz, int iteration, } template -class ChargedParticles : public ippl::ParticleBase { +class ChargedParticlesPIF : public ippl::ParticleBase { public: VField_t E_m; Field_t rho_m; @@ -169,7 +169,7 @@ class ChargedParticles : public ippl::ParticleBase { This constructor is mandatory for all derived classes from ParticleBase as the bunch buffer uses this */ - ChargedParticles(PLayout& pl) + ChargedParticlesPIF(PLayout& pl) : ippl::ParticleBase(pl) { // register the particle attributes @@ -178,7 +178,7 @@ class ChargedParticles : public ippl::ParticleBase { this->addAttribute(E); } - ChargedParticles(PLayout& pl, + ChargedParticlesPIF(PLayout& pl, Vector_t hr, Vector_t rmin, Vector_t rmax, @@ -199,13 +199,13 @@ class ChargedParticles : public ippl::ParticleBase { decomp_m[i]=decomp[i]; } - ~ChargedParticles(){ } + ~ChargedParticlesPIF(){ } void setupBCs() { setBCAllPeriodic(); } - void updateLayout(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticles& buffer, + void updateLayout(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, bool& isFirstRepartition) { // Update local fields static IpplTimings::TimerRef tupdateLayout = IpplTimings::getTimer("updateLayout"); @@ -229,7 +229,7 @@ class ChargedParticles : public ippl::ParticleBase { orb.initialize(fl, mesh, rho_m); } - void repartition(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticles& buffer, + void repartition(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, bool& isFirstRepartition) { // Repartition the domains bool res = orb.binaryRepartition(this->R, fl, isFirstRepartition); From 3cb368d0f6a437d611e32c9b4e31e268e1ed944b Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 24 Oct 2022 10:41:44 +0200 Subject: [PATCH 004/117] CMakeLists added for alpine --- alpine/CMakeLists.txt | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 alpine/CMakeLists.txt diff --git a/alpine/CMakeLists.txt b/alpine/CMakeLists.txt new file mode 100644 index 000000000..ffba1ba3c --- /dev/null +++ b/alpine/CMakeLists.txt @@ -0,0 +1,26 @@ +macro(list_subdirectories retval curdir) + file(GLOB sub-dir RELATIVE ${curdir} *) + set(list_of_dirs "") + foreach(dir ${sub-dir}) + if(IS_DIRECTORY ${curdir}/${dir}) + set(list_of_dirs ${list_of_dirs} ${dir}) + endif() + endforeach() + set(${retval} ${list_of_dirs}) +endmacro() + +#list_subdirectories("TESTS" 
${CMAKE_CURRENT_SOURCE_DIR}) +#foreach (test ${TESTS}) +# add_subdirectory (${test}) +#endforeach() + +add_subdirectory (ElectrostaticPIC) + +# vi: set et ts=4 sw=4 sts=4: + +# Local Variables: +# mode: cmake +# cmake-tab-width: 4 +# indent-tabs-mode: nil +# require-final-newline: nil +# End: From 440d593995e44e1db73d90d915a1a4e1aaddd05f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 24 Oct 2022 16:20:01 +0200 Subject: [PATCH 005/117] Landau damping modified for PIF. Need to do scatter, solver and gather --- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 117 +++---------------- src/FieldLayout/FieldLayout.hpp | 8 +- 2 files changed, 23 insertions(+), 102 deletions(-) diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index e78dd91bf..ec14d1b15 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -1,20 +1,13 @@ -// Landau Damping Test +// Electrostatic Landau damping test with Particle-in-Fourier schemes // Usage: -// srun ./LandauDamping --info 10 -// nx = No. cell-centered points in the x-direction -// ny = No. cell-centered points in the y-direction -// nz = No. cell-centered points in the z-direction +// srun ./LandauDampingPIF --info 10 +// nx = No. of Fourier modes in the x-direction +// ny = No. of Fourier modes in the y-direction +// nz = No. of Fourier modes in the z-direction // Np = Total no. of macro-particles in the simulation // Nt = Number of time steps -// stype = Field solver type e.g., FFT -// lbthres = Load balancing threshold i.e., lbthres*100 is the maximum load imbalance -// percentage which can be tolerated and beyond which -// particle load balancing occurs. A value of 0.01 is good for many typical -// simulations. -// ovfactor = Over-allocation factor for the buffers used in the communication. Typical -// values are 1.0, 2.0. Value 1.0 means no over-allocation. // Example: -// srun ./LandauDamping 128 128 128 10000 10 FFT 0.01 2.0 --info 10 +// srun ./LandauDampingPIF 128 128 128 10000 10 --info 10 // // Copyright (c) 2021, Sriramkrishnan Muralikrishnan, // Paul Scherrer Institut, Villigen PSI, Switzerland @@ -31,7 +24,7 @@ // along with IPPL. If not, see . 
// -#include "ChargedParticles.hpp" +#include "ChargedParticlesPIF.hpp" #include #include #include @@ -146,17 +139,14 @@ double PDF(const Vector_t& xvec, const double& alpha, return pdf; } -const char* TestName = "LandauDamping"; +const char* TestName = "LandauDampingPIF"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); - Inform msg("LandauDamping"); - Inform msg2all("LandauDamping",INFORM_ALL_NODES); + Inform msg("LandauDampingPIF"); + Inform msg2all("LandauDampingPIF",INFORM_ALL_NODES); - Ippl::Comm->setDefaultOverallocation(std::atof(argv[8])); - - auto start = std::chrono::high_resolution_clock::now(); ippl::Vector nr = { std::atoi(argv[1]), std::atoi(argv[2]), @@ -169,9 +159,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick"); static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift"); static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update"); - static IpplTimings::TimerRef DummySolveTimer = IpplTimings::getTimer("solveWarmup"); static IpplTimings::TimerRef SolveTimer = IpplTimings::getTimer("solve"); - static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("domainDecomp"); IpplTimings::startTimer(mainTimer); @@ -181,10 +169,10 @@ int main(int argc, char *argv[]){ msg << "Landau damping" << endl << "nt " << nt << " Np= " - << totalP << " grid = " << nr + << totalP << " Fourier modes = " << nr << endl; - using bunch_type = ChargedParticles; + using bunch_type = ChargedParticlesPIF; std::unique_ptr P; @@ -195,7 +183,7 @@ int main(int argc, char *argv[]){ ippl::e_dim_tag decomp[Dim]; for (unsigned d = 0; d < Dim; ++d) { - decomp[d] = ippl::PARALLEL; + decomp[d] = ippl::SERIAL; } // create mesh and layout objects for this problem domain @@ -227,69 +215,22 @@ int main(int argc, char *argv[]){ bunch_type bunchBuffer(PL); - P->stype_m = argv[6]; P->initSolver(); P->time_m = 0.0; - P->loadbalancethreshold_m = std::atof(argv[7]); - - bool isFirstRepartition; - - if ((P->loadbalancethreshold_m != 1.0) && (Ippl::Comm->size() > 1)) { - msg << "Starting first repartition" << endl; - IpplTimings::startTimer(domainDecomposition); - isFirstRepartition = true; - const ippl::NDIndex& lDom = FL.getLocalNDIndex(); - const int nghost = P->rho_m.getNghost(); - using mdrange_type = Kokkos::MDRangePolicy>; - auto rhoview = P->rho_m.getView(); - - Kokkos::parallel_for("Assign initial rho based on PDF", - mdrange_type({nghost, nghost, nghost}, - {rhoview.extent(0) - nghost, - rhoview.extent(1) - nghost, - rhoview.extent(2) - nghost}), - KOKKOS_LAMBDA(const int i, - const int j, - const int k) - { - //local to global index conversion - const size_t ig = i + lDom[0].first() - nghost; - const size_t jg = j + lDom[1].first() - nghost; - const size_t kg = k + lDom[2].first() - nghost; - double x = (ig + 0.5) * hr[0] + origin[0]; - double y = (jg + 0.5) * hr[1] + origin[1]; - double z = (kg + 0.5) * hr[2] + origin[2]; - - Vector_t xvec = {x, y, z}; - - rhoview(i, j, k) = PDF(xvec, alpha, kw, Dim); - - }); - - Kokkos::fence(); - - P->initializeORB(FL, mesh); - P->repartition(FL, mesh, bunchBuffer, isFirstRepartition); - IpplTimings::stopTimer(domainDecomposition); - } - - msg << "First domain decomposition done" << endl; + IpplTimings::startTimer(particleCreation); typedef ippl::detail::RegionLayout RegionLayout_t; const RegionLayout_t& RLayout = PL.getRegionLayout(); const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); - Vector_t Nr, Dr, minU, maxU; + Vector_t minU, maxU; int 
myRank = Ippl::Comm->rank(); for (unsigned d = 0; d size(); size_type nloc = (size_type)(factor * totalP); size_type Total_particles = 0; @@ -313,14 +254,6 @@ int main(int argc, char *argv[]){ P->q = P->Q_m/totalP; msg << "particles created and initial conditions assigned " << endl; - isFirstRepartition = false; - //The update after the particle creation is not needed as the - //particles are generated locally - - IpplTimings::startTimer(DummySolveTimer); - P->rho_m = 0.0; - P->solver_mp->solve(); - IpplTimings::stopTimer(DummySolveTimer); P->scatterCIC(totalP, 0, hr); @@ -332,7 +265,6 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpDataTimer); P->dumpLandau(); - P->gatherStatistics(totalP); //P->dumpLocalDomains(FL, 0); IpplTimings::stopTimer(dumpDataTimer); @@ -360,17 +292,6 @@ int main(int argc, char *argv[]){ PL.update(*P, bunchBuffer); IpplTimings::stopTimer(updateTimer); - // Domain Decomposition - if (P->balance(totalP, it+1)) { - msg << "Starting repartition" << endl; - IpplTimings::startTimer(domainDecomposition); - P->repartition(FL, mesh, bunchBuffer, isFirstRepartition); - IpplTimings::stopTimer(domainDecomposition); - //IpplTimings::startTimer(dumpDataTimer); - //P->dumpLocalDomains(FL, it+1); - //IpplTimings::stopTimer(dumpDataTimer); - } - //scatter the charge onto the underlying grid P->scatterCIC(totalP, it+1, hr); @@ -391,7 +312,6 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); P->dumpLandau(); - P->gatherStatistics(totalP); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } @@ -400,11 +320,6 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(mainTimer); IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); - auto end = std::chrono::high_resolution_clock::now(); - - std::chrono::duration time_chrono = std::chrono::duration_cast>(end - start); - std::cout << "Elapsed time: " << time_chrono.count() << std::endl; - return 0; } diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp index 34cf92e0e..47d12a9e6 100644 --- a/src/FieldLayout/FieldLayout.hpp +++ b/src/FieldLayout/FieldLayout.hpp @@ -132,7 +132,13 @@ namespace ippl { isAllPeriodic_m = isAllPeriodic; - if (nRanks < 2) { + bool isAllSerial = true; + + for (unsigned d = 0; d < Dim; ++d) { + isAllSerial = isAllSerial && (requestedLayout_m[d] == SERIAL); + } + + if ((nRanks < 2) || isAllSerial) { Kokkos::resize(dLocalDomains_m, nRanks); Kokkos::resize(hLocalDomains_m, nRanks); hLocalDomains_m(0) = domain; From 27856b254fcf056c042c00b00a462050b08052cb Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 2 Nov 2022 12:04:32 +0100 Subject: [PATCH 006/117] scatterPIF implemented and seems to be working --- alpine/CMakeLists.txt | 1 + .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 892 ++++++++---------- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 102 +- src/FieldLayout/FieldLayout.hpp | 31 +- src/Particle/ParticleAttrib.h | 88 +- src/Particle/ParticleAttrib.hpp | 95 ++ 6 files changed, 665 insertions(+), 544 deletions(-) diff --git a/alpine/CMakeLists.txt b/alpine/CMakeLists.txt index ffba1ba3c..3a6d622c5 100644 --- a/alpine/CMakeLists.txt +++ b/alpine/CMakeLists.txt @@ -15,6 +15,7 @@ endmacro() #endforeach() add_subdirectory (ElectrostaticPIC) +add_subdirectory (ElectrostaticPIF) # vi: set et ts=4 sw=4 sts=4: diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 
c2f12fe21..b13b9f156 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -40,8 +40,11 @@ template using ParticleAttrib = ippl::ParticleAttrib; typedef Vector Vector_t; +typedef Vector, Dim> CxVector_t; typedef Field Field_t; +typedef Field, Dim> CxField_t; typedef Field VField_t; +typedef Field CxVField_t; typedef ippl::FFTPeriodicPoissonSolver Solver_t; const double pi = std::acos(-1.0); @@ -49,89 +52,89 @@ const double pi = std::acos(-1.0); // Test programs have to define this variable for VTK dump purposes extern const char* TestName; -void dumpVTK(VField_t& E, int nx, int ny, int nz, int iteration, - double dx, double dy, double dz) { - - - typename VField_t::view_type::host_mirror_type host_view = E.getHostMirror(); - - std::stringstream fname; - fname << "data/ef_"; - fname << std::setw(4) << std::setfill('0') << iteration; - fname << ".vtk"; - - Kokkos::deep_copy(host_view, E.getView()); - - Inform vtkout(NULL, fname.str().c_str(), Inform::OVERWRITE); - vtkout.precision(10); - vtkout.setf(std::ios::scientific, std::ios::floatfield); - - // start with header - vtkout << "# vtk DataFile Version 2.0" << endl; - vtkout << TestName << endl; - vtkout << "ASCII" << endl; - vtkout << "DATASET STRUCTURED_POINTS" << endl; - vtkout << "DIMENSIONS " << nx+3 << " " << ny+3 << " " << nz+3 << endl; - vtkout << "ORIGIN " << -dx << " " << -dy << " " << -dz << endl; - vtkout << "SPACING " << dx << " " << dy << " " << dz << endl; - vtkout << "CELL_DATA " << (nx+2)*(ny+2)*(nz+2) << endl; - - vtkout << "VECTORS E-Field float" << endl; - for (int z=0; z class ChargedParticlesPIF : public ippl::ParticleBase { public: - VField_t E_m; - Field_t rho_m; + CxVField_t E_m; + CxField_t rho_m; // ORB ORB orb; @@ -205,415 +208,358 @@ class ChargedParticlesPIF : public ippl::ParticleBase { setBCAllPeriodic(); } - void updateLayout(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, - bool& isFirstRepartition) { - // Update local fields - static IpplTimings::TimerRef tupdateLayout = IpplTimings::getTimer("updateLayout"); - IpplTimings::startTimer(tupdateLayout); - this->E_m.updateLayout(fl); - this->rho_m.updateLayout(fl); - - // Update layout with new FieldLayout - PLayout& layout = this->getLayout(); - layout.updateLayout(fl, mesh); - IpplTimings::stopTimer(tupdateLayout); - static IpplTimings::TimerRef tupdatePLayout = IpplTimings::getTimer("updatePB"); - IpplTimings::startTimer(tupdatePLayout); - if(!isFirstRepartition) { - layout.update(*this, buffer); - } - IpplTimings::stopTimer(tupdatePLayout); - } - - void initializeORB(FieldLayout_t& fl, Mesh_t& mesh) { - orb.initialize(fl, mesh, rho_m); - } - - void repartition(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, - bool& isFirstRepartition) { - // Repartition the domains - bool res = orb.binaryRepartition(this->R, fl, isFirstRepartition); - - if (res != true) { - std::cout << "Could not repartition!" 
<< std::endl; - return; - } - // Update - this->updateLayout(fl, mesh, buffer, isFirstRepartition); - this->solver_mp->setRhs(rho_m); - } - - bool balance(size_type totalP, const unsigned int nstep){ - if(std::strcmp(TestName,"UniformPlasmaTest") == 0) { - return (nstep % loadbalancefreq_m == 0); - } - else { - int local = 0; - std::vector res(Ippl::Comm->size()); - double equalPart = (double) totalP / Ippl::Comm->size(); - double dev = std::abs((double)this->getLocalNum() - equalPart) / totalP; - if (dev > loadbalancethreshold_m) - local = 1; - MPI_Allgather(&local, 1, MPI_INT, res.data(), 1, MPI_INT, Ippl::getComm()); - - for (unsigned int i = 0; i < res.size(); i++) { - if (res[i] == 1) - return true; - } - return false; - } - } + //void updateLayout(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, + // bool& isFirstRepartition) { + // // Update local fields + // static IpplTimings::TimerRef tupdateLayout = IpplTimings::getTimer("updateLayout"); + // IpplTimings::startTimer(tupdateLayout); + // this->E_m.updateLayout(fl); + // this->rho_m.updateLayout(fl); + + // // Update layout with new FieldLayout + // PLayout& layout = this->getLayout(); + // layout.updateLayout(fl, mesh); + // IpplTimings::stopTimer(tupdateLayout); + // static IpplTimings::TimerRef tupdatePLayout = IpplTimings::getTimer("updatePB"); + // IpplTimings::startTimer(tupdatePLayout); + // if(!isFirstRepartition) { + // layout.update(*this, buffer); + // } + // IpplTimings::stopTimer(tupdatePLayout); + //} + + //void initializeORB(FieldLayout_t& fl, Mesh_t& mesh) { + // orb.initialize(fl, mesh, rho_m); + //} + + //void repartition(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, + // bool& isFirstRepartition) { + // // Repartition the domains + // bool res = orb.binaryRepartition(this->R, fl, isFirstRepartition); + + // if (res != true) { + // std::cout << "Could not repartition!" 
<< std::endl; + // return; + // } + // // Update + // this->updateLayout(fl, mesh, buffer, isFirstRepartition); + // this->solver_mp->setRhs(rho_m); + //} + + //bool balance(size_type totalP, const unsigned int nstep){ + // if(std::strcmp(TestName,"UniformPlasmaTest") == 0) { + // return (nstep % loadbalancefreq_m == 0); + // } + // else { + // int local = 0; + // std::vector res(Ippl::Comm->size()); + // double equalPart = (double) totalP / Ippl::Comm->size(); + // double dev = std::abs((double)this->getLocalNum() - equalPart) / totalP; + // if (dev > loadbalancethreshold_m) + // local = 1; + // MPI_Allgather(&local, 1, MPI_INT, res.data(), 1, MPI_INT, Ippl::getComm()); + + // for (unsigned int i = 0; i < res.size(); i++) { + // if (res[i] == 1) + // return true; + // } + // return false; + // } + //} + + //void gatherStatistics(size_type totalP) { + // std::vector imb(Ippl::Comm->size()); + // double equalPart = (double) totalP / Ippl::Comm->size(); + // double dev = (std::abs((double)this->getLocalNum() - equalPart) + // / totalP) * 100.0; + // MPI_Gather(&dev, 1, MPI_DOUBLE, imb.data(), 1, MPI_DOUBLE, 0, + // Ippl::getComm()); + // + // if (Ippl::Comm->rank() == 0) { + // std::stringstream fname; + // fname << "data/LoadBalance_"; + // fname << Ippl::Comm->size(); + // fname << ".csv"; + + // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + // csvout.precision(5); + // csvout.setf(std::ios::scientific, std::ios::floatfield); + + // if(time_m == 0.0) { + // csvout << "time, rank, imbalance percentage" << endl; + // } + + // for(int r=0; r < Ippl::Comm->size(); ++r) { + // csvout << time_m << " " + // << r << " " + // << imb[r] << endl; + // } + // } + + // Ippl::Comm->barrier(); + // + //} + + void gather() { + + gatherPIF(this->E, E_m, this->R); - void gatherStatistics(size_type totalP) { - std::vector imb(Ippl::Comm->size()); - double equalPart = (double) totalP / Ippl::Comm->size(); - double dev = (std::abs((double)this->getLocalNum() - equalPart) - / totalP) * 100.0; - MPI_Gather(&dev, 1, MPI_DOUBLE, imb.data(), 1, MPI_DOUBLE, 0, - Ippl::getComm()); - - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/LoadBalance_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(5); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, rank, imbalance percentage" << endl; - } - - for(int r=0; r < Ippl::Comm->size(); ++r) { - csvout << time_m << " " - << r << " " - << imb[r] << endl; - } - } - - Ippl::Comm->barrier(); - } - void gatherCIC() { - - gather(this->E, E_m, this->R); - - } - - void scatterCIC(size_type totalP, unsigned int iteration, Vector_t& hrField) { - - - Inform m("scatter "); - - rho_m = 0.0; - scatter(q, rho_m, this->R); - - static IpplTimings::TimerRef sumTimer = IpplTimings::getTimer("Check"); - IpplTimings::startTimer(sumTimer); - double Q_grid = rho_m.sum(); - - size_type Total_particles = 0; - size_type local_particles = this->getLocalNum(); - - MPI_Reduce(&local_particles, &Total_particles, 1, - MPI_UNSIGNED_LONG, MPI_SUM, 0, Ippl::getComm()); - - double rel_error = std::fabs((Q_m-Q_grid)/Q_m); - m << "Rel. error in charge conservation = " << rel_error << endl; - - if(Ippl::Comm->rank() == 0) { - if(Total_particles != totalP || rel_error > 1e-10) { - m << "Time step: " << iteration << endl; - m << "Total particles in the sim. " << totalP - << " " << "after update: " - << Total_particles << endl; - m << "Rel. 
error in charge conservation: " - << rel_error << endl; - std::abort(); - } - } - - rho_m = rho_m / (hrField[0] * hrField[1] * hrField[2]); - - rhoNorm_m = norm(rho_m); - IpplTimings::stopTimer(sumTimer); + void scatter() { + + Inform m("scatter "); + rho_m = {0.0, 0.0}; + scatterPIF(q, rho_m, this->R); - //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + rho_m = rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); - //rho = rho_e - rho_i - rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); } void initSolver() { Inform m("solver "); - if(stype_m == "FFT") - initFFTSolver(); - else - m << "No solver matches the argument" << endl; - - } - - void initFFTSolver() { - ippl::ParameterList sp; - sp.add("output_type", Solver_t::GRAD); - sp.add("use_heffte_defaults", false); - sp.add("use_pencils", true); - sp.add("use_reorder", false); - sp.add("use_gpu_aware", true); - sp.add("comm", ippl::p2p_pl); - sp.add("r2c_direction", 0); - - solver_mp = std::make_shared(); - - solver_mp->mergeParameters(sp); - solver_mp->setRhs(rho_m); - - solver_mp->setLhs(E_m); } - void dumpData() { - - auto Pview = P.getView(); - - double Energy = 0.0; - - Kokkos::parallel_reduce("Particle Energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = dot(Pview(i), Pview(i)).apply(); - valL += myVal; - }, Kokkos::Sum(Energy)); - - Energy *= 0.5; - double gEnergy = 0.0; - - MPI_Reduce(&Energy, &gEnergy, 1, - MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - - - const int nghostE = E_m.getNghost(); - auto Eview = E_m.getView(); - Vector_t normE; - using mdrange_type = Kokkos::MDRangePolicy>; - - for (unsigned d=0; d(temp)); - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - normE[d] = std::sqrt(globaltemp); - } - - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/ParticleField_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, Kinetic energy, Rho_norm2, Ex_norm2, Ey_norm2, Ez_norm2" << endl; - } - - csvout << time_m << " " - << gEnergy << " " - << rhoNorm_m << " " - << normE[0] << " " - << normE[1] << " " - << normE[2] << endl; - } - - Ippl::Comm->barrier(); - } - - void dumpLandau() { - - const int nghostE = E_m.getNghost(); - auto Eview = E_m.getView(); - double fieldEnergy, ExAmp; - using mdrange_type = Kokkos::MDRangePolicy>; - - double temp = 0.0; - Kokkos::parallel_reduce("Ex inner product", - mdrange_type({nghostE, nghostE, nghostE}, - {Eview.extent(0) - nghostE, - Eview.extent(1) - nghostE, - Eview.extent(2) - nghostE}), - KOKKOS_LAMBDA(const size_t i, const size_t j, - const size_t k, double& valL) - { - double myVal = std::pow(Eview(i, j, k)[0], 2); - valL += myVal; - }, Kokkos::Sum(temp)); - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; - - double tempMax = 0.0; - Kokkos::parallel_reduce("Ex max norm", - mdrange_type({nghostE, nghostE, nghostE}, - {Eview.extent(0) - nghostE, - Eview.extent(1) - nghostE, - Eview.extent(2) - nghostE}), - KOKKOS_LAMBDA(const size_t i, const size_t j, - const size_t k, double& valL) - { - double myVal = std::fabs(Eview(i, j, k)[0]); - if(myVal > valL) valL = 
myVal; - }, Kokkos::Max(tempMax)); - ExAmp = 0.0; - MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/FieldLandau_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, Ex_field_energy, Ex_max_norm" << endl; - } - - csvout << time_m << " " - << fieldEnergy << " " - << ExAmp << endl; - - } - - Ippl::Comm->barrier(); - } - - void dumpBumponTail() { - - const int nghostE = E_m.getNghost(); - auto Eview = E_m.getView(); - double fieldEnergy, EzAmp; - using mdrange_type = Kokkos::MDRangePolicy>; - - double temp = 0.0; - Kokkos::parallel_reduce("Ex inner product", - mdrange_type({nghostE, nghostE, nghostE}, - {Eview.extent(0) - nghostE, - Eview.extent(1) - nghostE, - Eview.extent(2) - nghostE}), - KOKKOS_LAMBDA(const size_t i, const size_t j, - const size_t k, double& valL) - { - double myVal = std::pow(Eview(i, j, k)[2], 2); - valL += myVal; - }, Kokkos::Sum(temp)); - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; - - double tempMax = 0.0; - Kokkos::parallel_reduce("Ex max norm", - mdrange_type({nghostE, nghostE, nghostE}, - {Eview.extent(0) - nghostE, - Eview.extent(1) - nghostE, - Eview.extent(2) - nghostE}), - KOKKOS_LAMBDA(const size_t i, const size_t j, - const size_t k, double& valL) - { - double myVal = std::fabs(Eview(i, j, k)[2]); - if(myVal > valL) valL = myVal; - }, Kokkos::Max(tempMax)); - EzAmp = 0.0; - MPI_Reduce(&tempMax, &EzAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/FieldBumponTail_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, Ez_field_energy, Ez_max_norm" << endl; - } - - csvout << time_m << " " - << fieldEnergy << " " - << EzAmp << endl; - - } - - Ippl::Comm->barrier(); - } - - void dumpParticleData() { - - typename ParticleAttrib::HostMirror R_host = this->R.getHostMirror(); - typename ParticleAttrib::HostMirror P_host = this->P.getHostMirror(); - Kokkos::deep_copy(R_host, this->R.getView()); - Kokkos::deep_copy(P_host, P.getView()); - std::stringstream pname; - pname << "data/ParticleIC_"; - pname << Ippl::Comm->rank(); - pname << ".csv"; - Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); - pcsvout.precision(10); - pcsvout.setf(std::ios::scientific, std::ios::floatfield); - pcsvout << "R_x, R_y, R_z, V_x, V_y, V_z" << endl; - for (size_type i = 0; i< this->getLocalNum(); i++) { - pcsvout << R_host(i)[0] << " " - << R_host(i)[1] << " " - << R_host(i)[2] << " " - << P_host(i)[0] << " " - << P_host(i)[1] << " " - << P_host(i)[2] << endl; - } - Ippl::Comm->barrier(); - } - - void dumpLocalDomains(const FieldLayout_t& fl, const unsigned int step) { - - if (Ippl::Comm->rank() == 0) { - const typename FieldLayout_t::host_mirror_type domains = fl.getHostLocalDomains(); - std::ofstream myfile; - myfile.open("data/domains" + std::to_string(step) + ".txt"); - for (unsigned int i = 0; i < domains.size(); ++i) { - myfile << domains[i][0].first() << " " << domains[i][1].first() 
<< " " << domains[i][2].first() << " " - << domains[i][0].first() << " " << domains[i][1].last() << " " << domains[i][2].first() << " " - << domains[i][0].last() << " " << domains[i][1].first() << " " << domains[i][2].first() << " " - << domains[i][0].first() << " " << domains[i][1].first() << " " << domains[i][2].last() - << "\n"; - } - myfile.close(); - } - Ippl::Comm->barrier(); - } + //void dumpData() { + + // auto Pview = P.getView(); + + // double Energy = 0.0; + + // Kokkos::parallel_reduce("Particle Energy", this->getLocalNum(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = dot(Pview(i), Pview(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(Energy)); + + // Energy *= 0.5; + // double gEnergy = 0.0; + + // MPI_Reduce(&Energy, &gEnergy, 1, + // MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + + // const int nghostE = E_m.getNghost(); + // auto Eview = E_m.getView(); + // Vector_t normE; + // using mdrange_type = Kokkos::MDRangePolicy>; + + // for (unsigned d=0; d(temp)); + // double globaltemp = 0.0; + // MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + // normE[d] = std::sqrt(globaltemp); + // } + + // if (Ippl::Comm->rank() == 0) { + // std::stringstream fname; + // fname << "data/ParticleField_"; + // fname << Ippl::Comm->size(); + // fname << ".csv"; + + // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + // csvout.precision(10); + // csvout.setf(std::ios::scientific, std::ios::floatfield); + + // if(time_m == 0.0) { + // csvout << "time, Kinetic energy, Rho_norm2, Ex_norm2, Ey_norm2, Ez_norm2" << endl; + // } + + // csvout << time_m << " " + // << gEnergy << " " + // << rhoNorm_m << " " + // << normE[0] << " " + // << normE[1] << " " + // << normE[2] << endl; + // } + + // Ippl::Comm->barrier(); + //} + + //void dumpLandau() { + + // const int nghostE = E_m.getNghost(); + // auto Eview = E_m.getView(); + // double fieldEnergy, ExAmp; + // using mdrange_type = Kokkos::MDRangePolicy>; + + // double temp = 0.0; + // Kokkos::parallel_reduce("Ex inner product", + // mdrange_type({nghostE, nghostE, nghostE}, + // {Eview.extent(0) - nghostE, + // Eview.extent(1) - nghostE, + // Eview.extent(2) - nghostE}), + // KOKKOS_LAMBDA(const size_t i, const size_t j, + // const size_t k, double& valL) + // { + // double myVal = std::pow(Eview(i, j, k)[0], 2); + // valL += myVal; + // }, Kokkos::Sum(temp)); + // double globaltemp = 0.0; + // MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + // fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; + + // double tempMax = 0.0; + // Kokkos::parallel_reduce("Ex max norm", + // mdrange_type({nghostE, nghostE, nghostE}, + // {Eview.extent(0) - nghostE, + // Eview.extent(1) - nghostE, + // Eview.extent(2) - nghostE}), + // KOKKOS_LAMBDA(const size_t i, const size_t j, + // const size_t k, double& valL) + // { + // double myVal = std::fabs(Eview(i, j, k)[0]); + // if(myVal > valL) valL = myVal; + // }, Kokkos::Max(tempMax)); + // ExAmp = 0.0; + // MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + // if (Ippl::Comm->rank() == 0) { + // std::stringstream fname; + // fname << "data/FieldLandau_"; + // fname << Ippl::Comm->size(); + // fname << ".csv"; + + + // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + // csvout.precision(10); + // csvout.setf(std::ios::scientific, std::ios::floatfield); + + // if(time_m == 0.0) { + // csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + // } + + // csvout << time_m << " " + // << 
fieldEnergy << " " + // << ExAmp << endl; + + // } + // + // Ippl::Comm->barrier(); + //} + // + //void dumpBumponTail() { + + // const int nghostE = E_m.getNghost(); + // auto Eview = E_m.getView(); + // double fieldEnergy, EzAmp; + // using mdrange_type = Kokkos::MDRangePolicy>; + + // double temp = 0.0; + // Kokkos::parallel_reduce("Ex inner product", + // mdrange_type({nghostE, nghostE, nghostE}, + // {Eview.extent(0) - nghostE, + // Eview.extent(1) - nghostE, + // Eview.extent(2) - nghostE}), + // KOKKOS_LAMBDA(const size_t i, const size_t j, + // const size_t k, double& valL) + // { + // double myVal = std::pow(Eview(i, j, k)[2], 2); + // valL += myVal; + // }, Kokkos::Sum(temp)); + // double globaltemp = 0.0; + // MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + // fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; + + // double tempMax = 0.0; + // Kokkos::parallel_reduce("Ex max norm", + // mdrange_type({nghostE, nghostE, nghostE}, + // {Eview.extent(0) - nghostE, + // Eview.extent(1) - nghostE, + // Eview.extent(2) - nghostE}), + // KOKKOS_LAMBDA(const size_t i, const size_t j, + // const size_t k, double& valL) + // { + // double myVal = std::fabs(Eview(i, j, k)[2]); + // if(myVal > valL) valL = myVal; + // }, Kokkos::Max(tempMax)); + // EzAmp = 0.0; + // MPI_Reduce(&tempMax, &EzAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + // if (Ippl::Comm->rank() == 0) { + // std::stringstream fname; + // fname << "data/FieldBumponTail_"; + // fname << Ippl::Comm->size(); + // fname << ".csv"; + + + // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + // csvout.precision(10); + // csvout.setf(std::ios::scientific, std::ios::floatfield); + + // if(time_m == 0.0) { + // csvout << "time, Ez_field_energy, Ez_max_norm" << endl; + // } + + // csvout << time_m << " " + // << fieldEnergy << " " + // << EzAmp << endl; + + // } + // + // Ippl::Comm->barrier(); + //} + + //void dumpParticleData() { + + // typename ParticleAttrib::HostMirror R_host = this->R.getHostMirror(); + // typename ParticleAttrib::HostMirror P_host = this->P.getHostMirror(); + // Kokkos::deep_copy(R_host, this->R.getView()); + // Kokkos::deep_copy(P_host, P.getView()); + // std::stringstream pname; + // pname << "data/ParticleIC_"; + // pname << Ippl::Comm->rank(); + // pname << ".csv"; + // Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + // pcsvout.precision(10); + // pcsvout.setf(std::ios::scientific, std::ios::floatfield); + // pcsvout << "R_x, R_y, R_z, V_x, V_y, V_z" << endl; + // for (size_type i = 0; i< this->getLocalNum(); i++) { + // pcsvout << R_host(i)[0] << " " + // << R_host(i)[1] << " " + // << R_host(i)[2] << " " + // << P_host(i)[0] << " " + // << P_host(i)[1] << " " + // << P_host(i)[2] << endl; + // } + // Ippl::Comm->barrier(); + //} + // + //void dumpLocalDomains(const FieldLayout_t& fl, const unsigned int step) { + + // if (Ippl::Comm->rank() == 0) { + // const typename FieldLayout_t::host_mirror_type domains = fl.getHostLocalDomains(); + // std::ofstream myfile; + // myfile.open("data/domains" + std::to_string(step) + ".txt"); + // for (unsigned int i = 0; i < domains.size(); ++i) { + // myfile << domains[i][0].first() << " " << domains[i][1].first() << " " << domains[i][2].first() << " " + // << domains[i][0].first() << " " << domains[i][1].last() << " " << domains[i][2].first() << " " + // << domains[i][0].last() << " " << domains[i][1].first() << " " << domains[i][2].first() << " " + // << domains[i][0].first() << " " 
<< domains[i][1].first() << " " << domains[i][2].last() + // << "\n"; + // } + // myfile.close(); + // } + // Ippl::Comm->barrier(); + //} private: void setBCAllPeriodic() { diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index ec14d1b15..0102e55dc 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -220,14 +220,14 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(particleCreation); - typedef ippl::detail::RegionLayout RegionLayout_t; - const RegionLayout_t& RLayout = PL.getRegionLayout(); - const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); + //typedef ippl::detail::RegionLayout RegionLayout_t; + //const RegionLayout_t& RLayout = PL.getRegionLayout(); + //const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); Vector_t minU, maxU; int myRank = Ippl::Comm->rank(); for (unsigned d = 0; d size(); @@ -255,66 +255,66 @@ int main(int argc, char *argv[]){ P->q = P->Q_m/totalP; msg << "particles created and initial conditions assigned " << endl; - P->scatterCIC(totalP, 0, hr); + P->scatter(); - IpplTimings::startTimer(SolveTimer); - P->solver_mp->solve(); - IpplTimings::stopTimer(SolveTimer); + //IpplTimings::startTimer(SolveTimer); + //P->solver_mp->solve(); + //IpplTimings::stopTimer(SolveTimer); - P->gatherCIC(); + //P->gather(); - IpplTimings::startTimer(dumpDataTimer); - P->dumpLandau(); - //P->dumpLocalDomains(FL, 0); - IpplTimings::stopTimer(dumpDataTimer); + //IpplTimings::startTimer(dumpDataTimer); + //P->dumpLandau(); + ////P->dumpLocalDomains(FL, 0); + //IpplTimings::stopTimer(dumpDataTimer); - // begin main timestep loop - msg << "Starting iterations ..." 
<< endl; - for (unsigned int it=0; itP = P->P - 0.5 * dt * P->E; - IpplTimings::stopTimer(PTimer); + // IpplTimings::startTimer(PTimer); + // P->P = P->P - 0.5 * dt * P->E; + // IpplTimings::stopTimer(PTimer); - //drift - IpplTimings::startTimer(RTimer); - P->R = P->R + dt * P->P; - IpplTimings::stopTimer(RTimer); + // //drift + // IpplTimings::startTimer(RTimer); + // P->R = P->R + dt * P->P; + // IpplTimings::stopTimer(RTimer); - //Since the particles have moved spatially update them to correct processors - IpplTimings::startTimer(updateTimer); - PL.update(*P, bunchBuffer); - IpplTimings::stopTimer(updateTimer); + // //Since the particles have moved spatially update them to correct processors + // IpplTimings::startTimer(updateTimer); + // PL.update(*P, bunchBuffer); + // IpplTimings::stopTimer(updateTimer); - //scatter the charge onto the underlying grid - P->scatterCIC(totalP, it+1, hr); + // //scatter the charge onto the underlying grid + // P->scatter(totalP, it+1, hr); - //Field solve - IpplTimings::startTimer(SolveTimer); - P->solver_mp->solve(); - IpplTimings::stopTimer(SolveTimer); + // //Field solve + // IpplTimings::startTimer(SolveTimer); + // P->solver_mp->solve(); + // IpplTimings::stopTimer(SolveTimer); - // gather E field - P->gatherCIC(); + // // gather E field + // P->gather(); - //kick - IpplTimings::startTimer(PTimer); - P->P = P->P - 0.5 * dt * P->E; - IpplTimings::stopTimer(PTimer); + // //kick + // IpplTimings::startTimer(PTimer); + // P->P = P->P - 0.5 * dt * P->E; + // IpplTimings::stopTimer(PTimer); - P->time_m += dt; - IpplTimings::startTimer(dumpDataTimer); - P->dumpLandau(); - IpplTimings::stopTimer(dumpDataTimer); - msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; - } + // P->time_m += dt; + // IpplTimings::startTimer(dumpDataTimer); + // P->dumpLandau(); + // IpplTimings::stopTimer(dumpDataTimer); + // msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; + //} msg << "LandauDamping: End." << endl; IpplTimings::stopTimer(mainTimer); diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp index 47d12a9e6..9e129e497 100644 --- a/src/FieldLayout/FieldLayout.hpp +++ b/src/FieldLayout/FieldLayout.hpp @@ -132,21 +132,6 @@ namespace ippl { isAllPeriodic_m = isAllPeriodic; - bool isAllSerial = true; - - for (unsigned d = 0; d < Dim; ++d) { - isAllSerial = isAllSerial && (requestedLayout_m[d] == SERIAL); - } - - if ((nRanks < 2) || isAllSerial) { - Kokkos::resize(dLocalDomains_m, nRanks); - Kokkos::resize(hLocalDomains_m, nRanks); - hLocalDomains_m(0) = domain; - Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); - return; - } - - // If the user did not specify parallel/serial flags then make all parallel. long totparelems = 1; for (unsigned d = 0; d < Dim; ++d) { @@ -160,6 +145,22 @@ namespace ippl { } } + bool isAllSerial = true; + + for (unsigned d = 0; d < Dim; ++d) { + isAllSerial = isAllSerial && (requestedLayout_m[d] == SERIAL); + } + + if ((nRanks < 2) || isAllSerial) { + Kokkos::resize(dLocalDomains_m,nRanks); + Kokkos::resize(hLocalDomains_m,nRanks); + for (int r = 0; r < nRanks; ++r) { + hLocalDomains_m(r) = domain; + } + Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); + return; + } + /* Check to see if we have too few elements to partition. If so, reduce * the number of ranks (if necessary) to just the number of elements along * parallel dims. 
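For orientation (not part of the patch series): the scatterPIF/gatherPIF kernels introduced in the ParticleAttrib changes below amount to direct nonuniform Fourier sums, evaluated mode by mode over all particles. The following is a minimal serial sketch under stated assumptions only: hypothetical names (Particle, mode_k, pif_step), plain std::complex instead of Kokkos::complex, no MPI or Kokkos hierarchical parallelism, and normalization left out. The wavenumber map mirrors the [-K K] convention adopted in patch 009 further below.

    #include <array>
    #include <cmath>
    #include <complex>
    #include <vector>

    struct Particle { std::array<double, 3> x; std::array<double, 3> E; double q; };
    using cplx = std::complex<double>;

    // Map a mode index i in [0, N) to a signed wavenumber, as in patch 009:
    // k = 2*pi/L * (i - N) for i > N/2, else 2*pi/L * i. (The grid-based
    // FFTPeriodicPoissonSolver additionally zeroes the Nyquist mode i == N/2
    // before differentiating; see the fft-deriv.pdf reference in the patches.)
    double mode_k(int i, int N, double L) {
        const double pi = std::acos(-1.0);
        bool shift = (i > N / 2);
        return 2.0 * pi / L * (i - shift * N);
    }

    // One scatter/solve/gather pass:
    //   scatter: rho_hat(k) = sum_p q_p * exp(-i k.x_p)               (scatterPIF)
    //   solve  : E_hat_d(k) = -i k_d rho_hat(k) / |k|^2
    //   gather : E_d(x_p)   = Re[ sum_k E_hat_d(k) * exp(+i k.x_p) ]  (gatherPIF)
    void pif_step(std::vector<Particle>& parts, const std::array<int, 3>& N,
                  const std::array<double, 3>& L) {
        const cplx I(0.0, 1.0);
        for (auto& p : parts) p.E = {0.0, 0.0, 0.0};    // reset before accumulating
        for (int i = 0; i < N[0]; ++i)
        for (int j = 0; j < N[1]; ++j)
        for (int k = 0; k < N[2]; ++k) {
            const std::array<double, 3> kv = { mode_k(i, N[0], L[0]),
                                               mode_k(j, N[1], L[1]),
                                               mode_k(k, N[2], L[2]) };
            const double k2 = kv[0]*kv[0] + kv[1]*kv[1] + kv[2]*kv[2];
            if (k2 == 0.0) continue;                    // zero mode carries no field
            cplx rho_hat(0.0, 0.0);
            for (const auto& p : parts) {               // inner reduction of scatterPIF
                const double arg = kv[0]*p.x[0] + kv[1]*p.x[1] + kv[2]*p.x[2];
                rho_hat += p.q * std::exp(-I * arg);
            }
            for (auto& p : parts) {                     // accumulation of gatherPIF
                const double arg = kv[0]*p.x[0] + kv[1]*p.x[1] + kv[2]*p.x[2];
                for (int d = 0; d < 3; ++d) {
                    const cplx Ek = -I * kv[d] * rho_hat / k2;
                    p.E[d] += (Ek * std::exp(I * arg)).real();
                }
            }
        }
    }

Both sums cost O(N_modes x N_particles), which is why the kernels below use Kokkos team policies with an inner TeamThreadRange reduction and a single MPI_Allreduce of the mode coefficients in scatterPIF.
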
diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 99276a82c..5e60fd06a 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -32,6 +32,80 @@ #include "Expression/IpplExpressions.h" #include "Particle/ParticleAttribBase.h" + +//namespace sample { // namespace helps with name resolution in reduction identity +// template< typename T, int N0, int N1, int N2 > +// struct array_type { +// +// using view_type = typename ippl::detail::ViewType::view_type; +// view_type viewTemp{"viewLocal",N0,N1,N2}; +// using mdrange_type3 = Kokkos::MDRangePolicy>; +// +// //KOKKOS_INLINE_FUNCTION // Default constructor - Initialize to 0's +// array_type() { +// Kokkos::deep_copy(viewTemp, 0.0); +// //Kokkos::parallel_for( +// // "array_type default constructor", +// // mdrange_type3({0, 0, 0}, +// // {viewTemp.extent(0), +// // viewTemp.extent(1), +// // viewTemp.extent(2)}), +// // KOKKOS_CLASS_LAMBDA(const size_t i, +// // const size_t j, +// // const size_t k) +// // { +// // viewTemp(i,j,k) = 0.0; +// // }); +// } +// //KOKKOS_INLINE_FUNCTION // Copy Constructor +// array_type(const array_type & rhs) { +// auto rhsView = rhs.viewTemp; +// Kokkos::deep_copy(viewTemp, rhsView); +// //Kokkos::parallel_for( +// // "array_type copy constructor", +// // mdrange_type3({0, 0, 0}, +// // {viewTemp.extent(0), +// // viewTemp.extent(1), +// // viewTemp.extent(2)}), +// // KOKKOS_CLASS_LAMBDA(const size_t i, +// // const size_t j, +// // const size_t k) +// // { +// // viewTemp(i,j,k) = rhsView(i,j,k); +// // }); +// +// } +// KOKKOS_FUNCTION // add operator +// array_type& operator+=(const array_type& src) { +// auto srcView = src.viewTemp; +// Kokkos::parallel_for( +// "array_type operator +=", +// mdrange_type3({0, 0, 0}, +// {viewTemp.extent(0), +// viewTemp.extent(1), +// viewTemp.extent(2)}), +// KOKKOS_CLASS_LAMBDA(const size_t i, +// const size_t j, +// const size_t k) +// { +// viewTemp(i,j,k) += srcView(i,j,k); +// }); +// +// return *this; +// } +// }; +// typedef array_type,34,34,34> ValueType; +//} +// +//namespace Kokkos { //reduction identity must be defined in Kokkos namespace +// template<> +// struct reduction_identity< sample::ValueType > { +// KOKKOS_FORCEINLINE_FUNCTION static sample::ValueType sum() { +// return sample::ValueType(); +// } +// }; +//} + namespace ippl { // ParticleAttrib class definition @@ -127,7 +201,6 @@ namespace ippl { /*! * Assign the same value to the whole attribute. */ - //KOKKOS_INLINE_FUNCTION ParticleAttrib& operator=(T x); /*! @@ -138,17 +211,22 @@ namespace ippl { * @param expr is the expression */ template - //KOKKOS_INLINE_FUNCTION ParticleAttrib& operator=(detail::Expression const& expr); - // // scatter the data from this attribute onto the given Field, using -// // the given Position attribute + // scatter the data from this attribute onto the given Field, using + // the given Position attribute template void scatter(Field& f, const ParticleAttrib, Properties... >& pp) const; - + + template + void + scatterPIF(Field& f, + const ParticleAttrib, Properties... 
>& pp) const; + + template void gather(Field& f, diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index dad8ededf..0b6d8aee8 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -30,6 +30,7 @@ #include "Communicate/DataTypes.h" #include "Utility/IpplTimings.h" + namespace ippl { template @@ -195,6 +196,91 @@ namespace ippl { } + template + template + void ParticleAttrib::scatterPIF(Field& f, + const ParticleAttrib< Vector, Properties... >& pp) + const + { + static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("Scatter"); + IpplTimings::startTimer(scatterTimer); + + using view_type = typename Field::view_type; + using vector_type = typename M::vector_type; + using value_type = typename ParticleAttrib::value_type; + view_type fview = f.getView(); + const int nghost = f.getNghost(); + const FieldLayout& layout = f.getLayout(); + const M& mesh = f.get_mesh(); + const vector_type& dx = mesh.getMeshSpacing(); + const vector_type& origin = mesh.getOrigin(); + const auto& domain = layout.getDomain(); + vector_type length; + + for (unsigned d=0; d < Dim; ++d) { + length[d] = origin[d] + dx[d] * domain[d].length(); + } + + typedef Kokkos::TeamPolicy<> team_policy; + typedef Kokkos::TeamPolicy<>::member_type member_type; + + using view_type_temp = typename detail::ViewType::view_type; + + view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); + + double pi = std::acos(-1.0); + Kokkos::complex imag = {0.0, 1.0}; + + size_t Np = *(this->localNum_mp); + + size_t N = domain[0].length()*domain[1].length()*domain[2].length(); + + Kokkos::parallel_for("ParticleAttrib::scatterPIF compute", + team_policy(N, Kokkos::AUTO), + KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { + const size_t flatIndex = teamMember.league_rank(); + const int i = flatIndex % domain[0].length(); + const int j = (int)(flatIndex / domain[0].length()); + const int k = (int)(flatIndex / (domain[0].length() * domain[1].length())); + + FT reducedValue = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), + [=](const size_t idx, FT& innerReduce) + { + //This can be done with Ippl vectors but problem maybe the + //complex numbers + Kokkos::complex fx = Kokkos::Experimental::cos((2*pi*i*pp(idx)[0])/length[0]) + -imag*Kokkos::Experimental::sin((2*pi*i*pp(idx)[0])/length[0]); + Kokkos::complex fy = Kokkos::Experimental::cos((2*pi*j*pp(idx)[1])/length[1]) + -imag*Kokkos::Experimental::sin((2*pi*j*pp(idx)[1])/length[1]); + Kokkos::complex fz = Kokkos::Experimental::cos((2*pi*k*pp(idx)[2])/length[2]) + -imag*Kokkos::Experimental::sin((2*pi*k*pp(idx)[2])/length[2]); + + const value_type& val = dview_m(idx); + + innerReduce += fx*fy*fz*val; + }, Kokkos::Sum(reducedValue)); + + if(teamMember.team_rank() == 0) { + viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; + } + + } + ); + + IpplTimings::stopTimer(scatterTimer); + + + static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); + IpplTimings::startTimer(scatterAllReduceTimer); + int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); + MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, + MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); + IpplTimings::stopTimer(scatterAllReduceTimer); + + } + + template template void ParticleAttrib::gather(Field& f, @@ -269,6 +355,15 @@ namespace ippl { attrib.scatter(f, pp); } + template + inline + void scatterPIF(const ParticleAttrib& attrib, Field& f, + const ParticleAttrib, 
Properties...>& pp) + { + attrib.scatterPIF(f, pp); + } + + template inline From 0d4f2b23a8264f125261c1b3f1bfe39984bea717 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 4 Nov 2022 16:55:40 +0100 Subject: [PATCH 007/117] PIF implemented but gives nan results. Need to check --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 376 ++---------------- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 96 ++--- src/Particle/ParticleAttrib.h | 15 +- src/Particle/ParticleAttrib.hpp | 134 ++++++- src/Solver/FFTPeriodicPoissonSolver.hpp | 2 + 5 files changed, 214 insertions(+), 409 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index b13b9f156..ecb7ffd0a 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -17,7 +17,6 @@ // #include "Ippl.h" -#include "Solver/FFTPeriodicPoissonSolver.h" // dimension of our positions constexpr unsigned Dim = 3; @@ -26,7 +25,6 @@ constexpr unsigned Dim = 3; typedef ippl::ParticleSpatialLayout PLayout_t; typedef ippl::UniformCartesian Mesh_t; typedef ippl::FieldLayout FieldLayout_t; -typedef ippl::OrthogonalRecursiveBisection ORB; using size_type = ippl::detail::size_type; @@ -40,105 +38,20 @@ template using ParticleAttrib = ippl::ParticleAttrib; typedef Vector Vector_t; -typedef Vector, Dim> CxVector_t; typedef Field Field_t; typedef Field, Dim> CxField_t; typedef Field VField_t; -typedef Field CxVField_t; -typedef ippl::FFTPeriodicPoissonSolver Solver_t; const double pi = std::acos(-1.0); // Test programs have to define this variable for VTK dump purposes extern const char* TestName; -//void dumpVTK(VField_t& E, int nx, int ny, int nz, int iteration, -// double dx, double dy, double dz) { -// -// -// typename VField_t::view_type::host_mirror_type host_view = E.getHostMirror(); -// -// std::stringstream fname; -// fname << "data/ef_"; -// fname << std::setw(4) << std::setfill('0') << iteration; -// fname << ".vtk"; -// -// Kokkos::deep_copy(host_view, E.getView()); -// -// Inform vtkout(NULL, fname.str().c_str(), Inform::OVERWRITE); -// vtkout.precision(10); -// vtkout.setf(std::ios::scientific, std::ios::floatfield); -// -// // start with header -// vtkout << "# vtk DataFile Version 2.0" << endl; -// vtkout << TestName << endl; -// vtkout << "ASCII" << endl; -// vtkout << "DATASET STRUCTURED_POINTS" << endl; -// vtkout << "DIMENSIONS " << nx+3 << " " << ny+3 << " " << nz+3 << endl; -// vtkout << "ORIGIN " << -dx << " " << -dy << " " << -dz << endl; -// vtkout << "SPACING " << dx << " " << dy << " " << dz << endl; -// vtkout << "CELL_DATA " << (nx+2)*(ny+2)*(nz+2) << endl; -// -// vtkout << "VECTORS E-Field float" << endl; -// for (int z=0; z class ChargedParticlesPIF : public ippl::ParticleBase { public: - CxVField_t E_m; CxField_t rho_m; - // ORB - ORB orb; - Vector nr_m; ippl::e_dim_tag decomp_m[Dim]; @@ -149,18 +62,10 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double Q_m; - std::string stype_m; - - std::shared_ptr solver_mp; - double time_m; double rhoNorm_m; - unsigned int loadbalancefreq_m; - - double loadbalancethreshold_m; - public: ParticleAttrib q; // charge @@ -208,101 +113,9 @@ class ChargedParticlesPIF : public ippl::ParticleBase { setBCAllPeriodic(); } - //void updateLayout(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, - // bool& isFirstRepartition) { - // // Update local fields - // static IpplTimings::TimerRef tupdateLayout = 
IpplTimings::getTimer("updateLayout"); - // IpplTimings::startTimer(tupdateLayout); - // this->E_m.updateLayout(fl); - // this->rho_m.updateLayout(fl); - - // // Update layout with new FieldLayout - // PLayout& layout = this->getLayout(); - // layout.updateLayout(fl, mesh); - // IpplTimings::stopTimer(tupdateLayout); - // static IpplTimings::TimerRef tupdatePLayout = IpplTimings::getTimer("updatePB"); - // IpplTimings::startTimer(tupdatePLayout); - // if(!isFirstRepartition) { - // layout.update(*this, buffer); - // } - // IpplTimings::stopTimer(tupdatePLayout); - //} - - //void initializeORB(FieldLayout_t& fl, Mesh_t& mesh) { - // orb.initialize(fl, mesh, rho_m); - //} - - //void repartition(FieldLayout_t& fl, Mesh_t& mesh, ChargedParticlesPIF& buffer, - // bool& isFirstRepartition) { - // // Repartition the domains - // bool res = orb.binaryRepartition(this->R, fl, isFirstRepartition); - - // if (res != true) { - // std::cout << "Could not repartition!" << std::endl; - // return; - // } - // // Update - // this->updateLayout(fl, mesh, buffer, isFirstRepartition); - // this->solver_mp->setRhs(rho_m); - //} - - //bool balance(size_type totalP, const unsigned int nstep){ - // if(std::strcmp(TestName,"UniformPlasmaTest") == 0) { - // return (nstep % loadbalancefreq_m == 0); - // } - // else { - // int local = 0; - // std::vector res(Ippl::Comm->size()); - // double equalPart = (double) totalP / Ippl::Comm->size(); - // double dev = std::abs((double)this->getLocalNum() - equalPart) / totalP; - // if (dev > loadbalancethreshold_m) - // local = 1; - // MPI_Allgather(&local, 1, MPI_INT, res.data(), 1, MPI_INT, Ippl::getComm()); - - // for (unsigned int i = 0; i < res.size(); i++) { - // if (res[i] == 1) - // return true; - // } - // return false; - // } - //} - - //void gatherStatistics(size_type totalP) { - // std::vector imb(Ippl::Comm->size()); - // double equalPart = (double) totalP / Ippl::Comm->size(); - // double dev = (std::abs((double)this->getLocalNum() - equalPart) - // / totalP) * 100.0; - // MPI_Gather(&dev, 1, MPI_DOUBLE, imb.data(), 1, MPI_DOUBLE, 0, - // Ippl::getComm()); - // - // if (Ippl::Comm->rank() == 0) { - // std::stringstream fname; - // fname << "data/LoadBalance_"; - // fname << Ippl::Comm->size(); - // fname << ".csv"; - - // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - // csvout.precision(5); - // csvout.setf(std::ios::scientific, std::ios::floatfield); - - // if(time_m == 0.0) { - // csvout << "time, rank, imbalance percentage" << endl; - // } - - // for(int r=0; r < Ippl::Comm->size(); ++r) { - // csvout << time_m << " " - // << r << " " - // << imb[r] << endl; - // } - // } - - // Ippl::Comm->barrier(); - // - //} - void gather() { - gatherPIF(this->E, E_m, this->R); + gatherPIF(this->E, rho_m, this->R); } @@ -316,145 +129,60 @@ class ChargedParticlesPIF : public ippl::ParticleBase { } - void initSolver() { - - Inform m("solver "); - - } - - - - //void dumpData() { - - // auto Pview = P.getView(); - - // double Energy = 0.0; - - // Kokkos::parallel_reduce("Particle Energy", this->getLocalNum(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = dot(Pview(i), Pview(i)).apply(); - // valL += myVal; - // }, Kokkos::Sum(Energy)); - - // Energy *= 0.5; - // double gEnergy = 0.0; - - // MPI_Reduce(&Energy, &gEnergy, 1, - // MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - - - // const int nghostE = E_m.getNghost(); - // auto Eview = E_m.getView(); - // Vector_t normE; - // using mdrange_type = Kokkos::MDRangePolicy>; - - // for (unsigned d=0; 
d(temp)); - // double globaltemp = 0.0; - // MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - // normE[d] = std::sqrt(globaltemp); - // } - - // if (Ippl::Comm->rank() == 0) { - // std::stringstream fname; - // fname << "data/ParticleField_"; - // fname << Ippl::Comm->size(); - // fname << ".csv"; - - // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - // csvout.precision(10); - // csvout.setf(std::ios::scientific, std::ios::floatfield); - - // if(time_m == 0.0) { - // csvout << "time, Kinetic energy, Rho_norm2, Ex_norm2, Ey_norm2, Ez_norm2" << endl; - // } - - // csvout << time_m << " " - // << gEnergy << " " - // << rhoNorm_m << " " - // << normE[0] << " " - // << normE[1] << " " - // << normE[2] << endl; - // } - // Ippl::Comm->barrier(); - //} + void dumpLandau(size_type totalP) { + + auto Eview = E.getView(); - //void dumpLandau() { + double fieldEnergy, ExAmp; + double temp = 0.0; - // const int nghostE = E_m.getNghost(); - // auto Eview = E_m.getView(); - // double fieldEnergy, ExAmp; - // using mdrange_type = Kokkos::MDRangePolicy>; + Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = Eview(i)[0] * Eview(i)[0]; + valL += myVal; + }, Kokkos::Sum(temp)); - // double temp = 0.0; - // Kokkos::parallel_reduce("Ex inner product", - // mdrange_type({nghostE, nghostE, nghostE}, - // {Eview.extent(0) - nghostE, - // Eview.extent(1) - nghostE, - // Eview.extent(2) - nghostE}), - // KOKKOS_LAMBDA(const size_t i, const size_t j, - // const size_t k, double& valL) - // { - // double myVal = std::pow(Eview(i, j, k)[0], 2); - // valL += myVal; - // }, Kokkos::Sum(temp)); - // double globaltemp = 0.0; - // MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - // fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + fieldEnergy = globaltemp * volume / totalP ; - // double tempMax = 0.0; - // Kokkos::parallel_reduce("Ex max norm", - // mdrange_type({nghostE, nghostE, nghostE}, - // {Eview.extent(0) - nghostE, - // Eview.extent(1) - nghostE, - // Eview.extent(2) - nghostE}), - // KOKKOS_LAMBDA(const size_t i, const size_t j, - // const size_t k, double& valL) - // { - // double myVal = std::fabs(Eview(i, j, k)[0]); - // if(myVal > valL) valL = myVal; - // }, Kokkos::Max(tempMax)); - // ExAmp = 0.0; - // MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), + KOKKOS_LAMBDA(const size_t i, double& valL) + { + double myVal = std::fabs(Eview(i)[0]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + ExAmp = 0.0; + MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - // if (Ippl::Comm->rank() == 0) { - // std::stringstream fname; - // fname << "data/FieldLandau_"; - // fname << Ippl::Comm->size(); - // fname << ".csv"; + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->size(); + fname << ".csv"; - // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - // csvout.precision(10); - // csvout.setf(std::ios::scientific, std::ios::floatfield); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, 
std::ios::floatfield); - // if(time_m == 0.0) { - // csvout << "time, Ex_field_energy, Ex_max_norm" << endl; - // } + if(time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } - // csvout << time_m << " " - // << fieldEnergy << " " - // << ExAmp << endl; + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; - // } - // - // Ippl::Comm->barrier(); - //} - // + } + + Ippl::Comm->barrier(); + } + //void dumpBumponTail() { // const int nghostE = E_m.getNghost(); @@ -542,24 +270,6 @@ class ChargedParticlesPIF : public ippl::ParticleBase { // } // Ippl::Comm->barrier(); //} - // - //void dumpLocalDomains(const FieldLayout_t& fl, const unsigned int step) { - - // if (Ippl::Comm->rank() == 0) { - // const typename FieldLayout_t::host_mirror_type domains = fl.getHostLocalDomains(); - // std::ofstream myfile; - // myfile.open("data/domains" + std::to_string(step) + ".txt"); - // for (unsigned int i = 0; i < domains.size(); ++i) { - // myfile << domains[i][0].first() << " " << domains[i][1].first() << " " << domains[i][2].first() << " " - // << domains[i][0].first() << " " << domains[i][1].last() << " " << domains[i][2].first() << " " - // << domains[i][0].last() << " " << domains[i][1].first() << " " << domains[i][2].first() << " " - // << domains[i][0].first() << " " << domains[i][1].first() << " " << domains[i][2].last() - // << "\n"; - // } - // myfile.close(); - // } - // Ippl::Comm->barrier(); - //} private: void setBCAllPeriodic() { diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 0102e55dc..054ff5f18 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -9,7 +9,7 @@ // Example: // srun ./LandauDampingPIF 128 128 128 10000 10 --info 10 // -// Copyright (c) 2021, Sriramkrishnan Muralikrishnan, +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Paul Scherrer Institut, Villigen PSI, Switzerland // All rights reserved // @@ -158,8 +158,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick"); static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift"); - static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update"); - static IpplTimings::TimerRef SolveTimer = IpplTimings::getTimer("solve"); + static IpplTimings::TimerRef BCTimer = IpplTimings::getTimer("particleBC"); IpplTimings::startTimer(mainTimer); @@ -197,7 +196,7 @@ int main(int argc, char *argv[]){ Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; - const double dt = 0.5*dx; + const double dt = 0.05;//0.5*dx; const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); @@ -210,12 +209,8 @@ int main(int argc, char *argv[]){ P->nr_m = nr; - P->E_m.initialize(mesh, FL); P->rho_m.initialize(mesh, FL); - bunch_type bunchBuffer(PL); - - P->initSolver(); P->time_m = 0.0; IpplTimings::startTimer(particleCreation); @@ -257,64 +252,53 @@ int main(int argc, char *argv[]){ P->scatter(); - //IpplTimings::startTimer(SolveTimer); - //P->solver_mp->solve(); - //IpplTimings::stopTimer(SolveTimer); - - //P->gather(); - - //IpplTimings::startTimer(dumpDataTimer); - //P->dumpLandau(); - ////P->dumpLocalDomains(FL, 0); - //IpplTimings::stopTimer(dumpDataTimer); + P->gather(); - //// begin main timestep loop - //msg << "Starting iterations ..." 
<< endl; - //for (unsigned int it=0; itdumpLandau(totalP); + IpplTimings::stopTimer(dumpDataTimer); - // // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration - // // Here, we assume a constant charge-to-mass ratio of -1 for - // // all the particles hence eliminating the need to store mass as - // // an attribute - // // kick + // begin main timestep loop + msg << "Starting iterations ..." << endl; + for (unsigned int it=0; itP = P->P - 0.5 * dt * P->E; - // IpplTimings::stopTimer(PTimer); + // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration + // Here, we assume a constant charge-to-mass ratio of -1 for + // all the particles hence eliminating the need to store mass as + // an attribute + // kick - // //drift - // IpplTimings::startTimer(RTimer); - // P->R = P->R + dt * P->P; - // IpplTimings::stopTimer(RTimer); + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); - // //Since the particles have moved spatially update them to correct processors - // IpplTimings::startTimer(updateTimer); - // PL.update(*P, bunchBuffer); - // IpplTimings::stopTimer(updateTimer); + //drift + IpplTimings::startTimer(RTimer); + P->R = P->R + dt * P->P; + IpplTimings::stopTimer(RTimer); + //Apply particle BC + IpplTimings::startTimer(BCTimer); + PL.applyBC(P->R, PL.getRegionLayout().getDomain()); + IpplTimings::stopTimer(BCTimer); - // //scatter the charge onto the underlying grid - // P->scatter(totalP, it+1, hr); + //scatter the charge onto the underlying grid + P->scatter(); - // //Field solve - // IpplTimings::startTimer(SolveTimer); - // P->solver_mp->solve(); - // IpplTimings::stopTimer(SolveTimer); + // Solve for and gather E field + P->gather(); - // // gather E field - // P->gather(); + //kick + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); - // //kick - // IpplTimings::startTimer(PTimer); - // P->P = P->P - 0.5 * dt * P->E; - // IpplTimings::stopTimer(PTimer); - - // P->time_m += dt; - // IpplTimings::startTimer(dumpDataTimer); - // P->dumpLandau(); - // IpplTimings::stopTimer(dumpDataTimer); - // msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; - //} + P->time_m += dt; + IpplTimings::startTimer(dumpDataTimer); + P->dumpLandau(totalP); + IpplTimings::stopTimer(dumpDataTimer); + msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; + } msg << "LandauDamping: End." << endl; IpplTimings::stopTimer(mainTimer); diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 5e60fd06a..480e8e5ed 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -106,6 +106,15 @@ // }; //} +namespace Kokkos { //reduction identity must be defined in Kokkos namespace + template<> + struct reduction_identity< ippl::Vector > { + KOKKOS_FORCEINLINE_FUNCTION static ippl::Vector sum() { + return ippl::Vector(); + } + }; +} + namespace ippl { // ParticleAttrib class definition @@ -226,12 +235,16 @@ namespace ippl { scatterPIF(Field& f, const ParticleAttrib, Properties... >& pp) const; - template void gather(Field& f, const ParticleAttrib, Properties...>& pp); + template + void + gatherPIF(Field& f, + const ParticleAttrib, Properties... 
>& pp) const; + T sum(); T max(); T min(); diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 0b6d8aee8..f0f340ab0 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -213,17 +213,19 @@ namespace ippl { const FieldLayout& layout = f.getLayout(); const M& mesh = f.get_mesh(); const vector_type& dx = mesh.getMeshSpacing(); - const vector_type& origin = mesh.getOrigin(); const auto& domain = layout.getDomain(); - vector_type length; + vector_type Len; + Vector N; for (unsigned d=0; d < Dim; ++d) { - length[d] = origin[d] + dx[d] * domain[d].length(); + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; } - + typedef Kokkos::TeamPolicy<> team_policy; typedef Kokkos::TeamPolicy<>::member_type member_type; + using view_type_temp = typename detail::ViewType::view_type; view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); @@ -233,32 +235,31 @@ namespace ippl { size_t Np = *(this->localNum_mp); - size_t N = domain[0].length()*domain[1].length()*domain[2].length(); + size_t flatN = N[0]*N[1]*N[2]; Kokkos::parallel_for("ParticleAttrib::scatterPIF compute", - team_policy(N, Kokkos::AUTO), + team_policy(flatN, Kokkos::AUTO), KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { const size_t flatIndex = teamMember.league_rank(); - const int i = flatIndex % domain[0].length(); - const int j = (int)(flatIndex / domain[0].length()); - const int k = (int)(flatIndex / (domain[0].length() * domain[1].length())); + const int i = flatIndex % N[0]; + const int j = (int)(flatIndex / N[0]); + const int k = (int)(flatIndex / (N[0] * N[1])); FT reducedValue = 0.0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), [=](const size_t idx, FT& innerReduce) { - //This can be done with Ippl vectors but problem maybe the - //complex numbers - Kokkos::complex fx = Kokkos::Experimental::cos((2*pi*i*pp(idx)[0])/length[0]) - -imag*Kokkos::Experimental::sin((2*pi*i*pp(idx)[0])/length[0]); - Kokkos::complex fy = Kokkos::Experimental::cos((2*pi*j*pp(idx)[1])/length[1]) - -imag*Kokkos::Experimental::sin((2*pi*j*pp(idx)[1])/length[1]); - Kokkos::complex fz = Kokkos::Experimental::cos((2*pi*k*pp(idx)[2])/length[2]) - -imag*Kokkos::Experimental::sin((2*pi*k*pp(idx)[2])/length[2]); - + Vector iVec = {i, j, k}; + vector_type kVec; + double arg=0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + arg += kVec[d]*pp(idx)[d]; + } const value_type& val = dview_m(idx); - innerReduce += fx*fy*fz*val; + innerReduce += (Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { @@ -339,6 +340,92 @@ namespace ippl { IpplTimings::stopTimer(gatherTimer); } + template + template + void ParticleAttrib::gatherPIF(Field& f, + const ParticleAttrib< Vector, Properties... 
>& pp) + const + { + static IpplTimings::TimerRef gatherTimer = IpplTimings::getTimer("Gather"); + IpplTimings::startTimer(gatherTimer); + + using view_type = typename Field::view_type; + using vector_type = typename M::vector_type; + using value_type = typename ParticleAttrib::value_type; + view_type fview = f.getView(); + const int nghost = f.getNghost(); + const FieldLayout& layout = f.getLayout(); + const M& mesh = f.get_mesh(); + const vector_type& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + vector_type Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + typedef Kokkos::TeamPolicy<> team_policy; + typedef Kokkos::TeamPolicy<>::member_type member_type; + + double pi = std::acos(-1.0); + Kokkos::complex imag = {0.0, 1.0}; + + size_t Np = *(this->localNum_mp); + + size_t flatN = N[0]*N[1]*N[2]; + + Kokkos::parallel_for("ParticleAttrib::gatherPIF", + team_policy(Np, Kokkos::AUTO), + KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { + const size_t idx = teamMember.league_rank(); + + value_type reducedValue = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), + [=](const size_t flatIndex, value_type& innerReduce) + { + const int i = flatIndex % N[0]; + const int j = (int)(flatIndex / N[0]); + const int k = (int)(flatIndex / (N[0] * N[1])); + + Vector iVec = {i, j, k}; + vector_type kVec; + double Dr = 0.0, arg=0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + bool notMid = (iVec[d] != (N[d]/2)); + //For the noMid part see + //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1 + kVec[d] = notMid * 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + arg += kVec[d]*pp(idx)[d]; + } + + FT Ek; + value_type Ex; + for(size_t d = 0; d < Dim; ++d) { + Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) / Dr); + //Inverse Fourier transform when the lhs is real + Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) + - Ek.imag() * Kokkos::Experimental::sin(arg)); + } + + innerReduce += Ex; + }, Kokkos::Sum(reducedValue)); + + teamMember.team_barrier(); + + if(teamMember.team_rank() == 0) { + dview_m(idx) = reducedValue; + } + + } + ); + + IpplTimings::stopTimer(gatherTimer); + + } /* @@ -373,6 +460,15 @@ namespace ippl { attrib.gather(f, pp); } + template + inline + void gatherPIF(const ParticleAttrib& attrib, Field& f, + const ParticleAttrib, Properties...>& pp) + { + attrib.gatherPIF(f, pp); + } + + #define DefineParticleReduction(fun, name, op, MPI_Op) \ template \ T ParticleAttrib::name() { \ diff --git a/src/Solver/FFTPeriodicPoissonSolver.hpp b/src/Solver/FFTPeriodicPoissonSolver.hpp index 015400e9a..e6f690942 100644 --- a/src/Solver/FFTPeriodicPoissonSolver.hpp +++ b/src/Solver/FFTPeriodicPoissonSolver.hpp @@ -158,6 +158,8 @@ namespace ippl { const double Len = rmax[d] - origin[d]; bool shift = (iVec[d] > (N[d]/2)); bool notMid = (iVec[d] != (N[d]/2)); + //For the noMid part see + //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1 kVec[d] = notMid * 2 * pi / Len * (iVec[d] - shift * N[d]); } From 676ba0738fd94c117dbf77b08755f27b74a05d31 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 14 Nov 2022 15:36:34 +0100 Subject: [PATCH 008/117] Bugs corrected in PIF and it seems to be working. 
Need to check more --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 56 ++++++++- alpine/ElectrostaticPIC/LandauDamping.cpp | 6 +- src/Particle/ParticleAttrib.hpp | 124 +++++++++++++++++-- 3 files changed, 171 insertions(+), 15 deletions(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index e64417e19..53653f3dc 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -508,7 +508,61 @@ class ChargedParticles : public ippl::ParticleBase { Ippl::Comm->barrier(); } - + + void dumpLandauParticle(size_type totalP) { + + auto Eview = E.getView(); + + double fieldEnergy, ExAmp; + double temp = 0.0; + + Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = Eview(i)[0] * Eview(i)[0]; + valL += myVal; + }, Kokkos::Sum(temp)); + + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + fieldEnergy = globaltemp * volume / totalP ; + + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), + KOKKOS_LAMBDA(const size_t i, double& valL) + { + double myVal = std::fabs(Eview(i)[0]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + ExAmp = 0.0; + MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } + + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; + + } + + Ippl::Comm->barrier(); + } + + void dumpBumponTail() { const int nghostE = E_m.getNghost(); diff --git a/alpine/ElectrostaticPIC/LandauDamping.cpp b/alpine/ElectrostaticPIC/LandauDamping.cpp index e78dd91bf..2cd0acbcd 100644 --- a/alpine/ElectrostaticPIC/LandauDamping.cpp +++ b/alpine/ElectrostaticPIC/LandauDamping.cpp @@ -209,7 +209,7 @@ int main(int argc, char *argv[]){ Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; - const double dt = 0.5*dx; + const double dt = 0.05;//0.5*dx; const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); @@ -331,7 +331,7 @@ int main(int argc, char *argv[]){ P->gatherCIC(); IpplTimings::startTimer(dumpDataTimer); - P->dumpLandau(); + P->dumpLandauParticle(totalP); P->gatherStatistics(totalP); //P->dumpLocalDomains(FL, 0); IpplTimings::stopTimer(dumpDataTimer); @@ -390,7 +390,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - P->dumpLandau(); + P->dumpLandauParticle(totalP); P->gatherStatistics(totalP); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index f0f340ab0..9aa7f8ca5 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -202,6 +202,8 @@ namespace ippl { const ParticleAttrib< Vector, Properties... 
>& pp) const { + //Inform msg("scatterPIF"); + static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("Scatter"); IpplTimings::startTimer(scatterTimer); @@ -241,9 +243,11 @@ namespace ippl { team_policy(flatN, Kokkos::AUTO), KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { const size_t flatIndex = teamMember.league_rank(); - const int i = flatIndex % N[0]; - const int j = (int)(flatIndex / N[0]); + const int k = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (k * N[0] * N[1]); + const int i = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); FT reducedValue = 0.0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), @@ -253,8 +257,9 @@ namespace ippl { vector_type kVec; double arg=0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * iVec[d]; arg += kVec[d]*pp(idx)[d]; } const value_type& val = dview_m(idx); @@ -271,6 +276,17 @@ namespace ippl { IpplTimings::stopTimer(scatterTimer); + //double sum = 0.0; + //Kokkos::parallel_reduce("inner product complex", f.getRangePolicy(), + // KOKKOS_LAMBDA(const size_t i, const size_t j, const size_t k, double& val) { + // val += std::pow(viewLocal(i, j, k).real(), 2) + std::pow(viewLocal(i, j, k).imag(), 2); + // }, + // Kokkos::Sum(sum) + //); + //double globalSum = 0; + //MPI_Allreduce(&sum, &globalSum, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + //msg << "rho inner product before all reduce: " << globalSum << endl; static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); IpplTimings::startTimer(scatterAllReduceTimer); @@ -279,6 +295,17 @@ namespace ippl { MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); IpplTimings::stopTimer(scatterAllReduceTimer); + //sum = 0.0; + //Kokkos::parallel_reduce("inner product complex2", f.getRangePolicy(), + // KOKKOS_LAMBDA(const size_t i, const size_t j, const size_t k, double& val) { + // val += std::pow(fview(i, j, k).real(), 2) + std::pow(fview(i, j, k).imag(), 2); + // }, + // Kokkos::Sum(sum) + //); + //MPI_Allreduce(&sum, &globalSum, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + // + //msg << "rho inner product after all reduce: " << globalSum << endl; + } @@ -346,6 +373,7 @@ namespace ippl { const ParticleAttrib< Vector, Properties... 
>& pp) const { + //Inform msg("gatherPIF"); static IpplTimings::TimerRef gatherTimer = IpplTimings::getTimer("Gather"); IpplTimings::startTimer(gatherTimer); @@ -385,27 +413,31 @@ namespace ippl { Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), [=](const size_t flatIndex, value_type& innerReduce) { - const int i = flatIndex % N[0]; - const int j = (int)(flatIndex / N[0]); const int k = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (k * N[0] * N[1]); + const int i = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); Vector iVec = {i, j, k}; vector_type kVec; double Dr = 0.0, arg=0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - bool notMid = (iVec[d] != (N[d]/2)); + //bool shift = (iVec[d] > (N[d]/2)); + //bool notMid = (iVec[d] != (N[d]/2)); //For the noMid part see //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1 - kVec[d] = notMid * 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = notMid * 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * iVec[d]; Dr += kVec[d] * kVec[d]; arg += kVec[d]*pp(idx)[d]; } - FT Ek; + FT Ek = 0.0; value_type Ex; for(size_t d = 0; d < Dim; ++d) { - Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) / Dr); + if(Dr != 0.0) + Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) / Dr); + //Inverse Fourier transform when the lhs is real Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) - Ek.imag() * Kokkos::Experimental::sin(arg)); @@ -423,8 +455,78 @@ namespace ippl { } ); + + //Kokkos::parallel_for("ParticleAttrib::gatherPIF", + // team_policy(Np, Kokkos::AUTO), + // KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { + // const size_t idx = teamMember.league_rank(); + + // for(size_t gd = 0; gd < Dim; ++gd) { + // double reducedValue = 0.0; + // Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), + // [=](const size_t flatIndex, double& innerReduce) + // { + // const int i = flatIndex % N[0]; + // const int j = (int)(flatIndex / N[0]); + // const int k = (int)(flatIndex / (N[0] * N[1])); + + // Vector iVec = {i, j, k}; + // vector_type kVec; + // double Dr = 0.0, arg=0.0; + // for(size_t d = 0; d < Dim; ++d) { + // bool shift = (iVec[d] > (N[d]/2)); + // bool notMid = (iVec[d] != (N[d]/2)); + // //For the noMid part see + // //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1 + // kVec[d] = notMid * 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + // Dr += kVec[d] * kVec[d]; + // arg += kVec[d]*pp(idx)[d]; + // } + + // FT Ek; + // double Ex; + // //for(size_t d = 0; d < Dim; ++d) { + // if(Dr != 0.0) + // Ek = -(imag * kVec[gd] * fview(i+nghost,j+nghost,k+nghost) / Dr); + // else + // Ek = 0.0; + // + // //Inverse Fourier transform when the lhs is real + // Ex = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) + // - Ek.imag() * Kokkos::Experimental::sin(arg)); + // //} + // + // innerReduce += Ex; + // }, reducedValue); + + // teamMember.team_barrier(); + + // if(teamMember.team_rank() == 0) { + // dview_m(idx)[gd] = reducedValue; + // } + + // } + // } + //); + + IpplTimings::stopTimer(gatherTimer); + //double Energy = 0.0; + + //Kokkos::parallel_reduce("E Energy", Np, + // KOKKOS_CLASS_LAMBDA(const int i, double& valL){ + // double myVal = dot(dview_m(i), dview_m(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(Energy)); + + //double gEnergy = 0.0; + + //MPI_Reduce(&Energy, &gEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + //msg << "E energy in gatherPIF: " << gEnergy << endl; + + 
} From 0a8f787b2991d43740755156fc51922848832c75 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 17 Nov 2022 05:25:19 +0100 Subject: [PATCH 009/117] [-K K] implementation --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 63 +++++++++ alpine/ElectrostaticPIC/LandauDamping.cpp | 4 +- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 63 +++++++++ alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 4 +- src/Particle/ParticleAttrib.hpp | 122 +++--------------- 5 files changed, 149 insertions(+), 107 deletions(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 53653f3dc..8e57432e1 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -563,6 +563,69 @@ class ChargedParticles : public ippl::ParticleBase { } + void dumpEnergy(size_type totalP) { + + auto Eview = E.getView(); + + double potentialEnergy, kineticEnergy; + double temp = 0.0; + + Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Eview(i), Eview(i)).apply(); + valL += myVal; + }, Kokkos::Sum(temp)); + + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + potentialEnergy = 0.5 * globaltemp * volume / totalP ; + + + auto Pview = P.getView(); + auto qView = q.getView(); + + temp = 0.0; + + Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Pview(i), Pview(i)).apply(); + myVal *= -qView(i); + valL += myVal; + }, Kokkos::Sum(temp)); + + temp *= 0.5; + globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + kineticEnergy = globaltemp; + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/Energy_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + } + + csvout << time_m << " " + << potentialEnergy << " " + << kineticEnergy << " " + << potentialEnergy + kineticEnergy << endl; + + } + + Ippl::Comm->barrier(); + } + + void dumpBumponTail() { const int nghostE = E_m.getNghost(); diff --git a/alpine/ElectrostaticPIC/LandauDamping.cpp b/alpine/ElectrostaticPIC/LandauDamping.cpp index 2cd0acbcd..85448c342 100644 --- a/alpine/ElectrostaticPIC/LandauDamping.cpp +++ b/alpine/ElectrostaticPIC/LandauDamping.cpp @@ -177,6 +177,7 @@ int main(int argc, char *argv[]){ const size_type totalP = std::atoll(argv[4]); const unsigned int nt = std::atoi(argv[5]); + const double dt = std::atof(argv[9]);;//0.5*dx; msg << "Landau damping" << endl @@ -209,7 +210,6 @@ int main(int argc, char *argv[]){ Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; - const double dt = 0.05;//0.5*dx; const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); @@ -332,6 +332,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpDataTimer); P->dumpLandauParticle(totalP); + P->dumpEnergy(totalP); P->gatherStatistics(totalP); //P->dumpLocalDomains(FL, 0); IpplTimings::stopTimer(dumpDataTimer); @@ -391,6 +392,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); 
P->dumpLandauParticle(totalP); + P->dumpEnergy(totalP); P->gatherStatistics(totalP); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index ecb7ffd0a..5e63bba8f 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -182,6 +182,69 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Ippl::Comm->barrier(); } + + + void dumpEnergy(size_type totalP) { + + auto Eview = E.getView(); + + double potentialEnergy, kineticEnergy; + double temp = 0.0; + + Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Eview(i), Eview(i)).apply(); + valL += myVal; + }, Kokkos::Sum(temp)); + + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + potentialEnergy = 0.5 * globaltemp * volume / totalP ; + + + auto Pview = P.getView(); + auto qView = q.getView(); + + temp = 0.0; + + Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Pview(i), Pview(i)).apply(); + myVal *= -qView(i); + valL += myVal; + }, Kokkos::Sum(temp)); + + temp *= 0.5; + globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + kineticEnergy = globaltemp; + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/Energy_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + } + + csvout << time_m << " " + << potentialEnergy << " " + << kineticEnergy << " " + << potentialEnergy + kineticEnergy << endl; + + } + + Ippl::Comm->barrier(); + } //void dumpBumponTail() { diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 054ff5f18..8db6b9dce 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -164,6 +164,7 @@ int main(int argc, char *argv[]){ const size_type totalP = std::atoll(argv[4]); const unsigned int nt = std::atoi(argv[5]); + const double dt = std::atof(argv[6]); msg << "Landau damping" << endl @@ -196,7 +197,6 @@ int main(int argc, char *argv[]){ Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; - const double dt = 0.05;//0.5*dx; const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); @@ -256,6 +256,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpDataTimer); P->dumpLandau(totalP); + P->dumpEnergy(totalP); IpplTimings::stopTimer(dumpDataTimer); // begin main timestep loop @@ -296,6 +297,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); P->dumpLandau(totalP); + P->dumpEnergy(totalP); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 9aa7f8ca5..ee0c2084e 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -257,9 +257,10 
@@ namespace ippl { vector_type kVec; double arg=0.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - kVec[d] = 2 * pi / Len[d] * iVec[d]; + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * iVec[d]; + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); arg += kVec[d]*pp(idx)[d]; } const value_type& val = dview_m(idx); @@ -276,18 +277,6 @@ namespace ippl { IpplTimings::stopTimer(scatterTimer); - //double sum = 0.0; - //Kokkos::parallel_reduce("inner product complex", f.getRangePolicy(), - // KOKKOS_LAMBDA(const size_t i, const size_t j, const size_t k, double& val) { - // val += std::pow(viewLocal(i, j, k).real(), 2) + std::pow(viewLocal(i, j, k).imag(), 2); - // }, - // Kokkos::Sum(sum) - //); - //double globalSum = 0; - //MPI_Allreduce(&sum, &globalSum, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - - //msg << "rho inner product before all reduce: " << globalSum << endl; - static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); IpplTimings::startTimer(scatterAllReduceTimer); int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); @@ -295,17 +284,6 @@ namespace ippl { MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); IpplTimings::stopTimer(scatterAllReduceTimer); - //sum = 0.0; - //Kokkos::parallel_reduce("inner product complex2", f.getRangePolicy(), - // KOKKOS_LAMBDA(const size_t i, const size_t j, const size_t k, double& val) { - // val += std::pow(fview(i, j, k).real(), 2) + std::pow(fview(i, j, k).imag(), 2); - // }, - // Kokkos::Sum(sum) - //); - //MPI_Allreduce(&sum, &globalSum, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - // - //msg << "rho inner product after all reduce: " << globalSum << endl; - } @@ -422,12 +400,10 @@ namespace ippl { vector_type kVec; double Dr = 0.0, arg=0.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //bool notMid = (iVec[d] != (N[d]/2)); - //For the noMid part see - //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1 - //kVec[d] = notMid * 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - kVec[d] = 2 * pi / Len[d] * iVec[d]; + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * iVec[d]; + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); Dr += kVec[d] * kVec[d]; arg += kVec[d]*pp(idx)[d]; } @@ -435,12 +411,17 @@ namespace ippl { FT Ek = 0.0; value_type Ex; for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) + if(Dr != 0.0) { Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) / Dr); + } - //Inverse Fourier transform when the lhs is real - Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) - - Ek.imag() * Kokkos::Experimental::sin(arg)); + //Inverse Fourier transform when the lhs is real. 
Use when + //we choose k \in [0 K) instead of from [-K/2+1 K/2] + //Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) + // - Ek.imag() * Kokkos::Experimental::sin(arg)); + Ek *= (Kokkos::Experimental::cos(arg) + + imag * Kokkos::Experimental::sin(arg)); + Ex[d] = Ek.real(); } innerReduce += Ex; @@ -456,77 +437,8 @@ namespace ippl { ); - //Kokkos::parallel_for("ParticleAttrib::gatherPIF", - // team_policy(Np, Kokkos::AUTO), - // KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { - // const size_t idx = teamMember.league_rank(); - - // for(size_t gd = 0; gd < Dim; ++gd) { - // double reducedValue = 0.0; - // Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), - // [=](const size_t flatIndex, double& innerReduce) - // { - // const int i = flatIndex % N[0]; - // const int j = (int)(flatIndex / N[0]); - // const int k = (int)(flatIndex / (N[0] * N[1])); - - // Vector iVec = {i, j, k}; - // vector_type kVec; - // double Dr = 0.0, arg=0.0; - // for(size_t d = 0; d < Dim; ++d) { - // bool shift = (iVec[d] > (N[d]/2)); - // bool notMid = (iVec[d] != (N[d]/2)); - // //For the noMid part see - // //https://math.mit.edu/~stevenj/fft-deriv.pdf Algorithm 1 - // kVec[d] = notMid * 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - // Dr += kVec[d] * kVec[d]; - // arg += kVec[d]*pp(idx)[d]; - // } - - // FT Ek; - // double Ex; - // //for(size_t d = 0; d < Dim; ++d) { - // if(Dr != 0.0) - // Ek = -(imag * kVec[gd] * fview(i+nghost,j+nghost,k+nghost) / Dr); - // else - // Ek = 0.0; - // - // //Inverse Fourier transform when the lhs is real - // Ex = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) - // - Ek.imag() * Kokkos::Experimental::sin(arg)); - // //} - // - // innerReduce += Ex; - // }, reducedValue); - - // teamMember.team_barrier(); - - // if(teamMember.team_rank() == 0) { - // dview_m(idx)[gd] = reducedValue; - // } - - // } - // } - //); - - IpplTimings::stopTimer(gatherTimer); - //double Energy = 0.0; - - //Kokkos::parallel_reduce("E Energy", Np, - // KOKKOS_CLASS_LAMBDA(const int i, double& valL){ - // double myVal = dot(dview_m(i), dview_m(i)).apply(); - // valL += myVal; - // }, Kokkos::Sum(Energy)); - - //double gEnergy = 0.0; - - //MPI_Reduce(&Energy, &gEnergy, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - - //msg << "E energy in gatherPIF: " << gEnergy << endl; - - } From 2ca3462069c4e87bba52cd27a46c8a308469f450 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 2 Dec 2022 09:56:51 +0100 Subject: [PATCH 010/117] Version which has correct energy error convergence --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 22 ++--- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 82 +++++++++++++++++-- 2 files changed, 85 insertions(+), 19 deletions(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 8e57432e1..61730648d 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -563,24 +563,26 @@ class ChargedParticles : public ippl::ParticleBase { } - void dumpEnergy(size_type totalP) { + void dumpEnergy(size_type /*totalP*/) { - auto Eview = E.getView(); double potentialEnergy, kineticEnergy; + //auto Eview = E.getView(); double temp = 0.0; - Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = dot(Eview(i), Eview(i)).apply(); - valL += myVal; - }, Kokkos::Sum(temp)); + //Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), + // KOKKOS_LAMBDA(const int i, 
double& valL){ + // double myVal = dot(Eview(i), Eview(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(temp)); double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - potentialEnergy = 0.5 * globaltemp * volume / totalP ; + //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //potentialEnergy = 0.5 * globaltemp * volume / totalP ; + rho_m = dot(E_m, E_m); + potentialEnergy = 0.5 * hr_m[0] * hr_m[1] * hr_m[2] * rho_m.sum(); auto Pview = P.getView(); auto qView = q.getView(); diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 5e63bba8f..9ff279c18 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -184,24 +184,88 @@ class ChargedParticlesPIF : public ippl::ParticleBase { } - void dumpEnergy(size_type totalP) { + void dumpEnergy(size_type /*totalP*/) { - auto Eview = E.getView(); double potentialEnergy, kineticEnergy; double temp = 0.0; - Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = dot(Eview(i), Eview(i)).apply(); - valL += myVal; - }, Kokkos::Sum(temp)); + //auto Eview = E.getView(); + //Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = dot(Eview(i), Eview(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(temp)); + + + + auto rhoview = rho_m.getView(); + const int nghost = rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rho_m.getLayout(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Potential energy", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& valL) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * iVec[d]; + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + double myVal = 0.0; + for(size_t d = 0; d < Dim; ++d) { + if(Dr != 0.0) { + Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + } + myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } + + //double myVal = rhoview(i,j,k).real() * rhoview(i,j,k).real() + + // rhoview(i,j,k).imag() * rhoview(i,j,k).imag(); + //if(Dr != 0.0) { + // myVal /= Dr; + //} + //else { + // myVal = 0.0; + //} + valL += myVal; + + }, Kokkos::Sum(temp)); + double globaltemp = 0.0; MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - potentialEnergy = 0.5 * globaltemp * volume / totalP ; - + //potentialEnergy = 0.5 * globaltemp * volume / totalP ; + potentialEnergy = 0.25 * 0.5 * globaltemp * volume; auto Pview = P.getView(); auto qView = 
q.getView(); From f79439d542a8d7bfb5d04c8b1318cbff143b19f8 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 5 Dec 2022 08:52:01 +0100 Subject: [PATCH 011/117] PinT directory created and necessary files copied and renamed --- alpine/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/alpine/CMakeLists.txt b/alpine/CMakeLists.txt index 3a6d622c5..a3884ca14 100644 --- a/alpine/CMakeLists.txt +++ b/alpine/CMakeLists.txt @@ -16,6 +16,7 @@ endmacro() add_subdirectory (ElectrostaticPIC) add_subdirectory (ElectrostaticPIF) +add_subdirectory (PinT) # vi: set et ts=4 sw=4 sts=4: From 2890a1d3221522da9b1f26272959cf86cff3149e Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 5 Dec 2022 12:26:11 +0100 Subject: [PATCH 012/117] PIF and PIC integrators made. Need to write parareal now. --- alpine/ElectrostaticPIC/LandauDamping.cpp | 3 ++- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/alpine/ElectrostaticPIC/LandauDamping.cpp b/alpine/ElectrostaticPIC/LandauDamping.cpp index 85448c342..591dff32f 100644 --- a/alpine/ElectrostaticPIC/LandauDamping.cpp +++ b/alpine/ElectrostaticPIC/LandauDamping.cpp @@ -1,6 +1,6 @@ // Landau Damping Test // Usage: -// srun ./LandauDamping --info 10 +// srun ./LandauDamping
--info 10 // nx = No. cell-centered points in the x-direction // ny = No. cell-centered points in the y-direction // nz = No. cell-centered points in the z-direction @@ -13,6 +13,7 @@ // simulations. // ovfactor = Over-allocation factor for the buffers used in the communication. Typical // values are 1.0, 2.0. Value 1.0 means no over-allocation. +// dt = Time stepsize // Example: // srun ./LandauDamping 128 128 128 10000 10 FFT 0.01 2.0 --info 10 // diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 8db6b9dce..93e9e7796 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -1,13 +1,14 @@ // Electrostatic Landau damping test with Particle-in-Fourier schemes // Usage: -// srun ./LandauDampingPIF --info 10 +// srun ./LandauDampingPIF
--info 5 // nx = No. of Fourier modes in the x-direction // ny = No. of Fourier modes in the y-direction // nz = No. of Fourier modes in the z-direction // Np = Total no. of macro-particles in the simulation // Nt = Number of time steps +// dt = Time stepsize // Example: -// srun ./LandauDampingPIF 128 128 128 10000 10 --info 10 +// srun ./LandauDampingPIF 128 128 128 10000 10 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Paul Scherrer Institut, Villigen PSI, Switzerland From 4a9fa74d8309099fb62d4148aff6ce54dcfea56a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 5 Dec 2022 16:19:03 +0100 Subject: [PATCH 013/117] In the middle of MPI send and receive --- src/Particle/ParticleAttrib.h | 74 ----------------------------------- 1 file changed, 74 deletions(-) diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 480e8e5ed..bcfde8d3c 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -32,80 +32,6 @@ #include "Expression/IpplExpressions.h" #include "Particle/ParticleAttribBase.h" - -//namespace sample { // namespace helps with name resolution in reduction identity -// template< typename T, int N0, int N1, int N2 > -// struct array_type { -// -// using view_type = typename ippl::detail::ViewType::view_type; -// view_type viewTemp{"viewLocal",N0,N1,N2}; -// using mdrange_type3 = Kokkos::MDRangePolicy>; -// -// //KOKKOS_INLINE_FUNCTION // Default constructor - Initialize to 0's -// array_type() { -// Kokkos::deep_copy(viewTemp, 0.0); -// //Kokkos::parallel_for( -// // "array_type default constructor", -// // mdrange_type3({0, 0, 0}, -// // {viewTemp.extent(0), -// // viewTemp.extent(1), -// // viewTemp.extent(2)}), -// // KOKKOS_CLASS_LAMBDA(const size_t i, -// // const size_t j, -// // const size_t k) -// // { -// // viewTemp(i,j,k) = 0.0; -// // }); -// } -// //KOKKOS_INLINE_FUNCTION // Copy Constructor -// array_type(const array_type & rhs) { -// auto rhsView = rhs.viewTemp; -// Kokkos::deep_copy(viewTemp, rhsView); -// //Kokkos::parallel_for( -// // "array_type copy constructor", -// // mdrange_type3({0, 0, 0}, -// // {viewTemp.extent(0), -// // viewTemp.extent(1), -// // viewTemp.extent(2)}), -// // KOKKOS_CLASS_LAMBDA(const size_t i, -// // const size_t j, -// // const size_t k) -// // { -// // viewTemp(i,j,k) = rhsView(i,j,k); -// // }); -// -// } -// KOKKOS_FUNCTION // add operator -// array_type& operator+=(const array_type& src) { -// auto srcView = src.viewTemp; -// Kokkos::parallel_for( -// "array_type operator +=", -// mdrange_type3({0, 0, 0}, -// {viewTemp.extent(0), -// viewTemp.extent(1), -// viewTemp.extent(2)}), -// KOKKOS_CLASS_LAMBDA(const size_t i, -// const size_t j, -// const size_t k) -// { -// viewTemp(i,j,k) += srcView(i,j,k); -// }); -// -// return *this; -// } -// }; -// typedef array_type,34,34,34> ValueType; -//} -// -//namespace Kokkos { //reduction identity must be defined in Kokkos namespace -// template<> -// struct reduction_identity< sample::ValueType > { -// KOKKOS_FORCEINLINE_FUNCTION static sample::ValueType sum() { -// return sample::ValueType(); -// } -// }; -//} - namespace Kokkos { //reduction identity must be defined in Kokkos namespace template<> struct reduction_identity< ippl::Vector > { From 6888fed70bc6cdfdd115cd545b518581a5605ec8 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 6 Dec 2022 14:32:53 +0100 Subject: [PATCH 014/117] PinT directory and files added in alpine --- alpine/PinT/CMakeLists.txt | 26 +++ 
alpine/PinT/ChargedParticlesPinT.hpp | 320 +++++++++++++++++++++++++++ alpine/PinT/LandauDampingPinT.cpp | 315 ++++++++++++++++++++++++++ alpine/PinT/LeapFrogPIC.cpp | 60 +++++ alpine/PinT/LeapFrogPIF.cpp | 56 +++++ 5 files changed, 777 insertions(+) create mode 100644 alpine/PinT/CMakeLists.txt create mode 100644 alpine/PinT/ChargedParticlesPinT.hpp create mode 100644 alpine/PinT/LandauDampingPinT.cpp create mode 100644 alpine/PinT/LeapFrogPIC.cpp create mode 100644 alpine/PinT/LeapFrogPIF.cpp diff --git a/alpine/PinT/CMakeLists.txt b/alpine/PinT/CMakeLists.txt new file mode 100644 index 000000000..f73338484 --- /dev/null +++ b/alpine/PinT/CMakeLists.txt @@ -0,0 +1,26 @@ +file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") +message (STATUS "Adding index test found in ${_relPath}") + +include_directories ( + ${CMAKE_SOURCE_DIR}/src +) + +link_directories ( + ${CMAKE_CURRENT_SOURCE_DIR} + ${Kokkos_DIR}/.. +) + +set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES}) +set (COMPILE_FLAGS ${OPAL_CXX_FLAGS}) + +add_executable (LandauDampingPinT LandauDampingPinT.cpp) +target_link_libraries (LandauDampingPinT ${IPPL_LIBS}) + +# vi: set et ts=4 sw=4 sts=4: + +# Local Variables: +# mode: cmake +# cmake-tab-width: 4 +# indent-tabs-mode: nil +# require-final-newline: nil +# End: diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp new file mode 100644 index 000000000..98a80618d --- /dev/null +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -0,0 +1,320 @@ +// ChargedParticlesPinT header file +// Defines a particle attribute for charged particles to be used in +// test programs +// +// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . 
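The spectral potential-energy diagnostic introduced in PATCH 010 above, and reused by the ChargedParticlesPinT class that follows, reconstructs the electric field mode by mode from the Fourier-space charge density and sums |E(k)|^2. A minimal one-dimensional sketch of that reduction is given below; it assumes the same index-to-wavenumber shift and the 0.25 * 0.5 prefactor that appear in the code, and every name in it is illustrative rather than part of the patch.

#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

// Sketch only (assumed notation): given Fourier coefficients rhoHat[i] of the
// charge density on a periodic box of length L, build E(k) = -i*k*rhoHat(k)/|k|^2
// and accumulate 0.25 * 0.5 * V * sum_k |E(k)|^2, as the dumpEnergy reductions do.
double potentialEnergySketch1D(const std::vector<std::complex<double>>& rhoHat, double L) {
    const double pi = std::acos(-1.0);
    const std::complex<double> imag{0.0, 1.0};
    const int N = static_cast<int>(rhoHat.size());
    double sum = 0.0;
    for (int i = 0; i < N; ++i) {
        const bool shift = (i > N / 2);              // map the bin index to a signed wavenumber
        const double k = 2.0 * pi / L * (i - shift * N);
        if (k == 0.0) continue;                      // the zero mode carries no field
        const std::complex<double> Ek = -(imag * k * rhoHat[i]) / (k * k);
        sum += std::norm(Ek);                        // |E(k)|^2
    }
    return 0.25 * 0.5 * sum * L;                     // the "volume" is just L in one dimension
}

int main() {
    std::vector<std::complex<double>> rhoHat(8, {0.0, 0.0});
    rhoHat[1] = {0.1, 0.0};                          // one excited mode, purely illustrative
    rhoHat[7] = {0.1, 0.0};                          // and its conjugate partner
    std::printf("W_pot (sketch) = %.6e\n", potentialEnergySketch1D(rhoHat, 4.0 * std::acos(-1.0)));
    return 0;
}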
+// + +#include "Ippl.h" + +// dimension of our positions +constexpr unsigned Dim = 3; + +// some typedefs +typedef ippl::ParticleSpatialLayout PLayout_t; +typedef ippl::UniformCartesian Mesh_t; +typedef ippl::FieldLayout FieldLayout_t; + +using size_type = ippl::detail::size_type; + +template +using Vector = ippl::Vector; + +template +using Field = ippl::Field; + +template +using ParticleAttrib = ippl::ParticleAttrib; + +typedef Vector Vector_t; +typedef Field Field_t; +typedef Field, Dim> CxField_t; +typedef Field VField_t; +typedef ippl::FFTPeriodicPoissonSolver Solver_t; + +const double pi = std::acos(-1.0); + +// Test programs have to define this variable for VTK dump purposes +extern const char* TestName; + +template +class ChargedParticlesPinT : public ippl::ParticleBase { +public: + CxField_t rhoPIF_m; + Field_t rhoPIC_m; + VField_t EfieldPIC_m; + + Vector nr_m; + + ippl::e_dim_tag decomp_m[Dim]; + + Vector_t hr_m; + Vector_t rmin_m; + Vector_t rmax_m; + + double Q_m; + + double time_m; + + double rhoNorm_m; + + +public: + ParticleAttrib q; // charge + typename ippl::ParticleBase::particle_position_type P; // particle velocity + typename ippl::ParticleBase::particle_position_type E; // electric field at particle position + + + typename ippl::ParticleBase::particle_position_type R0; // Initial particle positions at t=0 + typename ippl::ParticleBase::particle_position_type P0; // Initial particle velocities at t=0 + + typename ippl::ParticleBase::particle_position_type Rend; // Particle positions at end of each time slice + typename ippl::ParticleBase::particle_position_type Pend; // Particle velocities at end of each time slice + + typename ippl::ParticleBase::particle_position_type GR; // G(R^(k-1)_n) + typename ippl::ParticleBase::particle_position_type GP; // G(P^(k-1)_n) + + ChargedParticlesPinT(PLayout& pl, + Vector_t hr, + Vector_t rmin, + Vector_t rmax, + ippl::e_dim_tag decomp[Dim], + double Q) + : ippl::ParticleBase(pl) + , hr_m(hr) + , rmin_m(rmin) + , rmax_m(rmax) + , Q_m(Q) + { + // register the particle attributes + this->addAttribute(q); + this->addAttribute(P); + this->addAttribute(E); + this->addAttribute(R0); + this->addAttribute(P0); + this->addAttribute(Rend); + this->addAttribute(Pend); + this->addAttribute(GR); + this->addAttribute(GP); + setupBCs(); + for (unsigned int i = 0; i < Dim; i++) + decomp_m[i]=decomp[i]; + } + + ~ChargedParticlesPinT(){ } + + void setupBCs() { + setBCAllPeriodic(); + } + + + void initFFTSolver() { + ippl::ParameterList sp; + sp.add("output_type", Solver_t::GRAD); + sp.add("use_heffte_defaults", false); + sp.add("use_pencils", true); + sp.add("use_reorder", false); + sp.add("use_gpu_aware", true); + sp.add("comm", ippl::p2p_pl); + sp.add("r2c_direction", 0); + + solver_mp = std::make_shared(); + + solver_mp->mergeParameters(sp); + + solver_mp->setRhs(rhoPIC_m); + + solver_mp->setLhs(EfieldPIC_m); + } + + + void dumpLandau(size_type totalP) { + + auto Eview = E.getView(); + + double fieldEnergy, ExAmp; + double temp = 0.0; + + Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = Eview(i)[0] * Eview(i)[0]; + valL += myVal; + }, Kokkos::Sum(temp)); + + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + fieldEnergy = globaltemp * volume / totalP ; + + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", 
this->getLocalNum(), + KOKKOS_LAMBDA(const size_t i, double& valL) + { + double myVal = std::fabs(Eview(i)[0]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + ExAmp = 0.0; + MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } + + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; + + } + + Ippl::Comm->barrier(); + } + + + void dumpEnergy(size_type /*totalP*/) { + + + double potentialEnergy, kineticEnergy; + double temp = 0.0; + + + auto rhoview = rho_m.getView(); + const int nghost = rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rho_m.getLayout(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Potential energy", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& valL) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * iVec[d]; + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + double myVal = 0.0; + for(size_t d = 0; d < Dim; ++d) { + if(Dr != 0.0) { + Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + } + myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } + + //double myVal = rhoview(i,j,k).real() * rhoview(i,j,k).real() + + // rhoview(i,j,k).imag() * rhoview(i,j,k).imag(); + //if(Dr != 0.0) { + // myVal /= Dr; + //} + //else { + // myVal = 0.0; + //} + valL += myVal; + + }, Kokkos::Sum(temp)); + + + double globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //potentialEnergy = 0.5 * globaltemp * volume / totalP ; + potentialEnergy = 0.25 * 0.5 * globaltemp * volume; + + auto Pview = P.getView(); + auto qView = q.getView(); + + temp = 0.0; + + Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Pview(i), Pview(i)).apply(); + myVal *= -qView(i); + valL += myVal; + }, Kokkos::Sum(temp)); + + temp *= 0.5; + globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + kineticEnergy = globaltemp; + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/Energy_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + } + + csvout << time_m << " " + << potentialEnergy << " " + << kineticEnergy << " " + << potentialEnergy 
+ kineticEnergy << endl; + + } + + Ippl::Comm->barrier(); + } + +private: + void setBCAllPeriodic() { + + this->setParticleBC(ippl::BC::PERIODIC); + } + +}; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp new file mode 100644 index 000000000..5e2694088 --- /dev/null +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -0,0 +1,315 @@ +// Parallel-in-time (PinT) method Parareal combined with Particle-in-cell +// and Particle-in-Fourier schemes. The example is electrostatic Landau +// damping. The implementation of Parareal follows the open source implementation +// https://github.com/Parallel-in-Time/PararealF90 by Daniel Ruprecht. The corresponding +// publication is Ruprecht, Daniel. "Shared memory pipelined parareal." +// European Conference on Parallel Processing. Springer, Cham, 2017. +// +// Usage: +// srun ./LandauDampingPinT --info 5 +// nx = No. of Fourier modes in the x-direction +// ny = No. of Fourier modes in the y-direction +// nz = No. of Fourier modes in the z-direction +// Np = Total no. of macro-particles in the simulation +// Example: +// srun ./LandauDampingPinT 128 128 128 10000 20 0.05 0.05 1e-5 100 --info 5 +// +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, +// Jülich Supercomputing Centre, Jülich, Germany. +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . +// + +#include "ChargedParticlesPinT.hpp" +#include "LeapFrogPIC.cpp" +#include "LeapFrogPIF.cpp" +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "Utility/IpplTimings.h" + +template +struct Newton1D { + + double tol = 1e-12; + int max_iter = 20; + double pi = std::acos(-1.0); + + T k, alpha, u; + + KOKKOS_INLINE_FUNCTION + Newton1D() {} + + KOKKOS_INLINE_FUNCTION + Newton1D(const T& k_, const T& alpha_, + const T& u_) + : k(k_), alpha(alpha_), u(u_) {} + + KOKKOS_INLINE_FUNCTION + ~Newton1D() {} + + KOKKOS_INLINE_FUNCTION + T f(T& x) { + T F; + F = x + (alpha * (std::sin(k * x) / k)) - u; + return F; + } + + KOKKOS_INLINE_FUNCTION + T fprime(T& x) { + T Fprime; + Fprime = 1 + (alpha * std::cos(k * x)); + return Fprime; + } + + KOKKOS_FUNCTION + void solve(T& x) { + int iterations = 0; + while (iterations < max_iter && std::fabs(f(x)) > tol) { + x = x - (f(x)/fprime(x)); + iterations += 1; + } + } +}; + + +template +struct generate_random { + + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + // Output View for the random numbers + view_type x, v; + + // The GeneratorPool + GeneratorPool rand_pool; + + value_type alpha; + + T k, minU, maxU; + + // Initialize all members + generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, + value_type& alpha_, T& k_, T& minU_, T& maxU_) + : x(x_), v(v_), rand_pool(rand_pool_), + alpha(alpha_), k(k_), minU(minU_), maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + value_type u; + for (unsigned d = 0; d < Dim; ++d) { + + u = rand_gen.drand(minU[d], maxU[d]); + x(i)[d] = u / (1 + alpha); + 
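// A standalone sketch (not part of this patch) of the inversion performed by the
// Newton1D functor above: solve u = x + alpha*sin(k*x)/k for x by Newton's method,
// so that uniform u samples give the perturbed Landau-damping positions. Only the
// formula is taken from the code; the driver and names below are illustrative.

#include <cmath>
#include <cstdio>

double invertLandauCDF(double u, double alpha, double k,
                       double tol = 1e-12, int maxIter = 20) {
    double x = u / (1.0 + alpha);                  // same starting guess as generate_random
    for (int i = 0; i < maxIter; ++i) {
        double f = x + alpha * std::sin(k * x) / k - u;
        if (std::fabs(f) <= tol) break;
        double fprime = 1.0 + alpha * std::cos(k * x);
        x -= f / fprime;                           // Newton update
    }
    return x;
}

int main() {
    // alpha = 0.05 and k = 0.5, as in the Landau-damping setup of this file
    std::printf("x(u=3.0) = %.12f\n", invertLandauCDF(3.0, 0.05, 0.5));
    return 0;
}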
Newton1D solver(k[d], alpha, u); + solver.solve(x(i)[d]); + v(i)[d] = rand_gen.normal(0.0, 1.0); + } + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +double CDF(const double& x, const double& alpha, const double& k) { + double cdf = x + (alpha / k) * std::sin(k * x); + return cdf; +} + +KOKKOS_FUNCTION +double PDF(const Vector_t& xvec, const double& alpha, + const Vector_t& kw, const unsigned Dim) { + double pdf = 1.0; + + for (unsigned d = 0; d < Dim; ++d) { + pdf *= (1.0 + alpha * std::cos(kw[d] * xvec[d])); + } + return pdf; +} + +const char* TestName = "LandauDampingPinT"; + +int main(int argc, char *argv[]){ + Ippl ippl(argc, argv); + + Inform msg("LandauDampingPinT"); + Inform msg2all("LandauDampingPinT",INFORM_ALL_NODES); + + ippl::Vector nr = { + std::atoi(argv[1]), + std::atoi(argv[2]), + std::atoi(argv[3]) + }; + + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); + static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); + static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData"); + + IpplTimings::startTimer(mainTimer); + + const size_type totalP = std::atoll(argv[4]); + const double tEnd = std::atof(argv[5]); + const double dtSlice = tEnd / Ippl::Comm->size(); + const double dtFine = std::atof(argv[6]); + const double dtCoarse = std::atof(argv[7]); + const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); + const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); + const double tol = std::atof(argv[8]); + const unsigned int maxIter = std::atoi(argv[9]); + + const double tStartMySlice = Ippl::Comm->rank() * dtSlice; + const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; + + msg << "Parareal Landau damping" + << endl + << "Slice dT: " << dtSlice + << "No. of fine time steps: " << ntFine + << "No. of coarse time steps: " << ntCoarse + << endl + << "Tolerance: " << tol + << "Max. 
iterations: " << maxIter + << endl + << " Np= " + << totalP << " Fourier modes = " << nr + << endl; + + using bunch_type = ChargedParticlesPinT; + + std::unique_ptr P; + + ippl::NDIndex domain; + for (unsigned i = 0; i< Dim; i++) { + domain[i] = ippl::Index(nr[i]); + } + + ippl::e_dim_tag decomp[Dim]; + for (unsigned d = 0; d < Dim; ++d) { + decomp[d] = ippl::SERIAL; + } + + // create mesh and layout objects for this problem domain + Vector_t kw = {0.5, 0.5, 0.5}; + double alpha = 0.05; + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw ; + double dx = rmax[0] / nr[0]; + double dy = rmax[1] / nr[1]; + double dz = rmax[2] / nr[2]; + + Vector_t hr = {dx, dy, dz}; + Vector_t origin = {rmin[0], rmin[1], rmin[2]}; + + const bool isAllPeriodic=true; + Mesh_t mesh(domain, hr, origin); + FieldLayout_t FL(domain, decomp, isAllPeriodic); + PLayout_t PL(FL, mesh); + + //Q = -\int\int f dx dv + double Q = -rmax[0] * rmax[1] * rmax[2]; + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q); + + P->nr_m = nr; + + P->rhoPIF_m.initialize(mesh, FL); + P->rhoPIC_m.initialize(mesh, FL); + P->EfieldPIC_m.initialize(mesh, FL); + + P->initFFTSolver(); + P->time_m = 0.0; + + IpplTimings::startTimer(particleCreation); + + Vector_t minU, maxU; + for (unsigned d = 0; d create(nloc); + //Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + P->R.getView(), P->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + + Kokkos::fence(); + Ippl::Comm->barrier(); + IpplTimings::stopTimer(particleCreation); + + P->q = P->Q_m/totalP; + msg << "particles created and initial conditions assigned " << endl; + + //Copy initial conditions as they are needed later + Kokkos::deep_copy(P->R0.getView(), P->R.getView()); + Kokkos::deep_copy(P->P0.getView(), P->P.getView()); + + P->scatter(P->q, P->rhoPIC_m, P->R); + P->rhoPIC_m = P->rhoPIC_m / (hr[0] * hr[1] * hr[2]); + + P->rhoPIC_m = P->rhoPIC_m - (P->Q_m/((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2]))); + + P->solver_mp->solve(); + + P->gather(P->E, P->EfieldPIC_m, P->R); + + //Get initial guess for ranks other than 0 by propagating the coarse solver + if (Ippl::Comm->rank() > 0) { + LeapFrogPIC(*P, P->R, P->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); + } + + Ippl::Comm->barrier(); + + Kokkos::deep_copy(P->GR.getView(), P->R.getView()); + Kokkos::deep_copy(P->GP.getView(), P->P.getView()); + + //Run the coarse integrator to get the values at the end of the time slice + LeapFrogPIC(*P, P->GR, P->GP, ntCoarse, dtCoarse); + + + msg << "Starting parareal iterations ..." << endl; + for (unsigned int it=0; itR, P->P, ntFine, dtFine); + + //Difference = Fine - Coarse + P->Rend = P->R - P->GR; + P->Pend = P->P - P->GP; + + if(Ippl::Comm-> rank() > 0) { + + MPI_Recv(P->R.getView().data(), nloc, + MPI_BYTE, src, tag, comm_m, &status); + + + msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; + } + + msg << "LandauDamping: End." << endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + + return 0; +} diff --git a/alpine/PinT/LeapFrogPIC.cpp b/alpine/PinT/LeapFrogPIC.cpp new file mode 100644 index 000000000..0de516a80 --- /dev/null +++ b/alpine/PinT/LeapFrogPIC.cpp @@ -0,0 +1,60 @@ +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, +// Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. 
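LandauDampingPinT.cpp above applies the classic parareal correction: with a cheap coarse propagator G (LeapFrogPIC, large time step) and an accurate fine propagator F (LeapFrogPIF, small time step), each iteration updates the slice-boundary state as U_{n+1}^{k+1} = G(U_n^{k+1}) + F(U_n^k) - G(U_n^k). A serial toy sketch of that update on a scalar ODE follows; the propagators and all names in it are stand-ins, not the ones from this patch.

#include <cmath>
#include <cstdio>
#include <vector>

// Parareal on y' = lambda*y: coarse propagator G = one Euler step per slice,
// fine propagator F = many Euler steps per slice. Purely illustrative.
int main() {
    const double lambda = -1.0, T = 2.0;
    const int nSlices = 8, nFine = 100, maxIter = 5;
    const double dtSlice = T / nSlices;

    auto G = [&](double y) { return y * (1.0 + lambda * dtSlice); };
    auto F = [&](double y) {
        const double dt = dtSlice / nFine;
        for (int s = 0; s < nFine; ++s) y *= (1.0 + lambda * dt);
        return y;
    };

    // Initial guess at the slice boundaries from the coarse propagator alone.
    std::vector<double> U(nSlices + 1, 1.0);
    for (int n = 0; n < nSlices; ++n) U[n + 1] = G(U[n]);

    for (int k = 0; k < maxIter; ++k) {
        std::vector<double> Uold = U;          // states from the previous iteration
        for (int n = 0; n < nSlices; ++n) {
            // U^{k+1}_{n+1} = G(U^{k+1}_n) + F(U^k_n) - G(U^k_n)
            U[n + 1] = G(U[n]) + F(Uold[n]) - G(Uold[n]);
        }
        std::printf("iteration %d: error at T = %.3e\n", k + 1,
                    std::fabs(U[nSlices] - std::exp(lambda * T)));
    }
    return 0;
}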
+// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . +// + +//#include "ChargedParticlesPinT.hpp" + +void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, const unsigned int nt, + const double dt) { + + const auto& PL = P.getLayout(); + + const auto& hr = P.hr_m; + const auto& rmax = P.rmax_m; + const auto& rmin = P.rmin_m; + for (unsigned int it=0; itsolve(); + + // gather E field + P.gather(P.E, P.EfieldPIC_m, Rtemp); + + //kick + Ptemp = Ptemp - 0.5 * dt * P.E; + } + +} diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp new file mode 100644 index 000000000..399ffb1f6 --- /dev/null +++ b/alpine/PinT/LeapFrogPIF.cpp @@ -0,0 +1,56 @@ +// +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, +// Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . +// + +//#include "ChargedParticlesPinT.hpp" + +void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, const unsigned int nt, + const double dt) { + + const auto& PL = P.getLayout(); + const auto& rmax = P.rmax_m; + const auto& rmin = P.rmin_m; + + for (unsigned int it=0; it Date: Tue, 6 Dec 2022 15:36:05 +0100 Subject: [PATCH 015/117] additional classes created and code modified.
Need to do MPI send/recv --- alpine/PinT/ChargedParticlesPinT.hpp | 12 +---- alpine/PinT/LandauDampingPinT.cpp | 80 ++++++++++++++++++---------- alpine/PinT/StatesBeginSlice.hpp | 31 +++++++++++ alpine/PinT/StatesEndSlice.hpp | 31 +++++++++++ 4 files changed, 116 insertions(+), 38 deletions(-) create mode 100644 alpine/PinT/StatesBeginSlice.hpp create mode 100644 alpine/PinT/StatesEndSlice.hpp diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 98a80618d..cfc53f50b 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -72,18 +72,12 @@ class ChargedParticlesPinT : public ippl::ParticleBase { public: ParticleAttrib q; // charge - typename ippl::ParticleBase::particle_position_type P; // particle velocity + typename ippl::ParticleBase::particle_position_type P; // G(P^(k-1)_n) typename ippl::ParticleBase::particle_position_type E; // electric field at particle position - typename ippl::ParticleBase::particle_position_type R0; // Initial particle positions at t=0 typename ippl::ParticleBase::particle_position_type P0; // Initial particle velocities at t=0 - typename ippl::ParticleBase::particle_position_type Rend; // Particle positions at end of each time slice - typename ippl::ParticleBase::particle_position_type Pend; // Particle velocities at end of each time slice - - typename ippl::ParticleBase::particle_position_type GR; // G(R^(k-1)_n) - typename ippl::ParticleBase::particle_position_type GP; // G(P^(k-1)_n) ChargedParticlesPinT(PLayout& pl, Vector_t hr, @@ -103,10 +97,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { this->addAttribute(E); this->addAttribute(R0); this->addAttribute(P0); - this->addAttribute(Rend); - this->addAttribute(Pend); - this->addAttribute(GR); - this->addAttribute(GP); setupBCs(); for (unsigned int i = 0; i < Dim; i++) decomp_m[i]=decomp[i]; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 5e2694088..78c3a2c95 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -30,6 +30,8 @@ // #include "ChargedParticlesPinT.hpp" +#include "StatesBeginSlice.hpp" +#include "StatesEndSlice.hpp" #include "LeapFrogPIC.cpp" #include "LeapFrogPIF.cpp" #include @@ -193,8 +195,12 @@ int main(int argc, char *argv[]){ << endl; using bunch_type = ChargedParticlesPinT; + using states_begin_type = StatesBeginSlice; + using states_end_type = StatesEndSlice; - std::unique_ptr P; + std::unique_ptr Pcoarse; + std::unique_ptr Pbegin; + std::unique_ptr Pend; ippl::NDIndex domain; for (unsigned i = 0; i< Dim; i++) { @@ -225,16 +231,18 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -rmax[0] * rmax[1] * rmax[2]; - P = std::make_unique(PL,hr,rmin,rmax,decomp,Q); + Pcoarse = std::make_unique(PL,hr,rmin,rmax,decomp,Q); + Pbegin = std::make_unique(PL); + Pend = std::make_unique(PL); - P->nr_m = nr; + Pcoarse->nr_m = nr; - P->rhoPIF_m.initialize(mesh, FL); - P->rhoPIC_m.initialize(mesh, FL); - P->EfieldPIC_m.initialize(mesh, FL); + Pcoarse->rhoPIF_m.initialize(mesh, FL); + Pcoarse->rhoPIC_m.initialize(mesh, FL); + Pcoarse->EfieldPIC_m.initialize(mesh, FL); - P->initFFTSolver(); - P->time_m = 0.0; + Pcoarse->initFFTSolver(); + Pcoarse->time_m = 0.0; IpplTimings::startTimer(particleCreation); @@ -246,64 +254,82 @@ int main(int argc, char *argv[]){ size_type nloc = totalP; - P->create(nloc); + Pcoarse->create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); //Kokkos::Random_XorShift64_Pool<> 
rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( - P->R.getView(), P->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU)); Kokkos::fence(); Ippl::Comm->barrier(); IpplTimings::stopTimer(particleCreation); - P->q = P->Q_m/totalP; + Pcoarse->q = Pcoarse->Q_m/totalP; msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later - Kokkos::deep_copy(P->R0.getView(), P->R.getView()); - Kokkos::deep_copy(P->P0.getView(), P->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - P->scatter(P->q, P->rhoPIC_m, P->R); - P->rhoPIC_m = P->rhoPIC_m / (hr[0] * hr[1] * hr[2]); + Pcoarse->rhoPIC_m = 0.0; + Pcoarse->scatter(Pcoarse->q, Pcoarse->rhoPIC_m, Pcoarse->R); + Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m / (hr[0] * hr[1] * hr[2]); - P->rhoPIC_m = P->rhoPIC_m - (P->Q_m/((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2]))); + Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m - (Pcoarse->Q_m/((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2]))); - P->solver_mp->solve(); + Pcoarse->solver_mp->solve(); - P->gather(P->E, P->EfieldPIC_m, P->R); + Pcoarse->gather(Pcoarse->E, Pcoarse->EfieldPIC_m, Pcoarse->R); //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { - LeapFrogPIC(*P, P->R, P->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); + LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); } Ippl::Comm->barrier(); - Kokkos::deep_copy(P->GR.getView(), P->R.getView()); - Kokkos::deep_copy(P->GP.getView(), P->P.getView()); + + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + + //Compute initial E fields corresponding to fine integrator + Pcoarse->rhoPIF_m = {0.0, 0.0}; + Pcoarse->scatterPIF(Pcoarse->q, Pcoarse->rhoPIF_m, Pcoarse->R); + + Pcoarse->rhoPIF_m = Pcoarse->rhoPIF_m / + ((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2])); + + Pcoarse->gatherPIF(Pcoarse->E, Pcoarse->rhoPIF_m, Pcoarse->R); + //Run the coarse integrator to get the values at the end of the time slice - LeapFrogPIC(*P, P->GR, P->GP, ntCoarse, dtCoarse); + LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); + + //The following might not be needed + Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); msg << "Starting parareal iterations ..." << endl; for (unsigned int it=0; itR, P->P, ntFine, dtFine); + LeapFrogPIF(*Pcoarse, Pbegin->R, Pbegin->P, ntFine, dtFine); //Difference = Fine - Coarse - P->Rend = P->R - P->GR; - P->Pend = P->P - P->GP; + Pend->R = Pbegin->R - Pcoarse->R; + Pend->P = Pbegin->P - Pcoarse->P; if(Ippl::Comm-> rank() > 0) { - MPI_Recv(P->R.getView().data(), nloc, + MPI_Recv(Pcoarse->R.getView().data(), nloc, MPI_BYTE, src, tag, comm_m, &status); - msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; + msg << "Finished iteration: " << it+1 << endl; } msg << "LandauDamping: End." 
<< endl; diff --git a/alpine/PinT/StatesBeginSlice.hpp b/alpine/PinT/StatesBeginSlice.hpp new file mode 100644 index 000000000..621e88038 --- /dev/null +++ b/alpine/PinT/StatesBeginSlice.hpp @@ -0,0 +1,31 @@ +// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . +// + + +template +class StatesBeginSlice : public ippl::ParticleBase { + +public: + typename ippl::ParticleBase::particle_position_type P; + + StatesBeginSlice(PLayout& pl) + : ippl::ParticleBase(pl) + { + // register the particle attributes + this->addAttribute(P); + } + + ~StatesBeginSlice(){ } + +}; diff --git a/alpine/PinT/StatesEndSlice.hpp b/alpine/PinT/StatesEndSlice.hpp new file mode 100644 index 000000000..6b69996a1 --- /dev/null +++ b/alpine/PinT/StatesEndSlice.hpp @@ -0,0 +1,31 @@ +// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . +// + + +template +class StatesEndSlice : public ippl::ParticleBase { + +public: + typename ippl::ParticleBase::particle_position_type P; + + StatesEndSlice(PLayout& pl) + : ippl::ParticleBase(pl) + { + // register the particle attributes + this->addAttribute(P); + } + + ~StatesEndSlice(){ } + +}; From e4223cadebc8d85e9544dbe1027c1523cf6f187f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 6 Dec 2022 21:55:13 +0100 Subject: [PATCH 016/117] Parareal almost completed. 
Need to do convergence check and data writing --- alpine/PinT/LandauDampingPinT.cpp | 41 ++++++++++++++++++++++++++----- src/Communicate/Tags.h | 34 +++---------------------- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 78c3a2c95..bc959ee0b 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -257,7 +257,6 @@ int main(int argc, char *argv[]){ Pcoarse->create(nloc); Pbegin->create(nloc); Pend->create(nloc); - //Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( @@ -278,7 +277,8 @@ int main(int argc, char *argv[]){ Pcoarse->scatter(Pcoarse->q, Pcoarse->rhoPIC_m, Pcoarse->R); Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m / (hr[0] * hr[1] * hr[2]); - Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m - (Pcoarse->Q_m/((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2]))); + Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m - + (Pcoarse->Q_m/((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2]))); Pcoarse->solver_mp->solve(); @@ -313,6 +313,7 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + using buffer_type = ippl::Communicate::buffer_type; msg << "Starting parareal iterations ..." << endl; for (unsigned int it=0; itR = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; - if(Ippl::Comm-> rank() > 0) { - MPI_Recv(Pcoarse->R.getView().data(), nloc, - MPI_BYTE, src, tag, comm_m, &status); + int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if(Ippl::Comm->rank() > 0) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + } + else { + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + } + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + + + LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); + + + Pend->R = Pend->R + Pcoarse->R; + Pend->P = Pend->P + Pcoarse->P; + + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } msg << "Finished iteration: " << it+1 << endl; } - msg << "LandauDamping: End." << endl; + msg << "LandauDamping Parareal: End." << endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); diff --git a/src/Communicate/Tags.h b/src/Communicate/Tags.h index 8d6db8bcd..1e07ed717 100644 --- a/src/Communicate/Tags.h +++ b/src/Communicate/Tags.h @@ -26,13 +26,6 @@ #define IPPL_EXIT_TAG 6 // program should exit() -// tags for reduction -#define COMM_REDUCE_SEND_TAG 10000 -#define COMM_REDUCE_RECV_TAG 11000 -#define COMM_REDUCE_SCATTER_TAG 12000 -#define COMM_REDUCE_CYCLE 1000 - - // tag for applying parallel periodic boundary condition. 
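The loop above turns the parareal iteration into a pipeline across ranks: every rank owns one time slice, rank r > 0 blocks on a receive of the corrected begin state from rank r-1, re-runs the coarse propagator, adds the stored fine-minus-coarse difference, and forwards the corrected end state to rank r+1 using the new IPPL_PARAREAL_* tags. A bare-bones MPI sketch of that pattern follows, with hypothetical names and plain doubles in place of the particle buffers.

#include <mpi.h>
#include <cstdio>
#include <vector>

// Illustrative pipeline only: one "time slice" per rank, state flows left to right.
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    std::vector<double> state(4, 0.0);     // stand-in for the particle state (R, P)
    const int tag = 42;

    if (rank > 0) {                        // wait for the corrected begin state
        MPI_Recv(state.data(), static_cast<int>(state.size()), MPI_DOUBLE,
                 rank - 1, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    for (double& x : state) x += 1.0;      // placeholder for G plus the (F - G) correction

    if (rank < size - 1) {                 // forward the corrected end state
        MPI_Send(state.data(), static_cast<int>(state.size()), MPI_DOUBLE,
                 rank + 1, tag, MPI_COMM_WORLD);
    }
    std::printf("rank %d done, state[0] = %.1f\n", rank, state[0]);
    MPI_Finalize();
    return 0;
}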
#define BC_PARALLEL_PERIODIC_TAG 15000 @@ -48,28 +41,6 @@ namespace ippl { } } -#define F_GUARD_CELLS_TAG 20000 // Field::fillGuardCells() -#define F_WRITE_TAG 21000 // Field::write() -#define F_READ_TAG 22000 // Field::read() -#define F_GEN_ASSIGN_TAG 23000 // assign(BareField,BareField) -#define F_REPARTITION_BCAST_TAG 24000 // broadcast in FieldLayout::repartion. -#define F_REDUCE_PERP_TAG 25000 // reduction in binary load balance. -#define F_GETSINGLE_TAG 26000 // IndexedBareField::getsingle() -#define F_REDUCE_TAG 27000 // Reduction in minloc/maxloc -#define F_LAYOUT_IO_TAG 28000 // Reduction in minloc/maxloc -#define F_TAG_CYCLE 1000 - -// // Tags for FieldView and FieldBlock -// #define FV_2D_TAG 30000 // FieldView::update_2D_data() -// #define FV_3D_TAG 31000 // FieldView::update_2D_data() -// #define FV_TAG_CYCLE 1000 -// -// #define FB_WRITE_TAG 32000 // FieldBlock::write() -// #define FB_READ_TAG 33000 // FieldBlock::read() -// #define FB_TAG_CYCLE 1000 -// -// #define FP_GATHER_TAG 34000 // FieldPrint::print() -// #define FP_TAG_CYCLE 1000 // Special tags used by Particle classes for communication. #define P_WEIGHTED_LAYOUT_TAG 50000 @@ -88,7 +59,7 @@ namespace ippl { #define IPPL_TAG_CYCLE 1000 // Tags for Ippl application codes -#define IPPL_APP_TAG0 90000 +#define IPPL_PARAREAL_APP 90000 #define IPPL_APP_TAG1 91000 #define IPPL_APP_TAG2 92000 #define IPPL_APP_TAG3 93000 @@ -128,4 +99,7 @@ namespace ippl { #define OPEN_SOLVER_TAG 18000 #define VICO_SOLVER_TAG 70000 +#define IPPL_PARAREAL_SEND 19000 +#define IPPL_PARAREAL_RECV 21000 + #endif // TAGS_H From fa4c8cec2d224e1ef73303ec00b34a29a8ea3b1f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 7 Dec 2022 11:11:43 +0100 Subject: [PATCH 017/117] Some modifications made and output writing done --- alpine/PinT/ChargedParticlesPinT.hpp | 321 ++++++++++++++------------- alpine/PinT/LandauDampingPinT.cpp | 8 +- alpine/PinT/LeapFrogPIF.cpp | 7 +- src/FFT/FFT.hpp | 2 +- src/Field/BareField.hpp | 18 +- src/Particle/ParticleAttrib.hpp | 12 +- 6 files changed, 199 insertions(+), 169 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index cfc53f50b..d894587c9 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -131,174 +131,179 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void dumpLandau(size_type totalP) { - auto Eview = E.getView(); - - double fieldEnergy, ExAmp; - double temp = 0.0; - - Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = Eview(i)[0] * Eview(i)[0]; - valL += myVal; - }, Kokkos::Sum(temp)); - - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - fieldEnergy = globaltemp * volume / totalP ; - - double tempMax = 0.0; - Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), - KOKKOS_LAMBDA(const size_t i, double& valL) - { - double myVal = std::fabs(Eview(i)[0]); - if(myVal > valL) valL = myVal; - }, Kokkos::Max(tempMax)); - ExAmp = 0.0; - MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/FieldLandau_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); - csvout.setf(std::ios::scientific, 
std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, Ex_field_energy, Ex_max_norm" << endl; - } - - csvout << time_m << " " - << fieldEnergy << " " - << ExAmp << endl; - - } - - Ippl::Comm->barrier(); + auto Eview = E.getView(); + + double fieldEnergy, ExAmp; + double temp = 0.0; + + Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = Eview(i)[0] * Eview(i)[0]; + valL += myVal; + }, Kokkos::Sum(temp)); + + //double globaltemp = 0.0; + double globaltemp = temp; + //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + fieldEnergy = globaltemp * volume / totalP ; + + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), + KOKKOS_LAMBDA(const size_t i, double& valL) + { + double myVal = std::fabs(Eview(i)[0]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + //ExAmp = 0.0; + ExAmp = tempMax; + //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + for (int rank=0; rank < Ippl::Comm->size(); ++rank) { + if(Ippl::Comm->rank() == rank) { + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, rank); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } + + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; + } + Ippl::Comm->barrier(); + } } void dumpEnergy(size_type /*totalP*/) { - double potentialEnergy, kineticEnergy; - double temp = 0.0; + double potentialEnergy, kineticEnergy; + double temp = 0.0; - auto rhoview = rho_m.getView(); - const int nghost = rho_m.getNghost(); - using mdrange_type = Kokkos::MDRangePolicy>; + auto rhoview = rhoPIF_m.getView(); + const int nghost = rhoPIF_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; - const FieldLayout_t& layout = rho_m.getLayout(); - const Mesh_t& mesh = rho_m.get_mesh(); - const Vector& dx = mesh.getMeshSpacing(); - const auto& domain = layout.getDomain(); - Vector Len; - Vector N; - - for (unsigned d=0; d < Dim; ++d) { - N[d] = domain[d].length(); - Len[d] = dx[d] * N[d]; - } - - - Kokkos::complex imag = {0.0, 1.0}; - double pi = std::acos(-1.0); - Kokkos::parallel_reduce("Potential energy", - mdrange_type({0, 0, 0}, - {N[0], - N[1], - N[2]}), - KOKKOS_LAMBDA(const int i, - const int j, - const int k, - double& valL) - { - - Vector iVec = {i, j, k}; - Vector kVec; - double Dr = 0.0; - for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - //kVec[d] = 2 * pi / Len[d] * iVec[d]; - Dr += kVec[d] * kVec[d]; - } - - Kokkos::complex Ek = {0.0, 0.0}; - double myVal = 0.0; - for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) { - Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - } - myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); - } - - //double myVal = rhoview(i,j,k).real() * rhoview(i,j,k).real() + - // rhoview(i,j,k).imag() * rhoview(i,j,k).imag(); - //if(Dr != 0.0) { - // myVal /= Dr; - //} - //else { - // myVal = 0.0; - //} - valL += myVal; - - }, Kokkos::Sum(temp)); - - - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] 
- rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //potentialEnergy = 0.5 * globaltemp * volume / totalP ; - potentialEnergy = 0.25 * 0.5 * globaltemp * volume; - - auto Pview = P.getView(); - auto qView = q.getView(); - - temp = 0.0; - - Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = dot(Pview(i), Pview(i)).apply(); - myVal *= -qView(i); - valL += myVal; - }, Kokkos::Sum(temp)); - - temp *= 0.5; - globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - - kineticEnergy = globaltemp; + const FieldLayout_t& layout = rhoPIF_m.getLayout(); + const Mesh_t& mesh = rhoPIF_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Potential energy", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& valL) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * iVec[d]; + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + double myVal = 0.0; + for(size_t d = 0; d < Dim; ++d) { + if(Dr != 0.0) { + Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + } + myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } + + //double myVal = rhoview(i,j,k).real() * rhoview(i,j,k).real() + + // rhoview(i,j,k).imag() * rhoview(i,j,k).imag(); + //if(Dr != 0.0) { + // myVal /= Dr; + //} + //else { + // myVal = 0.0; + //} + valL += myVal; + + }, Kokkos::Sum(temp)); + + + //double globaltemp = 0.0; + double globaltemp = temp; + //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //potentialEnergy = 0.5 * globaltemp * volume / totalP ; + potentialEnergy = 0.25 * 0.5 * globaltemp * volume; + + auto Pview = P.getView(); + auto qView = q.getView(); + + temp = 0.0; + + Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Pview(i), Pview(i)).apply(); + myVal *= -qView(i); + valL += myVal; + }, Kokkos::Sum(temp)); + + temp *= 0.5; + //globaltemp = 0.0; + globaltemp = temp; + //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + kineticEnergy = globaltemp; + + for (int rank=0; rank < Ippl::Comm->size(); ++rank) { + if(Ippl::Comm->rank() == rank) { + std::stringstream fname; + fname << "data/Energy_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, rank); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + } + + csvout << time_m << " " + << potentialEnergy << " " + << kineticEnergy << " " + << potentialEnergy + kineticEnergy << endl; + } + Ippl::Comm->barrier(); + } - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/Energy_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - 
csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; - } - - csvout << time_m << " " - << potentialEnergy << " " - << kineticEnergy << " " - << potentialEnergy + kineticEnergy << endl; - - } - - Ippl::Comm->barrier(); } private: diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index bc959ee0b..0ab42a8ac 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -315,15 +315,21 @@ int main(int argc, char *argv[]){ using buffer_type = ippl::Communicate::buffer_type; msg << "Starting parareal iterations ..." << endl; + bool isConverged = false; for (unsigned int it=0; itR, Pbegin->P, ntFine, dtFine); + LeapFrogPIF(*Pcoarse, Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged); + + if(isConverged) { + break; + } //Difference = Fine - Coarse Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; + double Rerror = computeL2Error( int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp index 399ffb1f6..19f47f4a7 100644 --- a/alpine/PinT/LeapFrogPIF.cpp +++ b/alpine/PinT/LeapFrogPIF.cpp @@ -18,7 +18,7 @@ void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, - const double dt) { + const double dt, const bool isConverged) { const auto& PL = P.getLayout(); const auto& rmax = P.rmax_m; @@ -52,5 +52,10 @@ void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, //kick Ptemp = Ptemp - 0.5 * dt * P.E; + if(isConverged) { + P.dumpLandau(P.getLocalNum()); + P.dumpEnergy(P.getLocalNum()); + } + } } diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 853651858..b6353f09a 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -309,7 +309,7 @@ namespace ippl { } heffte_m = std::make_shared> - (inbox, outbox, params.get("r2c_direction"), Ippl::getComm(), + (inbox, outbox, params.get("r2c_direction"), MPI_COMM_SELF, heffteOptions); //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp index 685e6a751..36886e86d 100644 --- a/src/Field/BareField.hpp +++ b/src/Field/BareField.hpp @@ -92,7 +92,14 @@ namespace ippl { template void BareField::fillHalo() { - if(Ippl::Comm->size() > 1) { + + bool isAllSerial = true; + + for (unsigned d = 0; d < Dim; ++d) { + isAllSerial = isAllSerial && (layout_m->getRequestedDistribution(d) == SERIAL); + } + + if((Ippl::Comm->size() > 1) && (!isAllSerial)) { halo_m.fillHalo(dview_m, layout_m); } if(layout_m->isAllPeriodic_m) { @@ -106,7 +113,14 @@ namespace ippl { template void BareField::accumulateHalo() { - if(Ippl::Comm->size() > 1) { + + bool isAllSerial = true; + + for (unsigned d = 0; d < Dim; ++d) { + isAllSerial = isAllSerial && (layout_m->getRequestedDistribution(d) == SERIAL); + } + + if((Ippl::Comm->size() > 1) && (!isAllSerial)) { halo_m.accumulateHalo(dview_m, layout_m); } if(layout_m->isAllPeriodic_m) { diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index ee0c2084e..8522f9568 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -277,12 +277,12 @@ namespace ippl { IpplTimings::stopTimer(scatterTimer); - static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); - IpplTimings::startTimer(scatterAllReduceTimer); - int viewSize = 
fview.extent(0)*fview.extent(1)*fview.extent(2); - MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, - MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); - IpplTimings::stopTimer(scatterAllReduceTimer); + //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); + //IpplTimings::startTimer(scatterAllReduceTimer); + //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); + //MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, + // MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); + //IpplTimings::stopTimer(scatterAllReduceTimer); } From 47d857550d4d8db945e90489a778892dea7dc089 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 7 Dec 2022 15:42:00 +0100 Subject: [PATCH 018/117] Finished error calculation also. Need to compile and test --- alpine/PinT/ChargedParticlesPinT.hpp | 6 +++- alpine/PinT/LandauDampingPinT.cpp | 49 ++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index d894587c9..b0c3ea8a0 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -72,12 +72,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { public: ParticleAttrib q; // charge - typename ippl::ParticleBase::particle_position_type P; // G(P^(k-1)_n) + typename ippl::ParticleBase::particle_position_type P; // G(P^(k)_n) typename ippl::ParticleBase::particle_position_type E; // electric field at particle position typename ippl::ParticleBase::particle_position_type R0; // Initial particle positions at t=0 typename ippl::ParticleBase::particle_position_type P0; // Initial particle velocities at t=0 + typename ippl::ParticleBase::particle_position_type RprevIter; // G(R^(k-1)_n) + typename ippl::ParticleBase::particle_position_type PprevIter; // G(P^(k-1)_n) ChargedParticlesPinT(PLayout& pl, Vector_t hr, @@ -97,6 +99,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { this->addAttribute(E); this->addAttribute(R0); this->addAttribute(P0); + this->addAttribute(RprevIter); + this->addAttribute(PprevIter); setupBCs(); for (unsigned int i = 0; i < Dim; i++) decomp_m[i]=decomp[i]; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 0ab42a8ac..afdc7ae64 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -148,6 +148,43 @@ double PDF(const Vector_t& xvec, const double& alpha, return pdf; } +double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double temp = 0.0; + + Kokkos::parallel_reduce("Abs. 
error", Q.size(), + KOKKOS_LAMBDA(const int i, double& valL){ + Vector_t diff = Qview(i) - QprevIterView; + double myVal = dot(diff, diff).apply(); + valL += myVal; + }, Kokkos::Sum(temp)); + + + double globaltemp = 0.0; + MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + double absError = std::sqrt(globaltemp); + + temp = 0.0; + Kokkos::parallel_reduce("Q norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valL){ + double myVal = dot(Qview(i), Qview(i)).apply(); + valL += myVal; + }, Kokkos::Sum(temp)); + + + globaltemp = 0.0; + MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + double relError = absError / std::sqrt(globaltemp); + + return relError; + +} + + const char* TestName = "LandauDampingPinT"; int main(int argc, char *argv[]){ @@ -329,7 +366,8 @@ int main(int argc, char *argv[]){ Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; - double Rerror = computeL2Error( + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); @@ -350,11 +388,9 @@ int main(int argc, char *argv[]){ LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); - Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -365,6 +401,13 @@ int main(int argc, char *argv[]){ } msg << "Finished iteration: " << it+1 << endl; + + double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter); + double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter); + + if((Rerror <= tol) && (Perror <= tol)) { + isConverged = true; + } } msg << "LandauDamping Parareal: End." << endl; From 8ceeedeff3220dfa1478e7fdcac38efa006ce6f9 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 7 Dec 2022 16:15:41 +0100 Subject: [PATCH 019/117] nx,ny,nz changed for PIC and PIF. Need to compile and test --- alpine/PinT/ChargedParticlesPinT.hpp | 2 - alpine/PinT/LandauDampingPinT.cpp | 75 +++++++++++++++++----------- alpine/PinT/LeapFrogPIF.cpp | 6 ++- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index b0c3ea8a0..4e75791c0 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -67,8 +67,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double time_m; - double rhoNorm_m; - public: ParticleAttrib q; // charge diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index afdc7ae64..660afa795 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -6,13 +6,16 @@ // European Conference on Parallel Processing. Springer, Cham, 2017. // // Usage: -// srun ./LandauDampingPinT --info 5 -// nx = No. of Fourier modes in the x-direction -// ny = No. of Fourier modes in the y-direction -// nz = No. of Fourier modes in the z-direction +// srun ./LandauDampingPinT --info 5 +// nmx = No. of Fourier modes in the x-direction +// nmy = No. of Fourier modes in the y-direction +// nmz = No. of Fourier modes in the z-direction +// nx = No. of grid points in the x-direction +// ny = No. of grid points in the y-direction +// nz = No. of grid points in the z-direction // Np = Total no. 
of macro-particles in the simulation // Example: -// srun ./LandauDampingPinT 128 128 128 10000 20 0.05 0.05 1e-5 100 --info 5 +// srun ./LandauDampingPinT 16 16 16 32 32 32 655360 20 0.05 0.05 1e-5 100 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -193,27 +196,33 @@ int main(int argc, char *argv[]){ Inform msg("LandauDampingPinT"); Inform msg2all("LandauDampingPinT",INFORM_ALL_NODES); - ippl::Vector nr = { + ippl::Vector nmPIF = { std::atoi(argv[1]), std::atoi(argv[2]), std::atoi(argv[3]) }; + ippl::Vector nrPIC = { + std::atoi(argv[4]), + std::atoi(argv[5]), + std::atoi(argv[6]) + }; + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData"); IpplTimings::startTimer(mainTimer); - const size_type totalP = std::atoll(argv[4]); - const double tEnd = std::atof(argv[5]); + const size_type totalP = std::atoll(argv[7]); + const double tEnd = std::atof(argv[8]); const double dtSlice = tEnd / Ippl::Comm->size(); - const double dtFine = std::atof(argv[6]); - const double dtCoarse = std::atof(argv[7]); + const double dtFine = std::atof(argv[9]); + const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); - const double tol = std::atof(argv[8]); - const unsigned int maxIter = std::atoi(argv[9]); + const double tol = std::atof(argv[11]); + const unsigned int maxIter = std::atoi(argv[12]); const double tStartMySlice = Ippl::Comm->rank() * dtSlice; const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; @@ -228,7 +237,8 @@ int main(int argc, char *argv[]){ << "Max. 
iterations: " << maxIter << endl << " Np= " - << totalP << " Fourier modes = " << nr + << totalP << " Fourier modes = " << nmPIF + << "Grid points = " << nrPIC << endl; using bunch_type = ChargedParticlesPinT; @@ -239,9 +249,11 @@ int main(int argc, char *argv[]){ std::unique_ptr Pbegin; std::unique_ptr Pend; - ippl::NDIndex domain; + ippl::NDIndex domainPIC; + ippl::NDIndex domainPIF; for (unsigned i = 0; i< Dim; i++) { - domain[i] = ippl::Index(nr[i]); + domainPIC[i] = ippl::Index(nrPIC[i]); + domainPIF[i] = ippl::Index(nmPIF[i]); } ippl::e_dim_tag decomp[Dim]; @@ -254,29 +266,30 @@ int main(int argc, char *argv[]){ double alpha = 0.05; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; - double dx = rmax[0] / nr[0]; - double dy = rmax[1] / nr[1]; - double dz = rmax[2] / nr[2]; + double dx = rmax[0] / nrPIC[0]; + double dy = rmax[1] / nrPIC[1]; + double dz = rmax[2] / nrPIC[2]; Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; const bool isAllPeriodic=true; - Mesh_t mesh(domain, hr, origin); - FieldLayout_t FL(domain, decomp, isAllPeriodic); - PLayout_t PL(FL, mesh); + Mesh_t meshPIC(domainPIC, hr, origin); + FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic); + FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); + PLayout_t PL(FLPIC, meshPIC); //Q = -\int\int f dx dv double Q = -rmax[0] * rmax[1] * rmax[2]; Pcoarse = std::make_unique(PL,hr,rmin,rmax,decomp,Q); - Pbegin = std::make_unique(PL); - Pend = std::make_unique(PL); + Pbegin = std::make_unique(PL); + Pend = std::make_unique(PL); - Pcoarse->nr_m = nr; + Pcoarse->nr_m = nrPIC; - Pcoarse->rhoPIF_m.initialize(mesh, FL); - Pcoarse->rhoPIC_m.initialize(mesh, FL); - Pcoarse->EfieldPIC_m.initialize(mesh, FL); + Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); Pcoarse->time_m = 0.0; @@ -356,7 +369,7 @@ int main(int argc, char *argv[]){ for (unsigned int it=0; itR, Pbegin->P, ntFine, dtFine, isConverged); + LeapFrogPIF(*Pcoarse, Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice); if(isConverged) { break; @@ -400,11 +413,15 @@ int main(int argc, char *argv[]){ MPI_Wait(&request, MPI_STATUS_IGNORE); } - msg << "Finished iteration: " << it+1 << endl; double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter); double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter); + msg << "Finished iteration: " << it+1 + << "Rerror: " << Rerror + << "Perror: " << Perror + << endl; + if((Rerror <= tol) && (Perror <= tol)) { isConverged = true; } diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp index 19f47f4a7..022b6e00f 100644 --- a/alpine/PinT/LeapFrogPIF.cpp +++ b/alpine/PinT/LeapFrogPIF.cpp @@ -17,13 +17,14 @@ //#include "ChargedParticlesPinT.hpp" void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, - ParticleAttrib& Ptemp, const unsigned int nt, - const double dt, const bool isConverged) { + ParticleAttrib& Ptemp, const unsigned int& nt, + const double& dt, const bool& isConverged, const double& tStartMySlice) { const auto& PL = P.getLayout(); const auto& rmax = P.rmax_m; const auto& rmax = P.rmin_m; + P.time_m = tStartMySlice; for (unsigned int it=0; it& Rtemp, //kick Ptemp = Ptemp - 0.5 * dt * P.E; + P.time_m += dt; if(isConverged) { P.dumpLandau(P.getLocalNum()); P.dumpEnergy(P.getLocalNum()); From 0f18f74db4cbe75722bdf40e63abc5b117095e0e Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 7 Dec 2022 16:51:08 +0100 Subject: 
[PATCH 020/117] some compilation bugs fixed --- alpine/PinT/ChargedParticlesPinT.hpp | 3 +++ alpine/PinT/LandauDampingPinT.cpp | 11 ++++++----- alpine/PinT/LeapFrogPIC.cpp | 8 ++++---- alpine/PinT/LeapFrogPIF.cpp | 11 ++++++----- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 4e75791c0..f0b72514a 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -17,6 +17,7 @@ // #include "Ippl.h" +#include "Solver/FFTPeriodicPoissonSolver.h" // dimension of our positions constexpr unsigned Dim = 3; @@ -65,6 +66,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Q_m; + std::shared_ptr solver_mp; + double time_m; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 660afa795..bf28b9de1 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -159,7 +159,7 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::parallel_reduce("Abs. error", Q.size(), KOKKOS_LAMBDA(const int i, double& valL){ - Vector_t diff = Qview(i) - QprevIterView; + Vector_t diff = Qview(i) - QprevIterView(i); double myVal = dot(diff, diff).apply(); valL += myVal; }, Kokkos::Sum(temp)); @@ -275,6 +275,7 @@ int main(int argc, char *argv[]){ const bool isAllPeriodic=true; Mesh_t meshPIC(domainPIC, hr, origin); + Mesh_t meshPIF(domainPIF, hr, origin); FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic); FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); PLayout_t PL(FLPIC, meshPIC); @@ -324,7 +325,7 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); Pcoarse->rhoPIC_m = 0.0; - Pcoarse->scatter(Pcoarse->q, Pcoarse->rhoPIC_m, Pcoarse->R); + scatter(Pcoarse->q, Pcoarse->rhoPIC_m, Pcoarse->R); Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m / (hr[0] * hr[1] * hr[2]); Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m - @@ -332,7 +333,7 @@ int main(int argc, char *argv[]){ Pcoarse->solver_mp->solve(); - Pcoarse->gather(Pcoarse->E, Pcoarse->EfieldPIC_m, Pcoarse->R); + gather(Pcoarse->E, Pcoarse->EfieldPIC_m, Pcoarse->R); //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { @@ -347,12 +348,12 @@ int main(int argc, char *argv[]){ //Compute initial E fields corresponding to fine integrator Pcoarse->rhoPIF_m = {0.0, 0.0}; - Pcoarse->scatterPIF(Pcoarse->q, Pcoarse->rhoPIF_m, Pcoarse->R); + scatterPIF(Pcoarse->q, Pcoarse->rhoPIF_m, Pcoarse->R); Pcoarse->rhoPIF_m = Pcoarse->rhoPIF_m / ((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2])); - Pcoarse->gatherPIF(Pcoarse->E, Pcoarse->rhoPIF_m, Pcoarse->R); + gatherPIF(Pcoarse->E, Pcoarse->rhoPIF_m, Pcoarse->R); //Run the coarse integrator to get the values at the end of the time slice diff --git a/alpine/PinT/LeapFrogPIC.cpp b/alpine/PinT/LeapFrogPIC.cpp index 0de516a80..3d769521d 100644 --- a/alpine/PinT/LeapFrogPIC.cpp +++ b/alpine/PinT/LeapFrogPIC.cpp @@ -15,7 +15,7 @@ //#include "ChargedParticlesPinT.hpp" -void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, +void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, const double dt) { @@ -23,7 +23,7 @@ void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, const auto& hr = P.hr_m; const auto& rmax = P.rmax_m; - const auto& rmax = P.rmin_m; + const auto& rmin = P.rmin_m; for (unsigned int it=0; it& Rtemp, //scatter the charge onto the 
underlying grid P.rhoPIC_m = 0.0; - P.scatter(P.q, P.rhoPIC_m, Rtemp); + scatter(P.q, P.rhoPIC_m, Rtemp); P.rhoPIC_m = P.rhoPIC_m / (hr[0] * hr[1] * hr[2]); @@ -51,7 +51,7 @@ void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, P.solver_mp->solve(); // gather E field - P.gather(P.E, P.EfieldPIC_m, Rtemp); + gather(P.E, P.EfieldPIC_m, Rtemp); //kick Ptemp = Ptemp - 0.5 * dt * P.E; diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp index 022b6e00f..2db5251bb 100644 --- a/alpine/PinT/LeapFrogPIF.cpp +++ b/alpine/PinT/LeapFrogPIF.cpp @@ -16,13 +16,14 @@ //#include "ChargedParticlesPinT.hpp" -void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, +void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const bool& isConverged, const double& tStartMySlice) { + const double& dt, const bool& isConverged, + const double& tStartMySlice) { const auto& PL = P.getLayout(); const auto& rmax = P.rmax_m; - const auto& rmax = P.rmin_m; + const auto& rmin = P.rmin_m; P.time_m = tStartMySlice; for (unsigned int it=0; it& Rtemp, //scatter the charge onto the underlying grid P.rhoPIF_m = {0.0, 0.0}; - P.scatterPIF(P.q, P.rhoPIF_m, Rtemp); + scatterPIF(P.q, P.rhoPIF_m, Rtemp); P.rhoPIF_m = P.rhoPIF_m / ((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2])); // Solve for and gather E field - P.gatherPIF(P.E, P.rhoPIF_m, Rtemp); + gatherPIF(P.E, P.rhoPIF_m, Rtemp); //kick Ptemp = Ptemp - 0.5 * dt * P.E; From bb2ee189f70ab1681c58724ca755e86e50312dd9 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 7 Dec 2022 22:12:40 +0100 Subject: [PATCH 021/117] Code compiles now. Need to run and test --- alpine/PinT/ChargedParticlesPinT.hpp | 103 +++++++++++++++++++++++++++ alpine/PinT/LandauDampingPinT.cpp | 12 ++-- alpine/PinT/LeapFrogPIC.cpp | 2 +- alpine/PinT/LeapFrogPIF.cpp | 2 +- 4 files changed, 111 insertions(+), 8 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index f0b72514a..83e63cc9e 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -82,6 +82,23 @@ class ChargedParticlesPinT : public ippl::ParticleBase { typename ippl::ParticleBase::particle_position_type RprevIter; // G(R^(k-1)_n) typename ippl::ParticleBase::particle_position_type PprevIter; // G(P^(k-1)_n) + /* + This constructor is mandatory for all derived classes from + ParticleBase as the bunch buffer uses this + */ + ChargedParticlesPinT(PLayout& pl) + : ippl::ParticleBase(pl) + { + // register the particle attributes + this->addAttribute(q); + this->addAttribute(P); + this->addAttribute(E); + this->addAttribute(R0); + this->addAttribute(P0); + this->addAttribute(RprevIter); + this->addAttribute(PprevIter); + } + ChargedParticlesPinT(PLayout& pl, Vector_t hr, Vector_t rmin, @@ -311,6 +328,92 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } + void LeapFrogPIC(ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, const unsigned int nt, + const double dt) { + + PLayout& PL = this->getLayout(); + + for (unsigned int it=0; itsolve(); + + // gather E field + gather(E, EfieldPIC_m, Rtemp); + + //kick + Ptemp = Ptemp - 0.5 * dt * E; + } + + } + + void LeapFrogPIF(ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, const unsigned int& nt, + const double& dt, const bool& isConverged, + const double& tStartMySlice) { + + PLayout& PL = this->getLayout(); + + time_m = tStartMySlice; + for (unsigned int it=0; 
itgetLocalNum()); + dumpEnergy(this->getLocalNum()); + } + + } + } + private: void setBCAllPeriodic() { diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index bf28b9de1..40201deb9 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -35,8 +35,8 @@ #include "ChargedParticlesPinT.hpp" #include "StatesBeginSlice.hpp" #include "StatesEndSlice.hpp" -#include "LeapFrogPIC.cpp" -#include "LeapFrogPIF.cpp" +//#include "LeapFrogPIC.cpp" +//#include "LeapFrogPIF.cpp" #include #include #include @@ -337,7 +337,7 @@ int main(int argc, char *argv[]){ //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { - LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); } Ippl::Comm->barrier(); @@ -357,7 +357,7 @@ int main(int argc, char *argv[]){ //Run the coarse integrator to get the values at the end of the time slice - LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); //The following might not be needed Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); @@ -370,7 +370,7 @@ int main(int argc, char *argv[]){ for (unsigned int it=0; itR, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice); if(isConverged) { break; @@ -400,7 +400,7 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - LeapFrogPIC(*Pcoarse, Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; diff --git a/alpine/PinT/LeapFrogPIC.cpp b/alpine/PinT/LeapFrogPIC.cpp index 3d769521d..d719a423e 100644 --- a/alpine/PinT/LeapFrogPIC.cpp +++ b/alpine/PinT/LeapFrogPIC.cpp @@ -19,7 +19,7 @@ void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& R ParticleAttrib& Ptemp, const unsigned int nt, const double dt) { - const auto& PL = P.getLayout(); + PLayout_t& PL = P.getLayout(); const auto& hr = P.hr_m; const auto& rmax = P.rmax_m; diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp index 2db5251bb..9aa8c0479 100644 --- a/alpine/PinT/LeapFrogPIF.cpp +++ b/alpine/PinT/LeapFrogPIF.cpp @@ -21,7 +21,7 @@ void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& R const double& dt, const bool& isConverged, const double& tStartMySlice) { - const auto& PL = P.getLayout(); + auto& PL = P.getLayout(); const auto& rmax = P.rmax_m; const auto& rmin = P.rmin_m; From 3d542704d3b74b3ef07fcc2bca15e0967f14ec85 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 7 Dec 2022 22:46:10 +0100 Subject: [PATCH 022/117] Code runs but gives all zeros for electric field. 
Need to debug --- alpine/PinT/ChargedParticlesPinT.hpp | 28 ++++++++++++++-------------- alpine/PinT/LandauDampingPinT.cpp | 18 ++++++++++-------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 83e63cc9e..8b0a7f772 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -182,15 +182,15 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - for (int rank=0; rank < Ippl::Comm->size(); ++rank) { - if(Ippl::Comm->rank() == rank) { + //for (int rank=0; rank < Ippl::Comm->size(); ++rank) { + // if(Ippl::Comm->rank() == rank) { std::stringstream fname; fname << "data/FieldLandau_"; - fname << Ippl::Comm->size(); + fname << Ippl::Comm->rank(); fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, rank); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); @@ -201,9 +201,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { csvout << time_m << " " << fieldEnergy << " " << ExAmp << endl; - } - Ippl::Comm->barrier(); - } + // } + // Ippl::Comm->barrier(); + //} } @@ -302,15 +302,15 @@ class ChargedParticlesPinT : public ippl::ParticleBase { kineticEnergy = globaltemp; - for (int rank=0; rank < Ippl::Comm->size(); ++rank) { - if(Ippl::Comm->rank() == rank) { + //for (int rank=0; rank < Ippl::Comm->size(); ++rank) { + // if(Ippl::Comm->rank() == rank) { std::stringstream fname; fname << "data/Energy_"; - fname << Ippl::Comm->size(); + fname << Ippl::Comm->rank(); fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, rank); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); @@ -322,9 +322,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { << potentialEnergy << " " << kineticEnergy << " " << potentialEnergy + kineticEnergy << endl; - } - Ippl::Comm->barrier(); - } + //} + //Ippl::Comm->barrier(); + //} } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 40201deb9..b5f2c6259 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -15,7 +15,7 @@ // nz = No. of grid points in the z-direction // Np = Total no. of macro-particles in the simulation // Example: -// srun ./LandauDampingPinT 16 16 16 32 32 32 655360 20 0.05 0.05 1e-5 100 --info 5 +// srun ./LandauDampingPinT 16 16 16 32 32 32 655360 20.0 0.05 0.05 1e-5 100 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -229,16 +229,18 @@ int main(int argc, char *argv[]){ msg << "Parareal Landau damping" << endl - << "Slice dT: " << dtSlice + << "Slice dT: " << dtSlice + << endl << "No. of fine time steps: " << ntFine + << endl << "No. of coarse time steps: " << ntCoarse << endl << "Tolerance: " << tol - << "Max. iterations: " << maxIter + << " Max. 
iterations: " << maxIter << endl - << " Np= " - << totalP << " Fourier modes = " << nmPIF - << "Grid points = " << nrPIC + << "Np= " << totalP + << " Fourier modes = " << nmPIF + << " Grid points = " << nrPIC << endl; using bunch_type = ChargedParticlesPinT; @@ -419,8 +421,8 @@ int main(int argc, char *argv[]){ double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter); msg << "Finished iteration: " << it+1 - << "Rerror: " << Rerror - << "Perror: " << Perror + << " Rerror: " << Rerror + << " Perror: " << Perror << endl; if((Rerror <= tol) && (Perror <= tol)) { From 6525492c2df6533a4516e93d351a9929f2b83781 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 8 Dec 2022 17:16:29 +0100 Subject: [PATCH 023/117] Code is running but results are wrong. Need to debg further --- alpine/PinT/ChargedParticlesPinT.hpp | 18 +++++++ alpine/PinT/LandauDampingPinT.cpp | 76 +++++++++++++++++++--------- src/Particle/ParticleAttrib.hpp | 1 + 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 8b0a7f772..6dab41251 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -333,6 +333,17 @@ class ChargedParticlesPinT : public ippl::ParticleBase { const double dt) { PLayout& PL = this->getLayout(); + rhoPIC_m = 0.0; + scatter(q, rhoPIC_m, Rtemp); + + rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); + rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); + + //Field solve + solver_mp->solve(); + + // gather E field + gather(E, EfieldPIC_m, Rtemp); for (unsigned int it=0; it { const double& tStartMySlice) { PLayout& PL = this->getLayout(); + rhoPIF_m = {0.0, 0.0}; + scatterPIF(q, rhoPIF_m, Rtemp); + + rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); + + // Solve for and gather E field + gatherPIF(E, rhoPIF_m, Rtemp); time_m = tStartMySlice; for (unsigned int it=0; it& Q, ParticleAttrib& QprevIter) { +double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& iter, const int& myrank) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -164,6 +165,7 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr valL += myVal; }, Kokkos::Sum(temp)); + std::cout << "Rank: " << myrank << " Iter: " << iter << " Abs. 
Error: " << temp << std::endl; double globaltemp = 0.0; MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); @@ -295,7 +297,7 @@ int main(int argc, char *argv[]){ Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); - Pcoarse->time_m = 0.0; + Pcoarse->time_m = tStartMySlice; IpplTimings::startTimer(particleCreation); @@ -310,12 +312,47 @@ int main(int argc, char *argv[]){ Pcoarse->create(nloc); Pbegin->create(nloc); Pend->create(nloc); + + using buffer_type = ippl::Communicate::buffer_type; +#ifdef KOKKOS_ENABLE_CUDA + //If we don't do the following even with the same seed the initial + //condition is not the same on different GPUs + int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() == 0) { + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + + Kokkos::fence(); + size_type bufSize = Pcoarse->packedSize(nloc); + std::vector requests(0); + int sends = 0; + for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); + requests.resize(requests.size() + 1); + Ippl::Comm->isend(rank, tag, *Pcoarse, *buf, requests.back(), nloc); + buf->resetWritePos(); + ++sends; + } + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + } + else { + size_type bufSize = Pcoarse->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(0, tag, *Pcoarse, *buf, bufSize, nloc); + buf->resetReadPos(); + } +#else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU)); Kokkos::fence(); +#endif + + Ippl::Comm->barrier(); IpplTimings::stopTimer(particleCreation); @@ -326,17 +363,6 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - Pcoarse->rhoPIC_m = 0.0; - scatter(Pcoarse->q, Pcoarse->rhoPIC_m, Pcoarse->R); - Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m / (hr[0] * hr[1] * hr[2]); - - Pcoarse->rhoPIC_m = Pcoarse->rhoPIC_m - - (Pcoarse->Q_m/((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2]))); - - Pcoarse->solver_mp->solve(); - - gather(Pcoarse->E, Pcoarse->EfieldPIC_m, Pcoarse->R); - //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); @@ -348,15 +374,9 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - //Compute initial E fields corresponding to fine integrator - Pcoarse->rhoPIF_m = {0.0, 0.0}; - scatterPIF(Pcoarse->q, Pcoarse->rhoPIF_m, Pcoarse->R); - - Pcoarse->rhoPIF_m = Pcoarse->rhoPIF_m / - ((rmax[0] - rmin[0]) * (rmax[1] - rmin[1]) * (rmax[2] - rmin[2])); - - gatherPIF(Pcoarse->E, Pcoarse->rhoPIF_m, Pcoarse->R); + //Pcoarse->dumpLandau(nloc); + //Pcoarse->dumpEnergy(nloc); //Run the coarse integrator to get the values at the end of the time slice Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); @@ -365,8 +385,9 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); 
Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); - using buffer_type = ippl::Communicate::buffer_type; msg << "Starting parareal iterations ..." << endl; bool isConverged = false; for (unsigned int it=0; itRprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() > 0) { size_type bufSize = Pbegin->packedSize(nloc); @@ -417,8 +438,13 @@ int main(int argc, char *argv[]){ } - double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter); - double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter); + double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank()); + double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank()); + //double Rerror = computeL2Error(Pend->R, Pcoarse->RprevIter); + //double Perror = computeL2Error(Pend->P, Pcoarse->PprevIter); + + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 8522f9568..800cd9350 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -277,6 +277,7 @@ namespace ippl { IpplTimings::stopTimer(scatterTimer); + Kokkos::deep_copy(fview, viewLocal); //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); //IpplTimings::startTimer(scatterAllReduceTimer); //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); From 34b94e6686625794b89f521bdbe3660697701065 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 9 Dec 2022 09:27:19 +0100 Subject: [PATCH 024/117] Some more modifications to reduce initial communication --- alpine/PinT/ChargedParticlesPinT.hpp | 36 +++++++++++-------- alpine/PinT/LandauDampingPinT.cpp | 53 +++++++++++++++++----------- alpine/PinT/LeapFrogPIF.cpp | 1 + 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 6dab41251..522160d0a 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -151,7 +151,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - void dumpLandau(size_type totalP) { + void dumpLandau(size_type totalP, const unsigned int& iter) { auto Eview = E.getView(); @@ -187,6 +187,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { std::stringstream fname; fname << "data/FieldLandau_"; fname << Ippl::Comm->rank(); + fname << "_iter_"; + fname << iter; fname << ".csv"; @@ -194,9 +196,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); - if(time_m == 0.0) { - csvout << "time, Ex_field_energy, Ex_max_norm" << endl; - } + //if(time_m == 0.0) { + // csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + //} csvout << time_m << " " << fieldEnergy << " " @@ -207,7 +209,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - void dumpEnergy(size_type /*totalP*/) { + void dumpEnergy(size_type /*totalP*/, const unsigned int& iter) 
{ double potentialEnergy, kineticEnergy; @@ -307,6 +309,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { std::stringstream fname; fname << "data/Energy_"; fname << Ippl::Comm->rank(); + fname << "_iter_"; + fname << iter; fname << ".csv"; @@ -314,9 +318,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); - if(time_m == 0.0) { - csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; - } + //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; csvout << time_m << " " << potentialEnergy << " " @@ -382,8 +384,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const bool& isConverged, - const double& tStartMySlice) { + const double& dt, const bool& /*isConverged*/, + const double& tStartMySlice, const unsigned int& iter) { PLayout& PL = this->getLayout(); rhoPIF_m = {0.0, 0.0}; @@ -395,6 +397,12 @@ class ChargedParticlesPinT : public ippl::ParticleBase { gatherPIF(E, rhoPIF_m, Rtemp); time_m = tStartMySlice; + + //isConverged = false; + if((time_m == 0.0)) { + dumpLandau(this->getLocalNum(), iter); + dumpEnergy(this->getLocalNum(), iter); + } for (unsigned int it=0; it { Ptemp = Ptemp - 0.5 * dt * E; time_m += dt; - if(isConverged) { - dumpLandau(this->getLocalNum()); - dumpEnergy(this->getLocalNum()); - } + //if(isConverged) { + dumpLandau(this->getLocalNum(), iter); + dumpEnergy(this->getLocalNum(), iter); + //} } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index d49301863..85d09a6b2 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -322,32 +322,32 @@ int main(int argc, char *argv[]){ Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( - Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); Kokkos::fence(); - size_type bufSize = Pcoarse->packedSize(nloc); + size_type bufSize = Pbegin->packedSize(nloc); std::vector requests(0); int sends = 0; for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); requests.resize(requests.size() + 1); - Ippl::Comm->isend(rank, tag, *Pcoarse, *buf, requests.back(), nloc); + Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); buf->resetWritePos(); ++sends; } MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); } else { - size_type bufSize = Pcoarse->packedSize(nloc); + size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(0, tag, *Pcoarse, *buf, bufSize, nloc); + Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); } #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( - Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); Kokkos::fence(); #endif @@ -355,13 +355,17 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); IpplTimings::stopTimer(particleCreation); + Pcoarse->R = Pbegin->R * 1; + Pcoarse->P = Pbegin->P * 1; Pcoarse->q = Pcoarse->Q_m/totalP; msg << "particles 
created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later - Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + Pcoarse->R0 = Pcoarse->R * 1; + Pcoarse->P0 = Pcoarse->P * 1; //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { @@ -371,8 +375,10 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + Pbegin->R = Pcoarse->R * 1; + Pbegin->P = Pcoarse->P * 1; //Pcoarse->dumpLandau(nloc); @@ -382,8 +388,10 @@ int main(int argc, char *argv[]){ Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); //The following might not be needed - Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + Pend->R = Pcoarse->R * 1; + Pend->P = Pcoarse->P * 1; //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); @@ -393,7 +401,7 @@ int main(int argc, char *argv[]){ for (unsigned int it=0; itLeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); if(isConverged) { break; @@ -403,9 +411,10 @@ int main(int argc, char *argv[]){ Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; - Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); + Pcoarse->RprevIter = Pcoarse->R * 1; + Pcoarse->PprevIter = Pcoarse->P * 1; tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() > 0) { @@ -415,12 +424,16 @@ int main(int argc, char *argv[]){ buf->resetReadPos(); } else { - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + Pbegin->R = Pcoarse->R0 * 1; + Pbegin->P = Pcoarse->P0 * 1; } - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + Pcoarse->R = Pbegin->R * 1; + Pcoarse->P = Pbegin->P * 1; Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp index 9aa8c0479..b7473237f 100644 --- a/alpine/PinT/LeapFrogPIF.cpp +++ b/alpine/PinT/LeapFrogPIF.cpp @@ -26,6 +26,7 @@ void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& R const auto& rmin = P.rmin_m; P.time_m = tStartMySlice; + for 
(unsigned int it=0; it Date: Fri, 9 Dec 2022 23:08:03 +0100 Subject: [PATCH 025/117] Code seems to be working --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 7 +- alpine/PinT/ChargedParticlesPinT.hpp | 216 +++++++++++------- alpine/PinT/LandauDampingPinT.cpp | 115 +++++----- src/Particle/ParticleAttrib.hpp | 5 +- 4 files changed, 201 insertions(+), 142 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 9ff279c18..602964ab6 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -261,11 +261,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { }, Kokkos::Sum(temp)); - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //potentialEnergy = 0.5 * globaltemp * volume / totalP ; - potentialEnergy = 0.25 * 0.5 * globaltemp * volume; + potentialEnergy = 0.5 * temp * volume; auto Pview = P.getView(); auto qView = q.getView(); @@ -280,7 +277,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { }, Kokkos::Sum(temp)); temp *= 0.5; - globaltemp = 0.0; + double globaltemp = 0.0; MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); kineticEnergy = globaltemp; diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 522160d0a..9c993c2bd 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -151,71 +151,127 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - void dumpLandau(size_type totalP, const unsigned int& iter) { + void dumpLandau(size_type /*totalP*/, const unsigned int& iter) { - auto Eview = E.getView(); - double fieldEnergy, ExAmp; - double temp = 0.0; + double fieldEnergy = 0.0; + double ExAmp = 0.0; + //auto Eview = E.getView(); + //double temp = 0.0; - Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = Eview(i)[0] * Eview(i)[0]; - valL += myVal; - }, Kokkos::Sum(temp)); + //Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = Eview(i)[0] * Eview(i)[0]; + // valL += myVal; + // }, Kokkos::Sum(temp)); - //double globaltemp = 0.0; - double globaltemp = temp; - //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + ////double globaltemp = 0.0; + //double globaltemp = temp; + ////MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //fieldEnergy = globaltemp * volume / totalP ; + + //double tempMax = 0.0; + //Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), + // KOKKOS_LAMBDA(const size_t i, double& valL) + // { + // double myVal = std::fabs(Eview(i)[0]); + // if(myVal > valL) valL = myVal; + // }, Kokkos::Max(tempMax)); + ////ExAmp = 0.0; + //ExAmp = tempMax; + ////MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + + auto rhoview = rhoPIF_m.getView(); + const int nghost = rhoPIF_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rhoPIF_m.getLayout(); + const Mesh_t& mesh = rhoPIF_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for 
(unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Ex energy and Max", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& tlSum, + double& tlMax) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + if(Dr != 0.0) { + Ek = -(imag * kVec[0] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + } + double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + + tlSum += myVal; + + double myValMax = std::sqrt(myVal); + + if(myValMax > tlMax) tlMax = myValMax; + + }, Kokkos::Sum(fieldEnergy), Kokkos::Max(ExAmp)); + + + Kokkos::fence(); double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - fieldEnergy = globaltemp * volume / totalP ; - - double tempMax = 0.0; - Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), - KOKKOS_LAMBDA(const size_t i, double& valL) - { - double myVal = std::fabs(Eview(i)[0]); - if(myVal > valL) valL = myVal; - }, Kokkos::Max(tempMax)); - //ExAmp = 0.0; - ExAmp = tempMax; - //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - - //for (int rank=0; rank < Ippl::Comm->size(); ++rank) { - // if(Ippl::Comm->rank() == rank) { - std::stringstream fname; - fname << "data/FieldLandau_"; - fname << Ippl::Comm->rank(); - fname << "_iter_"; - fname << iter; - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - //if(time_m == 0.0) { - // csvout << "time, Ex_field_energy, Ex_max_norm" << endl; - //} - - csvout << time_m << " " - << fieldEnergy << " " - << ExAmp << endl; - // } - // Ippl::Comm->barrier(); + fieldEnergy *= volume; + + + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->rank(); + fname << "_iter_"; + fname << iter; + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + //if(time_m == 0.0) { + // csvout << "time, Ex_field_energy, Ex_max_norm" << endl; //} + + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; } - void dumpEnergy(size_type /*totalP*/, const unsigned int& iter) { + void dumpEnergy(size_type /*totalP*/, const unsigned int& iter, ParticleAttrib& Ptemp) { double potentialEnergy, kineticEnergy; double temp = 0.0; - auto rhoview = rhoPIF_m.getView(); const int nghost = rhoPIF_m.getNghost(); using mdrange_type = Kokkos::MDRangePolicy>; @@ -278,14 +334,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { }, Kokkos::Sum(temp)); - //double globaltemp = 0.0; - double globaltemp = temp; - //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //potentialEnergy = 0.5 * globaltemp * volume / totalP ; - potentialEnergy = 0.25 * 0.5 * globaltemp * volume; + potentialEnergy = 0.5 * temp * volume; - auto Pview = P.getView(); + auto Pview = Ptemp.getView(); auto qView = q.getView(); temp = 0.0; @@ -299,40 +351,35 @@ class ChargedParticlesPinT : public 
ippl::ParticleBase { temp *= 0.5; //globaltemp = 0.0; - globaltemp = temp; + double globaltemp = temp; //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); kineticEnergy = globaltemp; - //for (int rank=0; rank < Ippl::Comm->size(); ++rank) { - // if(Ippl::Comm->rank() == rank) { - std::stringstream fname; - fname << "data/Energy_"; - fname << Ippl::Comm->rank(); - fname << "_iter_"; - fname << iter; - fname << ".csv"; + std::stringstream fname; + fname << "data/Energy_"; + fname << Ippl::Comm->rank(); + fname << "_iter_"; + fname << iter; + fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); - //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; - csvout << time_m << " " - << potentialEnergy << " " - << kineticEnergy << " " - << potentialEnergy + kineticEnergy << endl; - //} - //Ippl::Comm->barrier(); - //} + csvout << time_m << " " + << potentialEnergy << " " + << kineticEnergy << " " + << potentialEnergy + kineticEnergy << endl; } void LeapFrogPIC(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, - const double dt) { + const double dt, const double& tStartMySlice) { PLayout& PL = this->getLayout(); rhoPIC_m = 0.0; @@ -347,6 +394,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { // gather E field gather(E, EfieldPIC_m, Rtemp); + time_m = tStartMySlice; + + for (unsigned int it=0; it { //kick Ptemp = Ptemp - 0.5 * dt * E; + + time_m += dt; } } @@ -398,10 +450,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - //isConverged = false; if((time_m == 0.0)) { dumpLandau(this->getLocalNum(), iter); - dumpEnergy(this->getLocalNum(), iter); + dumpEnergy(this->getLocalNum(), iter, Ptemp); } for (unsigned int it=0; it { Ptemp = Ptemp - 0.5 * dt * E; time_m += dt; - //if(isConverged) { + dumpLandau(this->getLocalNum(), iter); - dumpEnergy(this->getLocalNum(), iter); - //} + dumpEnergy(this->getLocalNum(), iter, Ptemp); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 85d09a6b2..c29865203 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -270,23 +270,28 @@ int main(int argc, char *argv[]){ double alpha = 0.05; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; - double dx = rmax[0] / nrPIC[0]; - double dy = rmax[1] / nrPIC[1]; - double dz = rmax[2] / nrPIC[2]; + double dxPIC = rmax[0] / nrPIC[0]; + double dyPIC = rmax[1] / nrPIC[1]; + double dzPIC = rmax[2] / nrPIC[2]; - Vector_t hr = {dx, dy, dz}; + + double dxPIF = rmax[0] / nmPIF[0]; + double dyPIF = rmax[1] / nmPIF[1]; + double dzPIF = rmax[2] / nmPIF[2]; + Vector_t hrPIC = {dxPIC, dyPIC, dzPIC}; + Vector_t hrPIF = {dxPIF, dyPIF, dzPIF}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; const bool isAllPeriodic=true; - Mesh_t meshPIC(domainPIC, hr, origin); - Mesh_t meshPIF(domainPIF, hr, origin); + Mesh_t meshPIC(domainPIC, hrPIC, origin); + Mesh_t meshPIF(domainPIF, hrPIF, origin); FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic); FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); PLayout_t PL(FLPIC, meshPIC); //Q = -\int\int f dx dv double Q = -rmax[0] * rmax[1] * rmax[2]; - Pcoarse 
= std::make_unique(PL,hr,rmin,rmax,decomp,Q); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -319,7 +324,7 @@ int main(int argc, char *argv[]){ //condition is not the same on different GPUs int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() == 0) { - Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::parallel_for(nloc, generate_random, Dim>( Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); @@ -343,58 +348,58 @@ int main(int argc, char *argv[]){ Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); } + Ippl::Comm->barrier(); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( - Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, alpha, kw, minU, maxU)); Kokkos::fence(); + Ippl::Comm->barrier(); #endif - Ippl::Comm->barrier(); + Pcoarse->q = Pcoarse->Q_m/totalP; IpplTimings::stopTimer(particleCreation); - Pcoarse->R = Pbegin->R * 1; - Pcoarse->P = Pbegin->P * 1; - Pcoarse->q = Pcoarse->Q_m/totalP; + //Pcoarse->R = Pbegin->R * 1; + //Pcoarse->P = Pbegin->P * 1; + msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later - //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - Pcoarse->R0 = Pcoarse->R * 1; - Pcoarse->P0 = Pcoarse->P * 1; + Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + //Pcoarse->R0 = Pcoarse->R * 1; + //Pcoarse->P0 = Pcoarse->P * 1; //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); } Ippl::Comm->barrier(); - //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - Pbegin->R = Pcoarse->R * 1; - Pbegin->P = Pcoarse->P * 1; + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + //Pbegin->R = Pcoarse->R * 1; + //Pbegin->P = Pcoarse->P * 1; - //Pcoarse->dumpLandau(nloc); - //Pcoarse->dumpEnergy(nloc); //Run the coarse integrator to get the values at the end of the time slice - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); //The following might not be needed - //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - Pend->R = Pcoarse->R * 1; - Pend->P = Pcoarse->P * 1; + Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //Pend->R = Pcoarse->R * 1; + //Pend->P = Pcoarse->P * 1; - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); - 
//Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); msg << "Starting parareal iterations ..." << endl; bool isConverged = false; @@ -403,18 +408,28 @@ int main(int argc, char *argv[]){ //Run fine integrator in parallel Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); - if(isConverged) { - break; - } + //if(isConverged) { + + //test with the serial solution + //Pcoarse->LeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, dtFine, isConverged, tStartMySlice, it+1); + //Ippl::Comm->barrier(); + //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); + //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); + //msg << "Finished iteration: " << it+1 + //<< " Rerror: " << Rerror + //<< " Perror: " << Perror + //<< endl; + // break; + //} //Difference = Fine - Coarse Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - Pcoarse->RprevIter = Pcoarse->R * 1; - Pcoarse->PprevIter = Pcoarse->P * 1; + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); + //Pcoarse->RprevIter = Pcoarse->R * 1; + //Pcoarse->PprevIter = Pcoarse->P * 1; tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() > 0) { @@ -424,19 +439,19 @@ int main(int argc, char *argv[]){ buf->resetReadPos(); } else { - //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); - Pbegin->R = Pcoarse->R0 * 1; - Pbegin->P = Pcoarse->P0 * 1; + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + //Pbegin->R = Pcoarse->R0 * 1; + //Pbegin->P = Pcoarse->P0 * 1; } - //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - Pcoarse->R = Pbegin->R * 1; - Pcoarse->P = Pbegin->P * 1; + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + //Pcoarse->R = Pbegin->R * 1; + //Pcoarse->P = Pbegin->P * 1; - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; @@ -453,11 +468,7 @@ int main(int argc, char *argv[]){ double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank()); double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank()); - //double Rerror = computeL2Error(Pend->R, Pcoarse->RprevIter); - //double Perror = computeL2Error(Pend->P, Pcoarse->PprevIter); - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror @@ -465,7 +476,7 @@ int main(int argc, char *argv[]){ << endl; if((Rerror <= tol) && (Perror <= tol)) { - isConverged = true; + break; } } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 800cd9350..e2c3928e5 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -269,7 +269,8 @@ namespace ippl { }, 
Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { - viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; + //viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; + fview(i+nghost,j+nghost,k+nghost) = reducedValue; } } @@ -277,7 +278,7 @@ namespace ippl { IpplTimings::stopTimer(scatterTimer); - Kokkos::deep_copy(fview, viewLocal); + //Kokkos::deep_copy(fview, viewLocal); //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); //IpplTimings::startTimer(scatterAllReduceTimer); //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); From f8afcdaaa464d1ad7b4104e6d354b441ce848d36 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 10 Dec 2022 07:03:38 +0100 Subject: [PATCH 026/117] Some cleanup done --- alpine/PinT/ChargedParticlesPinT.hpp | 12 +-------- alpine/PinT/LandauDampingPinT.cpp | 39 ++++------------------------ 2 files changed, 6 insertions(+), 45 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 9c993c2bd..d292e65a9 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -182,8 +182,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //ExAmp = tempMax; ////MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - auto rhoview = rhoPIF_m.getView(); const int nghost = rhoPIF_m.getNghost(); using mdrange_type = Kokkos::MDRangePolicy>; @@ -398,12 +396,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { for (unsigned int it=0; it { } for (unsigned int it=0; it& Q, ParticleAttrib& QprevIter, const unsigned int& iter, const int& myrank) { @@ -319,10 +304,11 @@ int main(int argc, char *argv[]){ Pend->create(nloc); using buffer_type = ippl::Communicate::buffer_type; + int tag; #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs - int tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::parallel_for(nloc, @@ -365,16 +351,11 @@ int main(int argc, char *argv[]){ Pcoarse->q = Pcoarse->Q_m/totalP; IpplTimings::stopTimer(particleCreation); - //Pcoarse->R = Pbegin->R * 1; - //Pcoarse->P = Pbegin->P * 1; - msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - //Pcoarse->R0 = Pcoarse->R * 1; - //Pcoarse->P0 = Pcoarse->P * 1; //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { @@ -386,9 +367,6 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - //Pbegin->R = Pcoarse->R * 1; - //Pbegin->P = Pcoarse->P * 1; - //Run the coarse integrator to get the values at the end of the time slice @@ -397,8 +375,6 @@ int main(int argc, char *argv[]){ //The following might not be needed Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - //Pend->R = Pcoarse->R * 1; - //Pend->P = Pcoarse->P * 1; msg << "Starting parareal iterations ..." 
<< endl; @@ -411,7 +387,8 @@ int main(int argc, char *argv[]){ //if(isConverged) { //test with the serial solution - //Pcoarse->LeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, dtFine, isConverged, tStartMySlice, it+1); + //Pcoarse->LeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, it+1); //Ippl::Comm->barrier(); //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); @@ -428,8 +405,7 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - //Pcoarse->RprevIter = Pcoarse->R * 1; - //Pcoarse->PprevIter = Pcoarse->P * 1; + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() > 0) { @@ -441,15 +417,10 @@ int main(int argc, char *argv[]){ else { Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); - //Pbegin->R = Pcoarse->R0 * 1; - //Pbegin->P = Pcoarse->P0 * 1; } Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - //Pcoarse->R = Pbegin->R * 1; - //Pcoarse->P = Pbegin->P * 1; - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); From 5a39d31c87180fa30201b6953637a4dd18d3268b Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 10 Dec 2022 08:04:06 +0100 Subject: [PATCH 027/117] Error Vs iterations file writing added --- alpine/PinT/ChargedParticlesPinT.hpp | 24 ++++++++++++++++++++++++ alpine/PinT/LandauDampingPinT.cpp | 2 ++ 2 files changed, 26 insertions(+) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index d292e65a9..a7955ce63 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -375,6 +375,30 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } + void writeError(double Rerror, double Perror, unsigned int iter) { + + if(Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/Error_Vs_Iter.csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(iter == 1) { + csvout << "Iter, Rerror, Perror" << endl; + } + + csvout << iter << " " + << Rerror << " " + << Perror << endl; + + } + + Ippl::Comm->barrier(); + + } + void LeapFrogPIC(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, const double dt, const double& tStartMySlice) { diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index a3f59bb8a..e680ee7e9 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -446,6 +446,8 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; + Pcoarse->writeError(Rerror, Perror, it+1); + if((Rerror <= tol) && (Perror <= tol)) { break; } From 0c588e25ec195a7a2a0209ae7636db64f1d61541 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 12 Dec 2022 12:27:22 +0100 Subject: [PATCH 028/117] Particle periodic BCs changed as the previous one was giving seg faults --- alpine/PinT/ChargedParticlesPinT.hpp | 50 ++++++++++++++++++++++++++++ alpine/PinT/LandauDampingPinT.cpp | 5 +-- src/Particle/ParticleAttrib.hpp | 13 +++++--- src/Particle/ParticleBC.h | 18 
++++++++-- 4 files changed, 77 insertions(+), 9 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index a7955ce63..d019dd982 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -399,11 +399,57 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } + void checkBounds(ParticleAttrib& R) { + + auto Rview = R.getView(); + double xMin = 0.0; + double yMin = 0.0; + double zMin = 0.0; + double xMax = 0.0; + double yMax = 0.0; + double zMax = 0.0; + Kokkos::parallel_reduce("Bounds calculation", R.size(), + KOKKOS_LAMBDA(const int i, + double& xlMin, + double& ylMin, + double& zlMin, + double& xlMax, + double& ylMax, + double& zlMax){ + + if(Rview(i)[0] < xlMin) xlMin = Rview(i)[0]; + if(Rview(i)[1] < ylMin) ylMin = Rview(i)[1]; + if(Rview(i)[2] < zlMin) zlMin = Rview(i)[2]; + + if(Rview(i)[0] > xlMax) xlMax = Rview(i)[0]; + if(Rview(i)[1] > ylMax) ylMax = Rview(i)[1]; + if(Rview(i)[2] > zlMax) zlMax = Rview(i)[2]; + + }, Kokkos::Min(xMin), Kokkos::Min(yMin), Kokkos::Min(zMin), + Kokkos::Max(xMax), Kokkos::Max(yMax), Kokkos::Max(zMax)); + + Kokkos::fence(); + + Vector_t Rmin = {xMin, yMin, zMin}; + Vector_t Rmax = {xMax, yMax, zMax}; + + for (unsigned d = 0; d < 3; ++d) { + if(Rmin[d] < rmin_m[d]) { + std::cout << "Invalid particles with min. in rank: " << Ippl::Comm->rank() << " Rmin: " << Rmin << std::endl; + } + if(Rmax[d] > rmax_m[d]) { + std::cout << "Invalid particles with max. in rank: " << Ippl::Comm->rank() << " Rmax: " << Rmax << std::endl; + } + } + } + void LeapFrogPIC(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, const double dt, const double& tStartMySlice) { PLayout& PL = this->getLayout(); + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp); @@ -429,6 +475,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //Apply particle BC PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); //scatter the charge onto the underlying grid rhoPIC_m = 0.0; @@ -458,6 +505,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { const double& tStartMySlice, const unsigned int& iter) { PLayout& PL = this->getLayout(); + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; scatterPIF(q, rhoPIF_m, Rtemp); @@ -483,6 +532,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //Apply particle BC PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index e680ee7e9..3db0ea0e3 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -197,7 +197,6 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); - static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData"); IpplTimings::startTimer(mainTimer); @@ -212,7 +211,7 @@ int main(int argc, char *argv[]){ const unsigned int maxIter = std::atoi(argv[12]); const double tStartMySlice = Ippl::Comm->rank() * dtSlice; - const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; + //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; msg << "Parareal Landau damping" << endl @@ -363,6 +362,7 @@ int 
main(int argc, char *argv[]){ } Ippl::Comm->barrier(); + msg << "First Leap frog PIC done " << endl; Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); @@ -371,6 +371,7 @@ int main(int argc, char *argv[]){ //Run the coarse integrator to get the values at the end of the time slice Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + msg << "Second Leap frog PIC done " << endl; //The following might not be needed Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index e2c3928e5..498d49cd7 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -170,10 +170,15 @@ namespace ippl { Vector whi = l - index; Vector wlo = 1.0 - whi; - const size_t i = index[0] - lDom[0].first() + nghost; - const size_t j = index[1] - lDom[1].first() + nghost; - const size_t k = index[2] - lDom[2].first() + nghost; - + const int i = index[0] - lDom[0].first() + nghost; + const int j = index[1] - lDom[1].first() + nghost; + const int k = index[2] - lDom[2].first() + nghost; + + //if((i < 1) || (i > lDom[0].last() + 2) || (j < 1) || (j > lDom[1].last() + 2) + // || (k < 1) || (k > lDom[0].last() + 2)) { + // std::cout << "i: " << i << ", j: " << j << ", k: " << k << std::endl; + // std::cout << "Invalid particle co-ordinates: " << pp(idx) << std::endl; + //} // scatter const value_type& val = dview_m(idx); diff --git a/src/Particle/ParticleBC.h b/src/Particle/ParticleBC.h index 275f04e00..dfd5aa5a0 100644 --- a/src/Particle/ParticleBC.h +++ b/src/Particle/ParticleBC.h @@ -77,8 +77,11 @@ namespace ippl { struct PeriodicBC : public ParticleBC { using value_type = typename ParticleBC::value_type; - using ParticleBC::extent_m; - using ParticleBC::middle_m; + //using ParticleBC::extent_m; + //using ParticleBC::middle_m; + using ParticleBC::maxval_m; + using ParticleBC::minval_m; + using ParticleBC::isUpper_m; KOKKOS_DEFAULTED_FUNCTION PeriodicBC() = default; @@ -94,7 +97,16 @@ namespace ippl { KOKKOS_INLINE_FUNCTION void operator()(const size_t& i) const { value_type& value = this->view_m(i)[this->dim_m]; - value = value - extent_m * (int)((value - middle_m) * 2 / extent_m); + //value = value - this->extent_m * (int)((value - this->middle_m) * 2 / extent_m); + //if ((value < this->minval_m) && (!this->isUpper_m)) + // value = (this->maxval_m - (this->minval_m - value)); + //else if ((value >= this->maxval_m) && (this->isUpper_m)) + // value = (this->minval_m + (value - this->maxval_m)); + bool tooHigh = value >= maxval_m; + bool tooLow = value < minval_m; + + value += tooHigh * (minval_m - maxval_m) + + tooLow * (maxval_m - minval_m); } KOKKOS_DEFAULTED_FUNCTION From 0e09cb2a6aaecefbce11a37568f280caa22915db Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 12 Dec 2022 22:12:12 +0100 Subject: [PATCH 029/117] Several tests performed. Need to run with larger no. 
of particles and CPU cores --- alpine/PinT/ChargedParticlesPinT.hpp | 2 + alpine/PinT/LandauDampingPinT.cpp | 138 +++++++++++++++++++++++---- 2 files changed, 124 insertions(+), 16 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index d019dd982..fd6735720 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -53,8 +53,10 @@ template class ChargedParticlesPinT : public ippl::ParticleBase { public: CxField_t rhoPIF_m; + CxField_t rhoPIFprevIter_m; Field_t rhoPIC_m; VField_t EfieldPIC_m; + //VField_t EfieldPICprevIter_m; Vector nr_m; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 3db0ea0e3..48c05bca5 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -141,32 +141,36 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); - double temp = 0.0; + double localError = 0.0; + double localNorm = 0.0; - Kokkos::parallel_reduce("Abs. error", Q.size(), - KOKKOS_LAMBDA(const int i, double& valL){ + Kokkos::parallel_reduce("Abs. error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); - double myVal = dot(diff, diff).apply(); - valL += myVal; - }, Kokkos::Sum(temp)); + double myValError = dot(diff, diff).apply(); + valLError += myValError; + double myValnorm = dot(Qview(i), Qview(i)).apply(); + valLnorm += myValnorm; + }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); - std::cout << "Rank: " << myrank << " Iter: " << iter << " Abs. Error: " << temp << std::endl; + Kokkos::fence(); + std::cout << "Rank: " << myrank << " Iter: " << iter << " Abs. 
Error: " << localError << std::endl; double globaltemp = 0.0; - MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); double absError = std::sqrt(globaltemp); - temp = 0.0; - Kokkos::parallel_reduce("Q norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = dot(Qview(i), Qview(i)).apply(); - valL += myVal; - }, Kokkos::Sum(temp)); + //temp = 0.0; + //Kokkos::parallel_reduce("Q norm", Q.size(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = dot(Qview(i), Qview(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(temp)); globaltemp = 0.0; - MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); double relError = absError / std::sqrt(globaltemp); @@ -174,6 +178,88 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr } +double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { + + auto rhoview = rhoPIF.getView(); + auto rhoprevview = rhoPIFprevIter.getView(); + const int nghost = rhoPIF.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rhoPIF.getLayout(); + const Mesh_t& mesh = rhoPIF.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + double AbsError = 0.0; + double Enorm = 0.0; + //Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Ex field error", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& errorSum, + double& fieldSum) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + double myError = 0.0; + double myField = 0.0; + //Kokkos::complex Ek = {0.0, 0.0}; + //Kokkos::complex Ekprev = {0.0, 0.0}; + //for(size_t d = 0; d < Dim; ++d) { + // if(Dr != 0.0) { + // Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + // Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); + // } + // Ekprev = Ekprev - Ek; + // myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); + // myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + //} + //errorSum += myError; + //fieldSum += myField; + Kokkos::complex rhok = rhoview(i+nghost,j+nghost,k+nghost); + Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); + rhokprev = rhokprev - rhok; + myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); + errorSum += myError; + myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); + fieldSum += myField; + + }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); + + Kokkos::fence(); + double globalError = 0.0; + MPI_Allreduce(&AbsError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double globalNorm = 0.0; + MPI_Allreduce(&Enorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //fieldEnergy *= volume; + + double relError = std::sqrt(globalError)/std::sqrt(globalNorm); + + return relError; +} + const char* TestName = 
"LandauDampingPinT"; @@ -282,8 +368,11 @@ int main(int argc, char *argv[]){ Pcoarse->nr_m = nrPIC; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); Pcoarse->time_m = tStartMySlice; @@ -373,6 +462,8 @@ int main(int argc, char *argv[]){ Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); msg << "Second Leap frog PIC done " << endl; + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + //The following might not be needed Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); @@ -384,6 +475,7 @@ int main(int argc, char *argv[]){ //Run fine integrator in parallel Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); + //if(isConverged) { @@ -437,17 +529,31 @@ int main(int argc, char *argv[]){ MPI_Wait(&request, MPI_STATUS_IGNORE); } + //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; + //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); + //double absFieldError = std::sqrt(Pcoarse->rhoPIC_m.sum()); + //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPIC_m, Pcoarse->EfieldPIC_m); + //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); + //double EfieldError = absFieldError / EfieldNorm; double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank()); double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank()); - + + double EfieldError = 0; + if(it > 0) { + EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + } + Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror + //<< " Efield error: " << EfieldError + << " Rhofield error: " << EfieldError << endl; Pcoarse->writeError(Rerror, Perror, it+1); + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); if((Rerror <= tol) && (Perror <= tol)) { break; From fb65a6676f33daa9f10b05ce9e9dab3d2eedd92a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 13 Dec 2022 07:20:19 +0100 Subject: [PATCH 030/117] Some checking things for PIC --- alpine/PinT/ChargedParticlesPinT.hpp | 66 +++++++++++++++++++++++ alpine/PinT/LandauDampingPinT.cpp | 80 ++++++++++++++-------------- 2 files changed, 107 insertions(+), 39 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index fd6735720..93ce2f64f 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -152,7 +152,71 @@ class ChargedParticlesPinT : public ippl::ParticleBase { solver_mp->setLhs(EfieldPIC_m); } + void dumpLandauPIC() { + const int nghostE = EfieldPIC_m.getNghost(); + auto Eview = EfieldPIC_m.getView(); + double fieldEnergy, ExAmp; + using mdrange_type = Kokkos::MDRangePolicy>; + + double temp = 0.0; + Kokkos::parallel_reduce("Ex inner product", + mdrange_type({nghostE, nghostE, nghostE}, + {Eview.extent(0) - nghostE, + Eview.extent(1) - nghostE, + Eview.extent(2) - nghostE}), + KOKKOS_LAMBDA(const size_t i, const size_t j, + const size_t k, 
double& valL) + { + double myVal = std::pow(Eview(i, j, k)[0], 2); + valL += myVal; + }, Kokkos::Sum(temp)); + double globaltemp = temp; + //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; + + double tempMax = 0.0; + Kokkos::parallel_reduce("Ex max norm", + mdrange_type({nghostE, nghostE, nghostE}, + {Eview.extent(0) - nghostE, + Eview.extent(1) - nghostE, + Eview.extent(2) - nghostE}), + KOKKOS_LAMBDA(const size_t i, const size_t j, + const size_t k, double& valL) + { + double myVal = std::fabs(Eview(i, j, k)[0]); + if(myVal > valL) valL = myVal; + }, Kokkos::Max(tempMax)); + ExAmp = tempMax; + //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + + + if (Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << Ippl::Comm->size(); + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } + + csvout << time_m << " " + << fieldEnergy << " " + << ExAmp << endl; + + } + + Ippl::Comm->barrier(); + } + + + void dumpLandau(size_type /*totalP*/, const unsigned int& iter) { @@ -466,6 +530,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; + dumpLandauPIC(); for (unsigned int it=0; it { Ptemp = Ptemp - 0.5 * dt * E; time_m += dt; + dumpLandauPIC(); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 48c05bca5..05dd0b4ec 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -199,7 +199,7 @@ double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { double AbsError = 0.0; double Enorm = 0.0; - //Kokkos::complex imag = {0.0, 1.0}; + Kokkos::complex imag = {0.0, 1.0}; double pi = std::acos(-1.0); Kokkos::parallel_reduce("Ex field error", mdrange_type({0, 0, 0}, @@ -224,26 +224,26 @@ double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { double myError = 0.0; double myField = 0.0; - //Kokkos::complex Ek = {0.0, 0.0}; - //Kokkos::complex Ekprev = {0.0, 0.0}; - //for(size_t d = 0; d < Dim; ++d) { - // if(Dr != 0.0) { - // Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - // Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); - // } - // Ekprev = Ekprev - Ek; - // myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); - // myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); - //} - //errorSum += myError; - //fieldSum += myField; - Kokkos::complex rhok = rhoview(i+nghost,j+nghost,k+nghost); - Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); - rhokprev = rhokprev - rhok; - myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); + Kokkos::complex Ek = {0.0, 0.0}; + Kokkos::complex Ekprev = {0.0, 0.0}; + for(size_t d = 0; d < Dim; ++d) { + if(Dr != 0.0) { + Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); + } + Ekprev = Ekprev - Ek; + myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); + myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } errorSum += myError; - myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); fieldSum += myField; + //Kokkos::complex rhok = 
rhoview(i+nghost,j+nghost,k+nghost); + //Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); + //rhokprev = rhokprev - rhok; + //myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); + //errorSum += myError; + //myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); + //fieldSum += myField; }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); @@ -446,30 +446,32 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); //Get initial guess for ranks other than 0 by propagating the coarse solver - if (Ippl::Comm->rank() > 0) { - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); + //if (Ippl::Comm->rank() > 0) { + if (Ippl::Comm->rank() == 0) { + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->size()*ntCoarse, dtCoarse, tStartMySlice); } Ippl::Comm->barrier(); - msg << "First Leap frog PIC done " << endl; + //msg << "First Leap frog PIC done " << endl; - - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + // + //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - //Run the coarse integrator to get the values at the end of the time slice - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - msg << "Second Leap frog PIC done " << endl; + ////Run the coarse integrator to get the values at the end of the time slice + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + //msg << "Second Leap frog PIC done " << endl; - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + ////Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - //The following might not be needed - Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + ////The following might not be needed + //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - msg << "Starting parareal iterations ..." << endl; + //msg << "Starting parareal iterations ..." 
<< endl; bool isConverged = false; for (unsigned int it=0; itR, Pcoarse->RprevIter, it+1, Ippl::Comm->rank()); double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank()); - double EfieldError = 0; - if(it > 0) { - EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - } + //double EfieldError = 0; + //if(it > 0) { + // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + //} - Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); + //Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror //<< " Efield error: " << EfieldError - << " Rhofield error: " << EfieldError + //<< " Rhofield error: " << EfieldError << endl; Pcoarse->writeError(Rerror, Perror, it+1); From 631b008db1b428bfcb74a87eabf755452d48e7a0 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 13 Dec 2022 09:39:55 +0100 Subject: [PATCH 031/117] PIC checked --- alpine/PinT/ChargedParticlesPinT.hpp | 8 +++---- alpine/PinT/LandauDampingPinT.cpp | 32 +++++++++++++--------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 93ce2f64f..90f055175 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -53,7 +53,7 @@ template class ChargedParticlesPinT : public ippl::ParticleBase { public: CxField_t rhoPIF_m; - CxField_t rhoPIFprevIter_m; + //CxField_t rhoPIFprevIter_m; Field_t rhoPIC_m; VField_t EfieldPIC_m; //VField_t EfieldPICprevIter_m; @@ -212,7 +212,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - Ippl::Comm->barrier(); + //Ippl::Comm->barrier(); } @@ -530,7 +530,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - dumpLandauPIC(); + //dumpLandauPIC(); for (unsigned int it=0; it { Ptemp = Ptemp - 0.5 * dt * E; time_m += dt; - dumpLandauPIC(); + //dumpLandauPIC(); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 05dd0b4ec..c4b7d3580 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -368,7 +368,7 @@ int main(int argc, char *argv[]){ Pcoarse->nr_m = nrPIC; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); - Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); + //Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); @@ -446,32 +446,30 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); //Get initial guess for ranks other than 0 by propagating the coarse solver - //if (Ippl::Comm->rank() > 0) { - if (Ippl::Comm->rank() == 0) { - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->size()*ntCoarse, dtCoarse, tStartMySlice); + if (Ippl::Comm->rank() > 0) { + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); } Ippl::Comm->barrier(); - //msg << "First Leap frog PIC done " << endl; + msg << "First Leap frog PIC done " << endl; - // - //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + + 
Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - ////Run the coarse integrator to get the values at the end of the time slice - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - //msg << "Second Leap frog PIC done " << endl; + //Run the coarse integrator to get the values at the end of the time slice + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + msg << "Second Leap frog PIC done " << endl; - ////Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - ////The following might not be needed - //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //The following might not be needed + Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - //msg << "Starting parareal iterations ..." << endl; + msg << "Starting parareal iterations ..." << endl; bool isConverged = false; for (unsigned int it=0; it Date: Fri, 16 Dec 2022 16:04:53 +0100 Subject: [PATCH 032/117] Current version corresponding to the slides --- alpine/PinT/ChargedParticlesPinT.hpp | 10 ++++- alpine/PinT/LandauDampingPinT.cpp | 56 +++++++++++++++++++++++----- src/Particle/ParticleAttrib.hpp | 36 +++++++++--------- 3 files changed, 74 insertions(+), 28 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 90f055175..59f8aeaed 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -53,7 +53,7 @@ template class ChargedParticlesPinT : public ippl::ParticleBase { public: CxField_t rhoPIF_m; - //CxField_t rhoPIFprevIter_m; + CxField_t rhoPIFprevIter_m; Field_t rhoPIC_m; VField_t EfieldPIC_m; //VField_t EfieldPICprevIter_m; @@ -513,6 +513,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { ParticleAttrib& Ptemp, const unsigned int nt, const double dt, const double& tStartMySlice) { + static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); @@ -553,7 +554,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); //Field solve + IpplTimings::startTimer(fieldSolvePIC); solver_mp->solve(); + IpplTimings::stopTimer(fieldSolvePIC); // gather E field gather(E, EfieldPIC_m, Rtemp); @@ -572,6 +575,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { const double& dt, const bool& /*isConverged*/, const double& tStartMySlice, const unsigned int& iter) { + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); @@ -586,8 +590,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; if((time_m == 0.0)) { + IpplTimings::startTimer(dumpData); dumpLandau(this->getLocalNum(), iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); + IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { time_m += dt; + IpplTimings::startTimer(dumpData); dumpLandau(this->getLocalNum(), iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); + 
IpplTimings::stopTimer(dumpData); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index c4b7d3580..3b3aabee0 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -137,7 +137,7 @@ struct generate_random { double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& iter, const int& myrank) { + const unsigned int& /*iter*/, const int& /*myrank*/) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -154,7 +154,7 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - std::cout << "Rank: " << myrank << " Iter: " << iter << " Abs. Error: " << localError << std::endl; + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Abs. Error: " << localError << std::endl; double globaltemp = 0.0; MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); @@ -283,6 +283,12 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); + static IpplTimings::TimerRef timeCommunication = IpplTimings::getTimer("timeCommunication"); + static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); + static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); + static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); IpplTimings::startTimer(mainTimer); @@ -368,7 +374,7 @@ int main(int argc, char *argv[]){ Pcoarse->nr_m = nrPIC; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); - //Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); @@ -423,8 +429,10 @@ int main(int argc, char *argv[]){ buf->resetReadPos(); } Ippl::Comm->barrier(); + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, @@ -442,31 +450,41 @@ int main(int argc, char *argv[]){ msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); //Get initial guess for ranks other than 0 by propagating the coarse solver + IpplTimings::startTimer(coarsePropagator); if (Ippl::Comm->rank() > 0) { Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); } + IpplTimings::stopTimer(coarsePropagator); Ippl::Comm->barrier(); msg << "First Leap frog PIC done " << endl; + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); //Run the coarse integrator to get the values at the end of the time slice + 
IpplTimings::startTimer(coarsePropagator); Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + IpplTimings::stopTimer(coarsePropagator); msg << "Second Leap frog PIC done " << endl; //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); //The following might not be needed + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); msg << "Starting parareal iterations ..." << endl; @@ -474,7 +492,9 @@ int main(int argc, char *argv[]){ for (unsigned int it=0; itLeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); + IpplTimings::stopTimer(finePropagator); //if(isConverged) { @@ -496,9 +516,12 @@ int main(int argc, char *argv[]){ Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() > 0) { @@ -511,15 +534,21 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); } + IpplTimings::stopTimer(timeCommunication); + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(coarsePropagator); Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + IpplTimings::startTimer(timeCommunication); if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -528,6 +557,7 @@ int main(int argc, char *argv[]){ buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } + IpplTimings::stopTimer(timeCommunication); //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); @@ -536,26 +566,34 @@ int main(int argc, char *argv[]){ //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); //double EfieldError = absFieldError / EfieldNorm; + IpplTimings::startTimer(computeErrors); double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank()); double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank()); - //double EfieldError = 0; - //if(it > 0) { - // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - //} + double EfieldError = 0; + if(it > 0) { + EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + } + IpplTimings::stopTimer(computeErrors); - //Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); + IpplTimings::stopTimer(deepCopy); + msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror - //<< " Efield error: " << 
EfieldError + << " Efield error: " << EfieldError //<< " Rhofield error: " << EfieldError << endl; + IpplTimings::startTimer(dumpData); Pcoarse->writeError(Rerror, Perror, it+1); + IpplTimings::stopTimer(dumpData); //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); if((Rerror <= tol) && (Perror <= tol)) { + //if(Perror <= tol) { break; } } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 498d49cd7..3c0d9e183 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -142,8 +142,8 @@ namespace ippl { const ParticleAttrib< Vector, Properties... >& pp) const { - static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("Scatter"); - IpplTimings::startTimer(scatterTimer); + static IpplTimings::TimerRef scatterPICTimer = IpplTimings::getTimer("ScatterPIC"); + IpplTimings::startTimer(scatterPICTimer); typename Field::view_type view = f.getView(); const M& mesh = f.get_mesh(); @@ -192,12 +192,12 @@ namespace ippl { Kokkos::atomic_add(&view(i, j, k ), whi[0] * whi[1] * whi[2] * val); } ); - IpplTimings::stopTimer(scatterTimer); + IpplTimings::stopTimer(scatterPICTimer); - static IpplTimings::TimerRef accumulateHaloTimer = IpplTimings::getTimer("AccumulateHalo"); - IpplTimings::startTimer(accumulateHaloTimer); + //static IpplTimings::TimerRef accumulateHaloTimer = IpplTimings::getTimer("AccumulateHalo"); + //IpplTimings::startTimer(accumulateHaloTimer); f.accumulateHalo(); - IpplTimings::stopTimer(accumulateHaloTimer); + //IpplTimings::stopTimer(accumulateHaloTimer); } @@ -209,8 +209,8 @@ namespace ippl { { //Inform msg("scatterPIF"); - static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("Scatter"); - IpplTimings::startTimer(scatterTimer); + static IpplTimings::TimerRef scatterPIFTimer = IpplTimings::getTimer("ScatterPIF"); + IpplTimings::startTimer(scatterPIFTimer); using view_type = typename Field::view_type; using vector_type = typename M::vector_type; @@ -281,7 +281,7 @@ namespace ippl { } ); - IpplTimings::stopTimer(scatterTimer); + IpplTimings::stopTimer(scatterPIFTimer); //Kokkos::deep_copy(fview, viewLocal); //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); @@ -300,13 +300,13 @@ namespace ippl { const ParticleAttrib, Properties...>& pp) { - static IpplTimings::TimerRef fillHaloTimer = IpplTimings::getTimer("FillHalo"); - IpplTimings::startTimer(fillHaloTimer); + //static IpplTimings::TimerRef fillHaloTimer = IpplTimings::getTimer("FillHalo"); + //IpplTimings::startTimer(fillHaloTimer); f.fillHalo(); - IpplTimings::stopTimer(fillHaloTimer); + //IpplTimings::stopTimer(fillHaloTimer); - static IpplTimings::TimerRef gatherTimer = IpplTimings::getTimer("Gather"); - IpplTimings::startTimer(gatherTimer); + static IpplTimings::TimerRef gatherPICTimer = IpplTimings::getTimer("GatherPIC"); + IpplTimings::startTimer(gatherPICTimer); const typename Field::view_type view = f.getView(); const M& mesh = f.get_mesh(); @@ -349,7 +349,7 @@ namespace ippl { + whi[0] * whi[1] * whi[2] * view(i, j, k ); } ); - IpplTimings::stopTimer(gatherTimer); + IpplTimings::stopTimer(gatherPICTimer); } template @@ -359,8 +359,8 @@ namespace ippl { const { //Inform msg("gatherPIF"); - static IpplTimings::TimerRef gatherTimer = IpplTimings::getTimer("Gather"); - IpplTimings::startTimer(gatherTimer); + static IpplTimings::TimerRef gatherPIFTimer = IpplTimings::getTimer("GatherPIF"); + IpplTimings::startTimer(gatherPIFTimer); using view_type = typename 
Field::view_type; using vector_type = typename M::vector_type; @@ -444,7 +444,7 @@ namespace ippl { ); - IpplTimings::stopTimer(gatherTimer); + IpplTimings::stopTimer(gatherPIFTimer); } From 9ffde44be7e19ec0b823c473467ba574ea0a8741 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 19 Dec 2022 15:44:30 +0100 Subject: [PATCH 033/117] Twostream instability and Penning trap PinT files added --- alpine/PinT/BumponTailInstabilityPinT.cpp | 717 ++++++++++++++++++++++ alpine/PinT/CMakeLists.txt | 6 + alpine/PinT/ChargedParticlesPinT.hpp | 355 ++++++++++- alpine/PinT/LandauDampingPinT.cpp | 65 +- alpine/PinT/PenningTrapPinT.cpp | 683 +++++++++++++++++++++ 5 files changed, 1790 insertions(+), 36 deletions(-) create mode 100644 alpine/PinT/BumponTailInstabilityPinT.cpp create mode 100644 alpine/PinT/PenningTrapPinT.cpp diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp new file mode 100644 index 000000000..6bc012cbc --- /dev/null +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -0,0 +1,717 @@ +// Parallel-in-time (PinT) method Parareal combined with Particle-in-cell +// and Particle-in-Fourier schemes. The example is electrostatic Landau +// damping. The implementation of Parareal follows the open source implementation +// https://github.com/Parallel-in-Time/PararealF90 by Daniel Ruprecht. The corresponding +// publication is Ruprecht, Daniel. "Shared memory pipelined parareal." +// European Conference on Parallel Processing. Springer, Cham, 2017. +// +// Usage: +// srun ./BumponTailInstability --info 5 +// nmx = No. of Fourier modes in the x-direction +// nmy = No. of Fourier modes in the y-direction +// nmz = No. of Fourier modes in the z-direction +// nx = No. of grid points in the x-direction +// ny = No. of grid points in the y-direction +// nz = No. of grid points in the z-direction +// Np = Total no. of macro-particles in the simulation +// Example: +// srun ./BumponTailInstability 16 16 16 32 32 32 655360 20.0 0.05 0.05 1e-5 100 --info 5 +// +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, +// Jülich Supercomputing Centre, Jülich, Germany. +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . 
+// + +#include "ChargedParticlesPinT.hpp" +#include "StatesBeginSlice.hpp" +#include "StatesEndSlice.hpp" +//#include "LeapFrogPIC.cpp" +//#include "LeapFrogPIF.cpp" +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "Utility/IpplTimings.h" + + +template +struct Newton1D { + + double tol = 1e-12; + int max_iter = 20; + double pi = std::acos(-1.0); + + T k, delta, u; + + KOKKOS_INLINE_FUNCTION + Newton1D() {} + + KOKKOS_INLINE_FUNCTION + Newton1D(const T& k_, const T& delta_, + const T& u_) + : k(k_), delta(delta_), u(u_) {} + + KOKKOS_INLINE_FUNCTION + ~Newton1D() {} + + KOKKOS_INLINE_FUNCTION + T f(T& x) { + T F; + F = x + (delta * (std::sin(k * x) / k)) - u; + return F; + } + + KOKKOS_INLINE_FUNCTION + T fprime(T& x) { + T Fprime; + Fprime = 1 + (delta * std::cos(k * x)); + return Fprime; + } + + KOKKOS_FUNCTION + void solve(T& x) { + int iterations = 0; + while (iterations < max_iter && std::fabs(f(x)) > tol) { + x = x - (f(x)/fprime(x)); + iterations += 1; + } + } +}; + + +template +struct generate_random { + + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + // Output View for the random numbers + view_type x, v; + + // The GeneratorPool + GeneratorPool rand_pool; + + value_type delta, sigma, muBulk, muBeam; + size_type nlocBulk; + + T k, minU, maxU; + + // Initialize all members + generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, + value_type& delta_, T& k_, value_type& sigma_, + value_type& muBulk_, value_type& muBeam_, + size_type& nlocBulk_, T& minU_, T& maxU_) + : x(x_), v(v_), rand_pool(rand_pool_), + delta(delta_), sigma(sigma_), muBulk(muBulk_), muBeam(muBeam_), + nlocBulk(nlocBulk_), k(k_), minU(minU_), maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + bool isBeam = (i >= nlocBulk); + + value_type muZ = (value_type)(((!isBeam) * muBulk) + (isBeam * muBeam)); + + for (unsigned d = 0; d < Dim-1; ++d) { + + x(i)[d] = rand_gen.drand(minU[d], maxU[d]); + v(i)[d] = rand_gen.normal(0.0, sigma); + } + v(i)[Dim-1] = rand_gen.normal(muZ, sigma); + + value_type u = rand_gen.drand(minU[Dim-1], maxU[Dim-1]); + x(i)[Dim-1] = u / (1 + delta); + Newton1D solver(k[Dim-1], delta, u); + solver.solve(x(i)[Dim-1]); + + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + valLError += myValError; + double myValnorm = dot(Qview(i), Qview(i)).apply(); + valLnorm += myValnorm; + }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); + + Kokkos::fence(); + lError = std::sqrt(localError)/std::sqrt(localNorm); + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; + + + double globaltemp = 0.0; + MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + double absError = std::sqrt(globaltemp); + + //temp = 0.0; + //Kokkos::parallel_reduce("Q norm", Q.size(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = dot(Qview(i), Qview(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(temp)); + + + globaltemp = 0.0; + MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + double relError = absError / std::sqrt(globaltemp); + + return relError; + +} + +double computeLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + Kokkos::fence(); + lError = localError/localNorm; + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; + + + double globaltemp = 0.0; + MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + + double absError = globaltemp; + + globaltemp = 0.0; + MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + + double relError = absError / globaltemp; + + return relError; + +} + + +double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { + + auto rhoview = rhoPIF.getView(); + auto rhoprevview = rhoPIFprevIter.getView(); + const int nghost = rhoPIF.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rhoPIF.getLayout(); + const Mesh_t& mesh = rhoPIF.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + double AbsError = 0.0; + double Enorm = 0.0; + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Ex field error", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& errorSum, + double& fieldSum) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + double myError = 0.0; + double myField = 0.0; + Kokkos::complex Ek = {0.0, 0.0}; + Kokkos::complex Ekprev = {0.0, 0.0}; + for(size_t d = 0; d < Dim; ++d) { + if(Dr != 0.0) { + Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); + } + Ekprev = Ekprev - Ek; + myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); + myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } + errorSum += myError; + fieldSum += myField; + //Kokkos::complex rhok = 
rhoview(i+nghost,j+nghost,k+nghost); + //Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); + //rhokprev = rhokprev - rhok; + //myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); + //errorSum += myError; + //myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); + //fieldSum += myField; + + }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); + + Kokkos::fence(); + double globalError = 0.0; + MPI_Allreduce(&AbsError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double globalNorm = 0.0; + MPI_Allreduce(&Enorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //fieldEnergy *= volume; + + double relError = std::sqrt(globalError)/std::sqrt(globalNorm); + + return relError; +} + + +//const char* TestName = "TwoStreamInstability"; +const char* TestName = "BumponTailInstability"; + +int main(int argc, char *argv[]){ + Ippl ippl(argc, argv); + + Inform msg("TestName"); + Inform msg2all("TestName",INFORM_ALL_NODES); + + ippl::Vector nmPIF = { + std::atoi(argv[1]), + std::atoi(argv[2]), + std::atoi(argv[3]) + }; + + ippl::Vector nrPIC = { + std::atoi(argv[4]), + std::atoi(argv[5]), + std::atoi(argv[6]) + }; + + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); + static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); + static IpplTimings::TimerRef timeCommunication = IpplTimings::getTimer("timeCommunication"); + static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); + static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); + static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); + + IpplTimings::startTimer(mainTimer); + + const size_type totalP = std::atoll(argv[7]); + const double tEnd = std::atof(argv[8]); + const double dtSlice = tEnd / Ippl::Comm->size(); + const double dtFine = std::atof(argv[9]); + const double dtCoarse = std::atof(argv[10]); + const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); + const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); + const double tol = std::atof(argv[11]); + const unsigned int maxIter = std::atoi(argv[12]); + + msg << "dtSlice: " << dtSlice + << "dtSlice/dtFine: " << dtSlice / dtFine + << "(int)dtSlice/dtFine: " << (unsigned int)(dtSlice / dtFine) + << endl; + + const double tStartMySlice = Ippl::Comm->rank() * dtSlice; + //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; + + + using bunch_type = ChargedParticlesPinT; + using states_begin_type = StatesBeginSlice; + using states_end_type = StatesEndSlice; + + std::unique_ptr Pcoarse; + std::unique_ptr Pbegin; + std::unique_ptr Pend; + + ippl::NDIndex domainPIC; + ippl::NDIndex domainPIF; + for (unsigned i = 0; i< Dim; i++) { + domainPIC[i] = ippl::Index(nrPIC[i]); + domainPIF[i] = ippl::Index(nmPIF[i]); + } + + ippl::e_dim_tag decomp[Dim]; + for (unsigned d = 0; d < Dim; ++d) { + decomp[d] = ippl::SERIAL; + } + + // create mesh and layout objects for this problem domain + Vector_t kw; + double sigma, muBulk, muBeam, epsilon, delta; + + + if(std::strcmp(TestName,"TwoStreamInstability") == 0) { + // Parameters for two stream instability as in + // 
https://www.frontiersin.org/articles/10.3389/fphy.2018.00105/full + kw = {0.5, 0.5, 0.5}; + sigma = 0.1; + epsilon = 0.5; + muBulk = -pi / 2.0; + muBeam = pi / 2.0; + delta = 0.01; + } + else if(std::strcmp(TestName,"BumponTailInstability") == 0) { + kw = {0.21, 0.21, 0.21}; + sigma = 1.0 / std::sqrt(2.0); + epsilon = 0.1; + muBulk = 0.0; + muBeam = 4.0; + delta = 0.01; + } + else { + //Default value is two stream instability + kw = {0.5, 0.5, 0.5}; + sigma = 0.1; + epsilon = 0.5; + muBulk = -pi / 2.0; + muBeam = pi / 2.0; + delta = 0.01; + } + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw ; + double dxPIC = rmax[0] / nrPIC[0]; + double dyPIC = rmax[1] / nrPIC[1]; + double dzPIC = rmax[2] / nrPIC[2]; + + + double dxPIF = rmax[0] / nmPIF[0]; + double dyPIF = rmax[1] / nmPIF[1]; + double dzPIF = rmax[2] / nmPIF[2]; + Vector_t hrPIC = {dxPIC, dyPIC, dzPIC}; + Vector_t hrPIF = {dxPIF, dyPIF, dzPIF}; + Vector_t origin = {rmin[0], rmin[1], rmin[2]}; + + const bool isAllPeriodic=true; + Mesh_t meshPIC(domainPIC, hrPIC, origin); + Mesh_t meshPIF(domainPIF, hrPIF, origin); + FieldLayout_t FLPIC(domainPIC, decomp, isAllPeriodic); + FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); + PLayout_t PL(FLPIC, meshPIC); + + //Q = -\int\int f dx dv + double Q = -rmax[0] * rmax[1] * rmax[2]; + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); + Pbegin = std::make_unique(PL); + Pend = std::make_unique(PL); + + Pcoarse->nr_m = nrPIC; + + Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); + + Pcoarse->initFFTSolver(); + Pcoarse->time_m = tStartMySlice; + + IpplTimings::startTimer(particleCreation); + + Vector_t minU, maxU; + for (unsigned d = 0; d create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); + + using buffer_type = ippl::Communicate::buffer_type; + int tag; +#ifdef KOKKOS_ENABLE_CUDA + //If we don't do the following even with the same seed the initial + //condition is not the same on different GPUs + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() == 0) { + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, delta, kw, + sigma, muBulk, muBeam, nlocBulk, minU, maxU)); + + + Kokkos::fence(); + size_type bufSize = Pbegin->packedSize(nloc); + std::vector requests(0); + int sends = 0; + for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); + requests.resize(requests.size() + 1); + Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); + buf->resetWritePos(); + ++sends; + } + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + } + else { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + } + Ippl::Comm->barrier(); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); +#else + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); + 
Kokkos::parallel_for(nloc, + generate_random, Dim>( + Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, delta, kw, + sigma, muBulk, muBeam, nlocBulk, minU, maxU)); + + + Kokkos::fence(); + Ippl::Comm->barrier(); +#endif + + + msg << "Parareal Bump on tail instability" + << endl + << "Slice dT: " << dtSlice + << endl + << "No. of fine time steps: " << ntFine + << endl + << "No. of coarse time steps: " << ntCoarse + << endl + << "Tolerance: " << tol + << " Max. iterations: " << maxIter + << endl + << "Np= " << nloc + << " Fourier modes = " << nmPIF + << " Grid points = " << nrPIC + << endl; + + Pcoarse->q = Pcoarse->Q_m/nloc; + IpplTimings::stopTimer(particleCreation); + + msg << "particles created and initial conditions assigned " << endl; + + //Copy initial conditions as they are needed later + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + //Get initial guess for ranks other than 0 by propagating the coarse solver + IpplTimings::startTimer(coarsePropagator); + if (Ippl::Comm->rank() > 0) { + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); + } + + Ippl::Comm->barrier(); + + IpplTimings::stopTimer(coarsePropagator); + + msg << "First Leap frog PIC done " << endl; + + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + //Run the coarse integrator to get the values at the end of the time slice + IpplTimings::startTimer(coarsePropagator); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + IpplTimings::stopTimer(coarsePropagator); + msg << "Second Leap frog PIC done " << endl; + + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + + //The following might not be needed + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + msg << "Starting parareal iterations ..." 
<< endl; + bool isConverged = false; + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); + //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, 0); + //Ippl::Comm->barrier(); + for (unsigned int it=0; itLeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); + IpplTimings::stopTimer(finePropagator); + + + //if(isConverged) { + + //test with the serial solution + //Pcoarse->LeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, it+1); + //Ippl::Comm->barrier(); + //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); + //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); + //msg << "Finished iteration: " << it+1 + //<< " Rerror: " << Rerror + //<< " Perror: " << Perror + //<< endl; + // break; + //} + + //Difference = Fine - Coarse + Pend->R = Pbegin->R - Pcoarse->R; + Pend->P = Pbegin->P - Pcoarse->P; + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + IpplTimings::startTimer(timeCommunication); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if(Ippl::Comm->rank() > 0) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + } + else { + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + } + IpplTimings::stopTimer(timeCommunication); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + + IpplTimings::startTimer(coarsePropagator); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + IpplTimings::stopTimer(coarsePropagator); + + Pend->R = Pend->R + Pcoarse->R; + Pend->P = Pend->P + Pcoarse->P; + + IpplTimings::startTimer(timeCommunication); + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + IpplTimings::stopTimer(timeCommunication); + + //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; + //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); + //double absFieldError = std::sqrt(Pcoarse->rhoPIC_m.sum()); + //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPIC_m, Pcoarse->EfieldPIC_m); + //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); + //double EfieldError = absFieldError / EfieldNorm; + + IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //double Rerror = 
computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + //double Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + double EfieldError = 0; + if(it > 0) { + EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + } + IpplTimings::stopTimer(computeErrors); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); + IpplTimings::stopTimer(deepCopy); + + msg << "Finished iteration: " << it+1 + << " Rerror: " << Rerror + << " Perror: " << Perror + << " Efield error: " << EfieldError + //<< " Rhofield error: " << EfieldError + << endl; + + IpplTimings::startTimer(dumpData); + Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, it+1); + IpplTimings::stopTimer(dumpData); + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + + if((Rerror <= tol) && (Perror <= tol)) { + //if(Perror <= tol) { + break; + } + } + + msg << "Twostream instability Parareal: End." << endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + + return 0; +} diff --git a/alpine/PinT/CMakeLists.txt b/alpine/PinT/CMakeLists.txt index f73338484..73976aa27 100644 --- a/alpine/PinT/CMakeLists.txt +++ b/alpine/PinT/CMakeLists.txt @@ -16,6 +16,12 @@ set (COMPILE_FLAGS ${OPAL_CXX_FLAGS}) add_executable (LandauDampingPinT LandauDampingPinT.cpp) target_link_libraries (LandauDampingPinT ${IPPL_LIBS}) +add_executable (BumponTailInstabilityPinT BumponTailInstabilityPinT.cpp) +target_link_libraries (BumponTailInstabilityPinT ${IPPL_LIBS}) + +add_executable (PenningTrapPinT PenningTrapPinT.cpp) +target_link_libraries (PenningTrapPinT ${IPPL_LIBS}) + # vi: set et ts=4 sw=4 sts=4: # Local Variables: diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 59f8aeaed..166ea1d29 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -217,36 +217,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { - void dumpLandau(size_type /*totalP*/, const unsigned int& iter) { + void dumpLandau(const unsigned int& iter) { double fieldEnergy = 0.0; double ExAmp = 0.0; - //auto Eview = E.getView(); - //double temp = 0.0; - - //Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = Eview(i)[0] * Eview(i)[0]; - // valL += myVal; - // }, Kokkos::Sum(temp)); - - ////double globaltemp = 0.0; - //double globaltemp = temp; - ////MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //fieldEnergy = globaltemp * volume / totalP ; - - //double tempMax = 0.0; - //Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), - // KOKKOS_LAMBDA(const size_t i, double& valL) - // { - // double myVal = std::fabs(Eview(i)[0]); - // if(myVal > valL) valL = myVal; - // }, Kokkos::Max(tempMax)); - ////ExAmp = 0.0; - //ExAmp = tempMax; - ////MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); auto rhoview = rhoPIF_m.getView(); const int nghost = rhoPIF_m.getNghost(); @@ -320,15 +295,98 @@ class ChargedParticlesPinT : public ippl::ParticleBase { csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); - //if(time_m == 0.0) { - // csvout << "time, 
Ex_field_energy, Ex_max_norm" << endl; - //} csvout << time_m << " " << fieldEnergy << " " << ExAmp << endl; } + void dumpBumponTail(const unsigned int& iter) { + + + double fieldEnergy = 0.0; + double EzAmp = 0.0; + + auto rhoview = rhoPIF_m.getView(); + const int nghost = rhoPIF_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rhoPIF_m.getLayout(); + const Mesh_t& mesh = rhoPIF_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Ez energy and Max", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& tlSum, + double& tlMax) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + if(Dr != 0.0) { + Ek = -(imag * kVec[2] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + } + double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + + tlSum += myVal; + + double myValMax = std::sqrt(myVal); + + if(myValMax > tlMax) tlMax = myValMax; + + }, Kokkos::Sum(fieldEnergy), Kokkos::Max(EzAmp)); + + + Kokkos::fence(); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + fieldEnergy *= volume; + + + std::stringstream fname; + fname << "data/FieldBumponTail_"; + fname << Ippl::Comm->rank(); + fname << "_iter_"; + fname << iter; + fname << ".csv"; + + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + + csvout << time_m << " " + << fieldEnergy << " " + << EzAmp << endl; + } + + + void dumpEnergy(size_type /*totalP*/, const unsigned int& iter, ParticleAttrib& Ptemp) { @@ -441,6 +499,28 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } + void writelocalError(double Rerror, double Perror, unsigned int iter) { + + std::stringstream fname; + fname << "data/localError_"; + fname << Ippl::Comm->rank(); + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(iter == 1) { + csvout << "Iter, Rerror, Perror" << endl; + } + + csvout << iter << " " + << Rerror << " " + << Perror << endl; + + } + + void writeError(double Rerror, double Perror, unsigned int iter) { if(Ippl::Comm->rank() == 0) { @@ -570,6 +650,113 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } + void BorisPIC(ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, const unsigned int nt, + const double dt, const double& tStartMySlice, const double& Bext) { + + static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); + PLayout& PL = this->getLayout(); + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); + rhoPIC_m = 0.0; + scatter(q, rhoPIC_m, Rtemp); + + rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); + rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); + + //Field solve + solver_mp->solve(); + + // gather E field + gather(E, EfieldPIC_m, Rtemp); 
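+        // Boris-type time stepping (used as the coarse Particle-in-Cell propagator for
+        // the Penning trap): each step below applies a first half kick from the total
+        // electric field, i.e. the self-consistent PIC field gathered above plus the
+        // analytic external trap field (linear in the displacement from the domain
+        // centre, with strength set by V0), together with the magnetic force from the
+        // uniform axial field Bext; then a full position drift R += dt * P; then a
+        // charge re-deposit and field re-solve; and finally a rotation-corrected second
+        // half kick with DrInv = 1 / (1 + (0.5 * dt * Bext)^2), alpha = -0.5 * dt being
+        // the half-step kick factor.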
+ + time_m = tStartMySlice; + + //dumpLandauPIC(); + double alpha = -0.5 * dt; + double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); + Vector_t rmax = rmax_m; + + for (unsigned int it=0; itgetLocalNum(), + KOKKOS_LAMBDA(const size_t j){ + double Eext_x = -(Rview(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); + + Eview(j)[0] += Eext_x; + Eview(j)[1] += Eext_y; + Eview(j)[2] += Eext_z; + + Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eview(j)[2]; + }); + + //drift + Rtemp = Rtemp + dt * Ptemp; + + //Apply particle BC + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); + + //scatter the charge onto the underlying grid + rhoPIC_m = 0.0; + scatter(q, rhoPIC_m, Rtemp); + + + rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); + rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); + + //Field solve + IpplTimings::startTimer(fieldSolvePIC); + solver_mp->solve(); + IpplTimings::stopTimer(fieldSolvePIC); + + // gather E field + gather(E, EfieldPIC_m, Rtemp); + + //kick + auto R2view = Rtemp.getView(); + auto P2view = Ptemp.getView(); + auto E2view = E.getView(); + Kokkos::parallel_for("Kick2", this->getLocalNum(), + KOKKOS_LAMBDA(const size_t j){ + double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); + + E2view(j)[0] += Eext_x; + E2view(j)[1] += Eext_y; + E2view(j)[2] += Eext_z; + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] + + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] + - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); + P2view(j)[2] += alpha * E2view(j)[2]; + }); + + time_m += dt; + //dumpLandauPIC(); + } + + } + + + void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, const double& dt, const bool& /*isConverged*/, @@ -591,7 +778,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); - dumpLandau(this->getLocalNum(), iter); + //dumpLandau(iter); + dumpBumponTail(iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); IpplTimings::stopTimer(dumpData); } @@ -623,7 +811,112 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; IpplTimings::startTimer(dumpData); - dumpLandau(this->getLocalNum(), iter); + //dumpLandau(iter); + dumpBumponTail(iter); + dumpEnergy(this->getLocalNum(), iter, Ptemp); + IpplTimings::stopTimer(dumpData); + + } + } + + + void BorisPIF(ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, const unsigned int& nt, + const double& dt, const bool& /*isConverged*/, + const double& tStartMySlice, const unsigned int& iter, const double& Bext) { + + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + PLayout& PL = this->getLayout(); + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); + rhoPIF_m = {0.0, 0.0}; + scatterPIF(q, rhoPIF_m, Rtemp); + + rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); + + // Solve for and gather E field + gatherPIF(E, rhoPIF_m, 
Rtemp); + + time_m = tStartMySlice; + + if((time_m == 0.0)) { + IpplTimings::startTimer(dumpData); + dumpEnergy(this->getLocalNum(), iter, Ptemp); + IpplTimings::stopTimer(dumpData); + } + double alpha = -0.5 * dt; + double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); + Vector_t rmax = rmax_m; + for (unsigned int it=0; itgetLocalNum(), + KOKKOS_LAMBDA(const size_t j){ + double Eext_x = -(Rview(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); + + Eview(j)[0] += Eext_x; + Eview(j)[1] += Eext_y; + Eview(j)[2] += Eext_z; + + Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eview(j)[2]; + }); + + //drift + Rtemp = Rtemp + dt * Ptemp; + + //Apply particle BC + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //checkBounds(Rtemp); + + //scatter the charge onto the underlying grid + rhoPIF_m = {0.0, 0.0}; + scatterPIF(q, rhoPIF_m, Rtemp); + + rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); + + // Solve for and gather E field + gatherPIF(E, rhoPIF_m, Rtemp); + + //kick + auto R2view = Rtemp.getView(); + auto P2view = Ptemp.getView(); + auto E2view = E.getView(); + Kokkos::parallel_for("Kick2", this->getLocalNum(), + KOKKOS_LAMBDA(const size_t j){ + double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); + + E2view(j)[0] += Eext_x; + E2view(j)[1] += Eext_y; + E2view(j)[2] += Eext_z; + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] + + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] + - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); + P2view(j)[2] += alpha * E2view(j)[2]; + }); + + time_m += dt; + + IpplTimings::startTimer(dumpData); dumpEnergy(this->getLocalNum(), iter, Ptemp); IpplTimings::stopTimer(dumpData); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 3b3aabee0..01b10d334 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -137,7 +137,7 @@ struct generate_random { double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/) { + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -154,7 +154,9 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Abs. Error: " << localError << std::endl; + lError = std::sqrt(localError)/std::sqrt(localNorm); + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; + double globaltemp = 0.0; MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); @@ -178,6 +180,48 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr } +double computeLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + Kokkos::fence(); + lError = localError/localNorm; + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; + + + double globaltemp = 0.0; + MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + + double absError = globaltemp; + + globaltemp = 0.0; + MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + + double relError = absError / globaltemp; + + return relError; + +} + + double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { auto rhoview = rhoPIF.getView(); @@ -460,9 +504,11 @@ int main(int argc, char *argv[]){ if (Ippl::Comm->rank() > 0) { Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); } + + Ippl::Comm->barrier(); + IpplTimings::stopTimer(coarsePropagator); - Ippl::Comm->barrier(); msg << "First Leap frog PIC done " << endl; @@ -489,6 +535,11 @@ int main(int argc, char *argv[]){ msg << "Starting parareal iterations ..." 
<< endl; bool isConverged = false; + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); + //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, 0); + //Ippl::Comm->barrier(); for (unsigned int it=0; itR, Pcoarse->RprevIter, it+1, Ippl::Comm->rank()); - double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank()); + double localRerror, localPerror; + double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //double Rerror = computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + //double Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); double EfieldError = 0; if(it > 0) { @@ -589,6 +643,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, it+1); IpplTimings::stopTimer(dumpData); //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp new file mode 100644 index 000000000..962698396 --- /dev/null +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -0,0 +1,683 @@ +// Parallel-in-time (PinT) method Parareal combined with Particle-in-cell +// and Particle-in-Fourier schemes. The example is electrostatic Landau +// damping. The implementation of Parareal follows the open source implementation +// https://github.com/Parallel-in-Time/PararealF90 by Daniel Ruprecht. The corresponding +// publication is Ruprecht, Daniel. "Shared memory pipelined parareal." +// European Conference on Parallel Processing. Springer, Cham, 2017. +// +// Usage: +// srun ./PenningTrap --info 5 +// nmx = No. of Fourier modes in the x-direction +// nmy = No. of Fourier modes in the y-direction +// nmz = No. of Fourier modes in the z-direction +// nx = No. of grid points in the x-direction +// ny = No. of grid points in the y-direction +// nz = No. of grid points in the z-direction +// Np = Total no. of macro-particles in the simulation +// Example: +// srun ./PenningTrap 16 16 16 32 32 32 655360 20.0 0.05 0.05 1e-5 100 --info 5 +// +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, +// Jülich Supercomputing Centre, Jülich, Germany. +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see . 
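+//
+// For reference, the remaining positional arguments consumed by main() below
+// (cf. the example command line above) are:
+//     tEnd     = total simulation time; it is divided into one time slice per MPI rank
+//     dtFine   = time step of the fine (Particle-in-Fourier) propagator
+//     dtCoarse = time step of the coarse (Particle-in-Cell) propagator
+//     tol      = Parareal convergence tolerance on the particle position (R) and
+//                velocity (P) errors between successive iterations
+//     maxIter  = maximum number of Parareal iterations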
+// + +#include "ChargedParticlesPinT.hpp" +#include "StatesBeginSlice.hpp" +#include "StatesEndSlice.hpp" +//#include "LeapFrogPIC.cpp" +//#include "LeapFrogPIF.cpp" +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "Utility/IpplTimings.h" + +template +struct Newton1D { + + double tol = 1e-12; + int max_iter = 20; + double pi = std::acos(-1.0); + + T mu, sigma, u; + + KOKKOS_INLINE_FUNCTION + Newton1D() {} + + KOKKOS_INLINE_FUNCTION + Newton1D(const T& mu_, const T& sigma_, + const T& u_) + : mu(mu_), sigma(sigma_), u(u_) {} + + KOKKOS_INLINE_FUNCTION + ~Newton1D() {} + + KOKKOS_INLINE_FUNCTION + T f(T& x) { + T F; + F = std::erf((x - mu)/(sigma * std::sqrt(2.0))) + - 2 * u + 1; + return F; + } + + KOKKOS_INLINE_FUNCTION + T fprime(T& x) { + T Fprime; + Fprime = (1 / sigma) * std::sqrt(2 / pi) * + std::exp(-0.5 * (std::pow(((x - mu) / sigma),2))); + return Fprime; + } + + KOKKOS_FUNCTION + void solve(T& x) { + int iterations = 0; + while ((iterations < max_iter) && (std::fabs(f(x)) > tol)) { + x = x - (f(x)/fprime(x)); + iterations += 1; + } + } +}; + + +template +struct generate_random { + + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + // Output View for the random numbers + view_type x, v; + + // The GeneratorPool + GeneratorPool rand_pool; + + T mu, sigma, minU, maxU; + + double pi = std::acos(-1.0); + + // Initialize all members + generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, + T& mu_, T& sigma_, T& minU_, T& maxU_) + : x(x_), v(v_), rand_pool(rand_pool_), + mu(mu_), sigma(sigma_), minU(minU_), maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + value_type u; + for (unsigned d = 0; d < Dim; ++d) { + u = rand_gen.drand(minU[d], maxU[d]); + x(i)[d] = (std::sqrt(pi / 2) * (2 * u - 1)) * + sigma[d] + mu[d]; + Newton1D solver(mu[d], sigma[d], u); + solver.solve(x(i)[d]); + v(i)[d] = rand_gen.normal(0.0, 1.0); + } + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + + +double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + valLError += myValError; + double myValnorm = dot(Qview(i), Qview(i)).apply(); + valLnorm += myValnorm; + }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); + + Kokkos::fence(); + lError = std::sqrt(localError)/std::sqrt(localNorm); + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; + + + double globaltemp = 0.0; + MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + double absError = std::sqrt(globaltemp); + + //temp = 0.0; + //Kokkos::parallel_reduce("Q norm", Q.size(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = dot(Qview(i), Qview(i)).apply(); + // valL += myVal; + // }, Kokkos::Sum(temp)); + + + globaltemp = 0.0; + MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + + double relError = absError / std::sqrt(globaltemp); + + return relError; + +} + +double computeLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + Kokkos::fence(); + lError = localError/localNorm; + //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; + + + double globaltemp = 0.0; + MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + + double absError = globaltemp; + + globaltemp = 0.0; + MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + + double relError = absError / globaltemp; + + return relError; + +} + + +double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { + + auto rhoview = rhoPIF.getView(); + auto rhoprevview = rhoPIFprevIter.getView(); + const int nghost = rhoPIF.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rhoPIF.getLayout(); + const Mesh_t& mesh = rhoPIF.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + double AbsError = 0.0; + double Enorm = 0.0; + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Ex field error", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& errorSum, + double& fieldSum) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + double myError = 0.0; + double myField = 0.0; + Kokkos::complex Ek = {0.0, 0.0}; + Kokkos::complex Ekprev = {0.0, 0.0}; + for(size_t d = 0; d < Dim; ++d) { + if(Dr != 0.0) { + Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); + } + Ekprev = Ekprev - Ek; + myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); + myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } + errorSum += myError; + fieldSum += myField; + //Kokkos::complex rhok = 
rhoview(i+nghost,j+nghost,k+nghost); + //Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); + //rhokprev = rhokprev - rhok; + //myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); + //errorSum += myError; + //myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); + //fieldSum += myField; + + }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); + + Kokkos::fence(); + double globalError = 0.0; + MPI_Allreduce(&AbsError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double globalNorm = 0.0; + MPI_Allreduce(&Enorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //fieldEnergy *= volume; + + double relError = std::sqrt(globalError)/std::sqrt(globalNorm); + + return relError; +} + + +const char* TestName = "PenningTrap"; + +int main(int argc, char *argv[]){ + Ippl ippl(argc, argv); + + Inform msg(TestName); + Inform msg2all(TestName,INFORM_ALL_NODES); + + ippl::Vector nmPIF = { + std::atoi(argv[1]), + std::atoi(argv[2]), + std::atoi(argv[3]) + }; + + ippl::Vector nrPIC = { + std::atoi(argv[4]), + std::atoi(argv[5]), + std::atoi(argv[6]) + }; + + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); + static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); + static IpplTimings::TimerRef timeCommunication = IpplTimings::getTimer("timeCommunication"); + static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); + static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); + static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); + + IpplTimings::startTimer(mainTimer); + + const size_type totalP = std::atoll(argv[7]); + const double tEnd = std::atof(argv[8]); + const double dtSlice = tEnd / Ippl::Comm->size(); + const double dtFine = std::atof(argv[9]); + const double dtCoarse = std::atof(argv[10]); + const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); + const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); + const double tol = std::atof(argv[11]); + const unsigned int maxIter = std::atoi(argv[12]); + + msg << "dtSlice: " << dtSlice + << "dtSlice/dtFine: " << dtSlice / dtFine + << "(int)dtSlice/dtFine: " << (unsigned int)(dtSlice / dtFine) + << endl; + + const double tStartMySlice = Ippl::Comm->rank() * dtSlice; + //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; + + + using bunch_type = ChargedParticlesPinT; + using states_begin_type = StatesBeginSlice; + using states_end_type = StatesEndSlice; + + std::unique_ptr Pcoarse; + std::unique_ptr Pbegin; + std::unique_ptr Pend; + + ippl::NDIndex domainPIC; + ippl::NDIndex domainPIF; + for (unsigned i = 0; i< Dim; i++) { + domainPIC[i] = ippl::Index(nrPIC[i]); + domainPIF[i] = ippl::Index(nmPIF[i]); + } + + ippl::e_dim_tag decomp[Dim]; + for (unsigned d = 0; d < Dim; ++d) { + decomp[d] = ippl::SERIAL; + } + + // create mesh and layout objects for this problem domain + Vector_t rmin(0.0); + Vector_t rmax(20.0); + double dxPIC = rmax[0] / nrPIC[0]; + double dyPIC = rmax[1] / nrPIC[1]; + double dzPIC = rmax[2] / nrPIC[2]; + + Vector_t length = rmax - rmin; + + Vector_t mu, sd; + + for (unsigned d = 0; d(PL,hrPIC,rmin,rmax,decomp,Q); + Pbegin = 
std::make_unique(PL); + Pend = std::make_unique(PL); + + Pcoarse->nr_m = nrPIC; + + Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); + + Pcoarse->initFFTSolver(); + Pcoarse->time_m = tStartMySlice; + + IpplTimings::startTimer(particleCreation); + + Vector_t minU, maxU; + for (unsigned d = 0; d create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); + + using buffer_type = ippl::Communicate::buffer_type; + int tag; +#ifdef KOKKOS_ENABLE_CUDA + //If we don't do the following even with the same seed the initial + //condition is not the same on different GPUs + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() == 0) { + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, + minU, maxU)); + + + Kokkos::fence(); + size_type bufSize = Pbegin->packedSize(nloc); + std::vector requests(0); + int sends = 0; + for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); + requests.resize(requests.size() + 1); + Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); + buf->resetWritePos(); + ++sends; + } + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + } + else { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + } + Ippl::Comm->barrier(); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); +#else + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + Pcoarse->R.getView(), Pcoarse->P.getView(), rand_pool64, mu, sd, + minU, maxU)); + + Kokkos::fence(); + Ippl::Comm->barrier(); +#endif + + + msg << "Parareal Penning trap" + << endl + << "Slice dT: " << dtSlice + << endl + << "No. of fine time steps: " << ntFine + << endl + << "No. of coarse time steps: " << ntCoarse + << endl + << "Tolerance: " << tol + << " Max. 
iterations: " << maxIter + << endl + << "Np= " << nloc + << " Fourier modes = " << nmPIF + << " Grid points = " << nrPIC + << endl; + + Pcoarse->q = Pcoarse->Q_m/nloc; + IpplTimings::stopTimer(particleCreation); + + msg << "particles created and initial conditions assigned " << endl; + + //Copy initial conditions as they are needed later + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + //Get initial guess for ranks other than 0 by propagating the coarse solver + IpplTimings::startTimer(coarsePropagator); + if (Ippl::Comm->rank() > 0) { + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); + } + + Ippl::Comm->barrier(); + + IpplTimings::stopTimer(coarsePropagator); + + msg << "First Boris PIC done " << endl; + + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + //Run the coarse integrator to get the values at the end of the time slice + IpplTimings::startTimer(coarsePropagator); + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); + IpplTimings::stopTimer(coarsePropagator); + msg << "Second Boris PIC done " << endl; + + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + + //The following might not be needed + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + msg << "Starting parareal iterations ..." 
<< endl; + bool isConverged = false; + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); + //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, 0); + //Ippl::Comm->barrier(); + for (unsigned int it=0; itBorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1, Bext); + IpplTimings::stopTimer(finePropagator); + + + //if(isConverged) { + + //test with the serial solution + //Pcoarse->LeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, it+1); + //Ippl::Comm->barrier(); + //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); + //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); + //msg << "Finished iteration: " << it+1 + //<< " Rerror: " << Rerror + //<< " Perror: " << Perror + //<< endl; + // break; + //} + + //Difference = Fine - Coarse + Pend->R = Pbegin->R - Pcoarse->R; + Pend->P = Pbegin->P - Pcoarse->P; + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + IpplTimings::startTimer(timeCommunication); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if(Ippl::Comm->rank() > 0) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + } + else { + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + } + IpplTimings::stopTimer(timeCommunication); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + + IpplTimings::startTimer(coarsePropagator); + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); + IpplTimings::stopTimer(coarsePropagator); + + Pend->R = Pend->R + Pcoarse->R; + Pend->P = Pend->P + Pcoarse->P; + + IpplTimings::startTimer(timeCommunication); + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + IpplTimings::stopTimer(timeCommunication); + + //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; + //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); + //double absFieldError = std::sqrt(Pcoarse->rhoPIC_m.sum()); + //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPIC_m, Pcoarse->EfieldPIC_m); + //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); + //double EfieldError = absFieldError / EfieldNorm; + + IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //double Rerror = 
computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + //double Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + double EfieldError = 0; + if(it > 0) { + EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + } + IpplTimings::stopTimer(computeErrors); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); + IpplTimings::stopTimer(deepCopy); + + msg << "Finished iteration: " << it+1 + << " Rerror: " << Rerror + << " Perror: " << Perror + << " Efield error: " << EfieldError + //<< " Rhofield error: " << EfieldError + << endl; + + IpplTimings::startTimer(dumpData); + Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, it+1); + IpplTimings::stopTimer(dumpData); + //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + + if((Rerror <= tol) && (Perror <= tol)) { + //if(Perror <= tol) { + break; + } + } + + msg << "Penning trap Parareal: End." << endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + + return 0; +} From cdb062dea481ab0734b189f92a26cb5c760da819 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 20 Dec 2022 12:35:10 +0100 Subject: [PATCH 034/117] Penningtrap IC generation fixed with CDF --- alpine/ElectrostaticPIC/PenningTrap.cpp | 6 +++--- alpine/PinT/BumponTailInstabilityPinT.cpp | 14 ++++++++++++-- alpine/PinT/ChargedParticlesPinT.hpp | 1 + alpine/PinT/LandauDampingPinT.cpp | 10 ++++++++-- alpine/PinT/PenningTrapPinT.cpp | 10 +++++++--- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/alpine/ElectrostaticPIC/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp index fbbdfd3d9..9ea440176 100644 --- a/alpine/ElectrostaticPIC/PenningTrap.cpp +++ b/alpine/ElectrostaticPIC/PenningTrap.cpp @@ -213,9 +213,9 @@ int main(int argc, char *argv[]){ Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; - unsigned int nrMax = 2048;// Max grid size in our studies - double dxFinest = rmax[0] / nrMax; - const double dt = 0.5 * dxFinest;//size of timestep + //unsigned int nrMax = 2048;// Max grid size in our studies + //double dxFinest = rmax[0] / nrMax; + const double dt = 0.05;//0.5 * dxFinest;//size of timestep const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 6bc012cbc..4df725214 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -147,6 +147,14 @@ struct generate_random { } }; +double CDF(const double& x, const double& delta, const double& k, + const unsigned& dim) { + + bool isDimZ = (dim == (Dim-1)); + double cdf = x + (double)(isDimZ * ((delta / k) * std::sin(k * x))); + return cdf; +} + double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { @@ -462,8 +470,10 @@ int main(int argc, char *argv[]){ Vector_t minU, maxU; for (unsigned d = 0; d { double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); Vector_t rmax = rmax_m; + for (unsigned int it=0; it& Q, ParticleAttrib& QprevIter, const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { @@ -431,8 +435,10 @@ int main(int argc, char *argv[]){ Vector_t minU, maxU; for (unsigned d = 0; d & Q, 
ParticleAttrib& QprevIter, const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { @@ -396,7 +401,6 @@ int main(int argc, char *argv[]){ sd[2] = 0.20*length[2]; - double dxPIF = rmax[0] / nmPIF[0]; double dyPIF = rmax[1] / nmPIF[1]; double dzPIF = rmax[2] / nmPIF[2]; @@ -433,8 +437,8 @@ int main(int argc, char *argv[]){ Vector_t minU, maxU; for (unsigned d = 0; d Date: Wed, 21 Dec 2022 09:01:58 +0100 Subject: [PATCH 035/117] ceil used for number of coarse and fine time steps --- alpine/PinT/BumponTailInstabilityPinT.cpp | 21 +++++++++------------ alpine/PinT/LandauDampingPinT.cpp | 4 ++-- alpine/PinT/PenningTrapPinT.cpp | 4 ++-- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 4df725214..ebb8d7abd 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -324,14 +324,14 @@ double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { } -//const char* TestName = "TwoStreamInstability"; -const char* TestName = "BumponTailInstability"; +const char* TestName = "TwoStreamInstability"; +//const char* TestName = "BumponTailInstability"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); - Inform msg("TestName"); - Inform msg2all("TestName",INFORM_ALL_NODES); + Inform msg(TestName); + Inform msg2all(TestName,INFORM_ALL_NODES); ippl::Vector nmPIF = { std::atoi(argv[1]), @@ -361,15 +361,11 @@ int main(int argc, char *argv[]){ const double dtSlice = tEnd / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); - const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); - const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); + const unsigned int ntFine = std::ceil(dtSlice / dtFine); + const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); const unsigned int maxIter = std::atoi(argv[12]); - msg << "dtSlice: " << dtSlice - << "dtSlice/dtFine: " << dtSlice / dtFine - << "(int)dtSlice/dtFine: " << (unsigned int)(dtSlice / dtFine) - << endl; const double tStartMySlice = Ippl::Comm->rank() * dtSlice; //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; @@ -537,7 +533,8 @@ int main(int argc, char *argv[]){ #endif - msg << "Parareal Bump on tail instability" + msg << "Parareal " + << TestName << endl << "Slice dT: " << dtSlice << endl @@ -718,7 +715,7 @@ int main(int argc, char *argv[]){ } } - msg << "Twostream instability Parareal: End." << endl; + msg << TestName << " Parareal: End." 
<< endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index ea6a987b0..48a85bb7f 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -345,8 +345,8 @@ int main(int argc, char *argv[]){ const double dtSlice = tEnd / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); - const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); - const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); + const unsigned int ntFine = std::ceil(dtSlice / dtFine); + const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); const unsigned int maxIter = std::atoi(argv[12]); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index e1335e74f..bf9ab6586 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -348,8 +348,8 @@ int main(int argc, char *argv[]){ const double dtSlice = tEnd / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); - const unsigned int ntFine = (unsigned int)(dtSlice / dtFine); - const unsigned int ntCoarse = (unsigned int)(dtSlice / dtCoarse); + const unsigned int ntFine = std::ceil(dtSlice / dtFine); + const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); const unsigned int maxIter = std::atoi(argv[12]); From a7112e7b82b8c34bf94a506addb5f2101a1633aa Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 23 Dec 2022 14:45:00 +0100 Subject: [PATCH 036/117] Stopping criteria changed in all mini-apps --- alpine/PinT/BumponTailInstabilityPinT.cpp | 104 ++++++++----------- alpine/PinT/ChargedParticlesPinT.hpp | 8 +- alpine/PinT/LandauDampingPinT.cpp | 116 +++++++++++----------- alpine/PinT/PenningTrapPinT.cpp | 102 ++++++++----------- src/Ippl.cpp | 2 +- 5 files changed, 143 insertions(+), 189 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index ebb8d7abd..fe3ce5c7a 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -226,15 +226,16 @@ double computeLinfError(ParticleAttrib& Q, ParticleAttrib& Q //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; - double globaltemp = 0.0; - MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + //double globaltemp = 0.0; + //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - double absError = globaltemp; + //double absError = globaltemp; - globaltemp = 0.0; - MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + //globaltemp = 0.0; + //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - double relError = absError / globaltemp; + //double relError = absError / globaltemp; + double relError = lError; return relError; @@ -330,7 +331,7 @@ const char* TestName = "TwoStreamInstability"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); - Inform msg(TestName); + Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); ippl::Vector nmPIF = { @@ -468,8 +469,6 @@ int main(int argc, char *argv[]){ for (unsigned d = 0; d RprevIter.getView(), Pcoarse->R0.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); - //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, 0); - //Ippl::Comm->barrier(); + bool isPreviousDomainConverged; + if(Ippl::Comm->rank() == 0) { + isPreviousDomainConverged = true; + } + else { + isPreviousDomainConverged = false; + } for (unsigned int it=0; itLeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, it+1); - //Ippl::Comm->barrier(); - //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); - //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); - //msg << "Finished iteration: " << it+1 - //<< " Rerror: " << Rerror - //<< " Perror: " << Perror - //<< endl; - // break; - //} - //Difference = Fine - Coarse Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; @@ -636,20 +622,25 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(timeCommunication); tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() > 0) { + if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); - } - else { - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, + Ippl::getComm(), MPI_STATUS_IGNORE); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); } IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); @@ -661,6 +652,18 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + 
IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + IpplTimings::stopTimer(computeErrors); + + if((Rerror <= tol) && (Perror <= tol)) { + isConverged = true; + } + + IpplTimings::startTimer(timeCommunication); if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { size_type bufSize = Pend->packedSize(nloc); @@ -669,52 +672,27 @@ int main(int argc, char *argv[]){ Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); + MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); } IpplTimings::stopTimer(timeCommunication); - //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; - //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); - //double absFieldError = std::sqrt(Pcoarse->rhoPIC_m.sum()); - //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPIC_m, Pcoarse->EfieldPIC_m); - //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); - //double EfieldError = absFieldError / EfieldNorm; - - IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //double Rerror = computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - //double Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - - double EfieldError = 0; - if(it > 0) { - EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - } - IpplTimings::stopTimer(computeErrors); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); - IpplTimings::stopTimer(deepCopy); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror - << " Efield error: " << EfieldError - //<< " Rhofield error: " << EfieldError << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writeError(Rerror, Perror, it+1); + //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, it+1); IpplTimings::stopTimer(dumpData); - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - if((Rerror <= tol) && (Perror <= tol)) { - //if(Perror <= tol) { + if(isConverged && isPreviousDomainConverged) { break; } } + Ippl::Comm->barrier(); msg << TestName << " Parareal: End." 
<< endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 7cad84807..c1bd3c611 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -779,8 +779,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); - //dumpLandau(iter); - dumpBumponTail(iter); + dumpLandau(iter); + //dumpBumponTail(iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); IpplTimings::stopTimer(dumpData); } @@ -812,8 +812,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; IpplTimings::startTimer(dumpData); - //dumpLandau(iter); - dumpBumponTail(iter); + dumpLandau(iter); + //dumpBumponTail(iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); IpplTimings::stopTimer(dumpData); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 48a85bb7f..1fe18d756 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -211,15 +211,16 @@ double computeLinfError(ParticleAttrib& Q, ParticleAttrib& Q //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; - double globaltemp = 0.0; - MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + //double globaltemp = 0.0; + //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - double absError = globaltemp; + //double absError = globaltemp; - globaltemp = 0.0; - MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + //globaltemp = 0.0; + //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - double relError = absError / globaltemp; + //double relError = absError / globaltemp; + double relError = lError; return relError; @@ -314,8 +315,8 @@ const char* TestName = "LandauDampingPinT"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); - Inform msg("LandauDampingPinT"); - Inform msg2all("LandauDampingPinT",INFORM_ALL_NODES); + Inform msg(TestName, Ippl::Comm->size()-1); + Inform msg2all(TestName,INFORM_ALL_NODES); ippl::Vector nmPIF = { std::atoi(argv[1]), @@ -353,7 +354,8 @@ int main(int argc, char *argv[]){ const double tStartMySlice = Ippl::Comm->rank() * dtSlice; //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; - msg << "Parareal Landau damping" + msg << "Parareal " + << TestName << endl << "Slice dT: " << dtSlice << endl @@ -530,7 +532,6 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(coarsePropagator); msg << "Second Leap frog PIC done " << endl; - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); //The following might not be needed IpplTimings::startTimer(deepCopy); @@ -540,12 +541,21 @@ int main(int argc, char *argv[]){ msg << "Starting parareal iterations ..." 
<< endl; - bool isConverged = false; //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, // dtFine, isConverged, tStartMySlice, 0); //Ippl::Comm->barrier(); + bool isConverged = false; + bool isPreviousDomainConverged; + if(Ippl::Comm->rank() == 0) { + isPreviousDomainConverged = true; + } + else { + isPreviousDomainConverged = false; + } + + //unsigned int maxIterRank; for (unsigned int it=0; itLeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, it+1); - //Ippl::Comm->barrier(); - //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); - //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); - //msg << "Finished iteration: " << it+1 - //<< " Rerror: " << Rerror - //<< " Perror: " << Perror - //<< endl; - // break; - //} - //Difference = Fine - Coarse Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; @@ -580,20 +575,25 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(timeCommunication); tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() > 0) { + if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); - } - else { - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, + Ippl::getComm(), MPI_STATUS_IGNORE); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); } IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); @@ -605,6 +605,22 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + //double EfieldError = 0; + //if(it > 0) { + // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + //} + IpplTimings::stopTimer(computeErrors); + + if((Rerror <= tol) && (Perror <= tol)) { + isConverged = true; + } + + IpplTimings::startTimer(timeCommunication); if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { size_type bufSize = Pend->packedSize(nloc); @@ -613,53 +629,33 @@ int main(int argc, char *argv[]){ Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); + MPI_Send(&isConverged, 1, MPI_C_BOOL, 
Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); } IpplTimings::stopTimer(timeCommunication); - //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; - //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); - //double absFieldError = std::sqrt(Pcoarse->rhoPIC_m.sum()); - //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPIC_m, Pcoarse->EfieldPIC_m); - //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); - //double EfieldError = absFieldError / EfieldNorm; - - IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //double Rerror = computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - //double Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - - double EfieldError = 0; - if(it > 0) { - EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - } - IpplTimings::stopTimer(computeErrors); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); - IpplTimings::stopTimer(deepCopy); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror - << " Efield error: " << EfieldError + //<< " Efield error: " << EfieldError //<< " Rhofield error: " << EfieldError << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writeError(Rerror, Perror, it+1); + //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, it+1); IpplTimings::stopTimer(dumpData); - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - if((Rerror <= tol) && (Perror <= tol)) { - //if(Perror <= tol) { + if(isConverged && isPreviousDomainConverged) { + //maxIterRank = it+1; break; } + } - msg << "LandauDamping Parareal: End." << endl; + //std::cout << "Rank " << Ippl::Comm->rank() << " is out of the loop in iteration: " << maxIterRank << std::endl; + Ippl::Comm->barrier(); + msg << TestName << " Parareal: End." << endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index bf9ab6586..fa8ca441f 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -214,15 +214,16 @@ double computeLinfError(ParticleAttrib& Q, ParticleAttrib& Q //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; - double globaltemp = 0.0; - MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + //double globaltemp = 0.0; + //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - double absError = globaltemp; + //double absError = globaltemp; - globaltemp = 0.0; - MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + //globaltemp = 0.0; + //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - double relError = absError / globaltemp; + //double relError = absError / globaltemp; + double relError = lError; return relError; @@ -317,7 +318,7 @@ const char* TestName = "PenningTrap"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); - Inform msg(TestName); + Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); ippl::Vector nmPIF = { @@ -560,11 +561,13 @@ int main(int argc, char *argv[]){ msg << "Starting parareal iterations ..." << endl; bool isConverged = false; - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); - //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, 0); - //Ippl::Comm->barrier(); + bool isPreviousDomainConverged; + if(Ippl::Comm->rank() == 0) { + isPreviousDomainConverged = true; + } + else { + isPreviousDomainConverged = false; + } for (unsigned int it=0; itLeapFrogPIF(Pcoarse->R0, Pcoarse->P0, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, it+1); - //Ippl::Comm->barrier(); - //double Rerror = computeL2Error(Pcoarse->R0, Pbegin->R, it+1, Ippl::Comm->rank()); - //double Perror = computeL2Error(Pcoarse->P0, Pbegin->P, it+1, Ippl::Comm->rank()); - //msg << "Finished iteration: " << it+1 - //<< " Rerror: " << Rerror - //<< " Perror: " << Perror - //<< endl; - // break; - //} - //Difference = Fine - Coarse Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; @@ -599,20 +587,25 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(timeCommunication); tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(Ippl::Comm->rank() > 0) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); - } - else { - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, + Ippl::getComm(), MPI_STATUS_IGNORE); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); } IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); @@ -624,6 +617,18 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + 
IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + IpplTimings::stopTimer(computeErrors); + + if((Rerror <= tol) && (Perror <= tol)) { + isConverged = true; + } + + IpplTimings::startTimer(timeCommunication); if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { size_type bufSize = Pend->packedSize(nloc); @@ -632,53 +637,28 @@ int main(int argc, char *argv[]){ Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); + MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); } IpplTimings::stopTimer(timeCommunication); - //Pcoarse->EfieldPICprevIter_m = Pcoarse->EfieldPICprevIter_m - Pcoarse->EfieldPIC_m; - //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPICprevIter_m, Pcoarse->EfieldPICprevIter_m); - //double absFieldError = std::sqrt(Pcoarse->rhoPIC_m.sum()); - //Pcoarse->rhoPIC_m = dot(Pcoarse->EfieldPIC_m, Pcoarse->EfieldPIC_m); - //double EfieldNorm = std::sqrt(Pcoarse->rhoPIC_m.sum()); - //double EfieldError = absFieldError / EfieldNorm; - - IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //double Rerror = computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - //double Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - - double EfieldError = 0; - if(it > 0) { - EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - } - IpplTimings::stopTimer(computeErrors); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->rhoPIFprevIter_m.getView(), Pcoarse->rhoPIF_m.getView()); - IpplTimings::stopTimer(deepCopy); msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror - << " Efield error: " << EfieldError - //<< " Rhofield error: " << EfieldError << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writeError(Rerror, Perror, it+1); + //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, it+1); IpplTimings::stopTimer(dumpData); - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - if((Rerror <= tol) && (Perror <= tol)) { - //if(Perror <= tol) { + if(isConverged && isPreviousDomainConverged) { break; } } - msg << "Penning trap Parareal: End." << endl; + Ippl::Comm->barrier(); + msg << TestName << " Parareal: End." << endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); diff --git a/src/Ippl.cpp b/src/Ippl.cpp index 420af917a..26b5b30a6 100644 --- a/src/Ippl.cpp +++ b/src/Ippl.cpp @@ -98,7 +98,7 @@ Ippl::Ippl(int& argc, char**& argv, MPI_Comm mpicomm) if (infoLevel > 0 && Comm->myNode() == 0) { for (auto& l : notparsed) { - std::cout << "Warning: Option '" << l << "' is not parsed by Ippl." << std::endl; + std::cout << "Option '" << l << "' is not parsed by Ippl. Make sure your application parses it." 
<< std::endl; } } From 17bf673dc32c5629d9ed8b6c696ca3631303a8cb Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 6 Jan 2023 16:44:10 +0100 Subject: [PATCH 037/117] If conditions removed and performance improved a little bit --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 107 ++++++++++++++---- alpine/PinT/ChargedParticlesPinT.hpp | 1 + src/Particle/ParticleAttrib.hpp | 67 +++++++++-- src/Solver/FFTPeriodicPoissonSolver.hpp | 20 ++-- 4 files changed, 150 insertions(+), 45 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 602964ab6..3429969a1 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -130,33 +130,96 @@ class ChargedParticlesPIF : public ippl::ParticleBase { } - void dumpLandau(size_type totalP) { + void dumpLandau(size_type /*totalP*/) { - auto Eview = E.getView(); + + double fieldEnergy = 0.0; + double ExAmp = 0.0; - double fieldEnergy, ExAmp; - double temp = 0.0; + auto rhoview = rho_m.getView(); + const int nghost = rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rho_m.getLayout(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; - Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, double& valL){ - double myVal = Eview(i)[0] * Eview(i)[0]; - valL += myVal; - }, Kokkos::Sum(temp)); + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } - double globaltemp = 0.0; - MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce("Ex energy and Max", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& tlSum, + double& tlMax) + { + + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + if(Dr != 0.0) { + Ek = -(imag * kVec[0] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); + } + double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + + tlSum += myVal; + + double myValMax = std::sqrt(myVal); + + if(myValMax > tlMax) tlMax = myValMax; + + }, Kokkos::Sum(fieldEnergy), Kokkos::Max(ExAmp)); + + + Kokkos::fence(); double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - fieldEnergy = globaltemp * volume / totalP ; - - double tempMax = 0.0; - Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), - KOKKOS_LAMBDA(const size_t i, double& valL) - { - double myVal = std::fabs(Eview(i)[0]); - if(myVal > valL) valL = myVal; - }, Kokkos::Max(tempMax)); - ExAmp = 0.0; - MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); + fieldEnergy *= volume; + + //auto Eview = E.getView(); + + //double fieldEnergy, ExAmp; + //double temp = 0.0; + + //Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), + // KOKKOS_LAMBDA(const int i, double& valL){ + // double myVal = Eview(i)[0] * Eview(i)[0]; + // valL += myVal; + // }, Kokkos::Sum(temp)); + + //double globaltemp = 0.0; + //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + //double 
volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + //fieldEnergy = globaltemp * volume / totalP ; + + //double tempMax = 0.0; + //Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), + // KOKKOS_LAMBDA(const size_t i, double& valL) + // { + // double myVal = std::fabs(Eview(i)[0]); + // if(myVal > valL) valL = myVal; + // }, Kokkos::Max(tempMax)); + //ExAmp = 0.0; + //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); if (Ippl::Comm->rank() == 0) { diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index c1bd3c611..7fad12b69 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -791,6 +791,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Ptemp = Ptemp - 0.5 * dt * E; //drift + Rtemp = Rtemp + dt * Ptemp; //Apply particle BC diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 3c0d9e183..726db2b09 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -254,23 +254,41 @@ namespace ippl { const int i = flatIndex2D % N[0]; const int j = (int)(flatIndex2D / N[0]); + //const int i = (int)(flatIndex / (N[0] * N[1])); + //const int flatIndex2D = flatIndex - (i * N[0] * N[1]); + //const int k = flatIndex2D % N[0]; + //const int j = (int)(flatIndex2D / N[0]); + FT reducedValue = 0.0; + Vector iVec = {i, j, k}; + vector_type kVec; + double Sk = 1.0; //Fourier transform of the shape function + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //double kh = kVec[d] * dx[d]; + ////Fourier transform of CIC + //if(kh != 0.0) { + // Sk *= std::pow(Kokkos::Experimental::sin(kh)/kh, 2); + //} + } Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), [=](const size_t idx, FT& innerReduce) { - Vector iVec = {i, j, k}; - vector_type kVec; - double arg=0.0; + //Vector iVec = {i, j, k}; + //vector_type kVec; + double arg = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); //kVec[d] = 2 * pi / Len[d] * iVec[d]; //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); arg += kVec[d]*pp(idx)[d]; } const value_type& val = dview_m(idx); - innerReduce += (Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; + innerReduce += Sk*(Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; + //innerReduce += Sk*(arg - imag*arg)*val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { @@ -395,17 +413,27 @@ namespace ippl { const size_t idx = teamMember.league_rank(); value_type reducedValue = 0.0; + //double ExReducedValue = 0.0, EyReducedValue = 0.0; + //double EzReducedValue = 0.0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), [=](const size_t flatIndex, value_type& innerReduce) + //[=](const size_t flatIndex, double& ExReduce, double& EyReduce, double& EzReduce) { const int k = (int)(flatIndex / (N[0] * N[1])); const int flatIndex2D = flatIndex - (k * N[0] * N[1]); const int i = flatIndex2D % N[0]; const int j = (int)(flatIndex2D / N[0]); + + //const int i = (int)(flatIndex / (N[0] * N[1])); + //const int flatIndex2D = flatIndex - (i * N[0] * N[1]); + //const int k = flatIndex2D % N[0]; + //const int j = (int)(flatIndex2D / N[0]); + Vector iVec = {i, j, k}; vector_type 
kVec; - double Dr = 0.0, arg=0.0; + double Dr = 0.0, arg = 0.0; + double Sk = 1.0; //Fourier transform of shape function for(size_t d = 0; d < Dim; ++d) { bool shift = (iVec[d] > (N[d]/2)); kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); @@ -413,31 +441,46 @@ namespace ippl { //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); Dr += kVec[d] * kVec[d]; arg += kVec[d]*pp(idx)[d]; + //double kh = kVec[d] * dx[d]; + ////Fourier transform of CIC + //if(kh != 0.0) { + // Sk *= std::pow(Kokkos::Experimental::sin(kh)/kh, 2); + //} } FT Ek = 0.0; - value_type Ex; + value_type Ex = 0.0; for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) { - Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) / Dr); - } + + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) * factor); //Inverse Fourier transform when the lhs is real. Use when //we choose k \in [0 K) instead of from [-K/2+1 K/2] //Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) // - Ek.imag() * Kokkos::Experimental::sin(arg)); - Ek *= (Kokkos::Experimental::cos(arg) + Ek *= Sk * (Kokkos::Experimental::cos(arg) + imag * Kokkos::Experimental::sin(arg)); + //Ek *= Sk * (arg + imag * arg); Ex[d] = Ek.real(); } innerReduce += Ex; + //ExReduce += Ex[0]; + //EyReduce += Ex[1]; + //EzReduce += Ex[2]; }, Kokkos::Sum(reducedValue)); + //}, Kokkos::Sum(ExReducedValue), Kokkos::Sum(EyReducedValue), + //Kokkos::Sum(EzReducedValue)); teamMember.team_barrier(); if(teamMember.team_rank() == 0) { dview_m(idx) = reducedValue; + //dview_m(idx)[0] = ExReducedValue; + //dview_m(idx)[1] = EyReducedValue; + //dview_m(idx)[2] = EzReducedValue; } } diff --git a/src/Solver/FFTPeriodicPoissonSolver.hpp b/src/Solver/FFTPeriodicPoissonSolver.hpp index e6f690942..73d7d2a2c 100644 --- a/src/Solver/FFTPeriodicPoissonSolver.hpp +++ b/src/Solver/FFTPeriodicPoissonSolver.hpp @@ -113,12 +113,11 @@ namespace ippl { double Dr = kVec[0] * kVec[0] + kVec[1] * kVec[1] + kVec[2] * kVec[2]; - - //It would be great if we can remove this conditional - if(Dr != 0.0) - view(i, j, k) *= 1 / Dr; - else - view(i, j, k) = 0.0; + + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + + view(i, j, k) *= factor; }); fft_mp->transform(-1, *this->rhs_mp, fieldComplex_m); @@ -168,11 +167,10 @@ namespace ippl { tempview(i, j, k) = view(i, j, k); - //It would be great if we can remove this conditional - if(Dr != 0.0) - tempview(i, j, k) *= -(imag * kVec[gd] / Dr); - else - tempview(i, j, k) = 0.0; + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + + tempview(i, j, k) *= -(imag * kVec[gd] * factor); }); fft_mp->transform(-1, *this->rhs_mp, tempFieldComplex_m); From 8032aed2b2f21c019c21b47c4e40a6f81861126f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 9 Jan 2023 08:23:15 +0100 Subject: [PATCH 038/117] Layout lefts and rights specified --- src/Particle/ParticleAttrib.hpp | 27 ++++++++++++++++----------- src/Types/ViewTypes.h | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 726db2b09..f25022922 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -248,16 +248,19 @@ namespace ippl { team_policy(flatN, Kokkos::AUTO), KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { const size_t flatIndex = teamMember.league_rank(); - + +#ifdef 
KOKKOS_ENABLE_CUDA const int k = (int)(flatIndex / (N[0] * N[1])); const int flatIndex2D = flatIndex - (k * N[0] * N[1]); const int i = flatIndex2D % N[0]; const int j = (int)(flatIndex2D / N[0]); +#else - //const int i = (int)(flatIndex / (N[0] * N[1])); - //const int flatIndex2D = flatIndex - (i * N[0] * N[1]); - //const int k = flatIndex2D % N[0]; - //const int j = (int)(flatIndex2D / N[0]); + const int i = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (i * N[0] * N[1]); + const int k = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); +#endif FT reducedValue = 0.0; Vector iVec = {i, j, k}; @@ -419,16 +422,18 @@ namespace ippl { [=](const size_t flatIndex, value_type& innerReduce) //[=](const size_t flatIndex, double& ExReduce, double& EyReduce, double& EzReduce) { + +#ifdef KOKKOS_ENABLE_CUDA const int k = (int)(flatIndex / (N[0] * N[1])); const int flatIndex2D = flatIndex - (k * N[0] * N[1]); const int i = flatIndex2D % N[0]; const int j = (int)(flatIndex2D / N[0]); - - - //const int i = (int)(flatIndex / (N[0] * N[1])); - //const int flatIndex2D = flatIndex - (i * N[0] * N[1]); - //const int k = flatIndex2D % N[0]; - //const int j = (int)(flatIndex2D / N[0]); +#else + const int i = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (i * N[0] * N[1]); + const int k = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); +#endif Vector iVec = {i, j, k}; vector_type kVec; diff --git a/src/Types/ViewTypes.h b/src/Types/ViewTypes.h index 179cc4056..7cfc4238d 100644 --- a/src/Types/ViewTypes.h +++ b/src/Types/ViewTypes.h @@ -54,7 +54,7 @@ namespace ippl { }; /*! - * Specialized view type for thee dimensions. + * Specialized view type for three dimensions. */ template struct ViewType { From ab9b6b99de0ac44fa32285ed5849a7e32efcfd9d Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 9 Jan 2023 16:10:56 +0100 Subject: [PATCH 039/117] Some performance tests made and good configuration for delta shape functions found --- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 2 +- src/Particle/ParticleAttrib.hpp | 27 +++----------------- src/Utility/IpplTimings.cpp | 6 ++--- 3 files changed, 8 insertions(+), 27 deletions(-) diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 93e9e7796..f90cb56fb 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -220,7 +220,7 @@ int main(int argc, char *argv[]){ //const RegionLayout_t& RLayout = PL.getRegionLayout(); //const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); Vector_t minU, maxU; - int myRank = Ippl::Comm->rank(); + //int myRank = Ippl::Comm->rank(); for (unsigned d = 0; d ::member_type member_type; - using view_type_temp = typename detail::ViewType::view_type; + //using view_type_temp = typename detail::ViewType::view_type; - view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); + //view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); double pi = std::acos(-1.0); Kokkos::complex imag = {0.0, 1.0}; @@ -278,20 +278,13 @@ namespace ippl { Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), [=](const size_t idx, FT& innerReduce) { - //Vector iVec = {i, j, k}; - //vector_type kVec; double arg = 0.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - //kVec[d] = 2 * 
pi / Len[d] * iVec[d]; - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); arg += kVec[d]*pp(idx)[d]; } const value_type& val = dview_m(idx); innerReduce += Sk*(Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; - //innerReduce += Sk*(arg - imag*arg)*val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { @@ -304,7 +297,6 @@ namespace ippl { IpplTimings::stopTimer(scatterPIFTimer); - //Kokkos::deep_copy(fview, viewLocal); //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); //IpplTimings::startTimer(scatterAllReduceTimer); //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); @@ -416,11 +408,8 @@ namespace ippl { const size_t idx = teamMember.league_rank(); value_type reducedValue = 0.0; - //double ExReducedValue = 0.0, EyReducedValue = 0.0; - //double EzReducedValue = 0.0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), [=](const size_t flatIndex, value_type& innerReduce) - //[=](const size_t flatIndex, double& ExReduce, double& EyReduce, double& EzReduce) { #ifdef KOKKOS_ENABLE_CUDA @@ -455,11 +444,12 @@ namespace ippl { FT Ek = 0.0; value_type Ex = 0.0; + auto rho = fview(i+nghost,j+nghost,k+nghost); for(size_t d = 0; d < Dim; ++d) { bool isNotZero = (Dr != 0.0); double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); - Ek = -(imag * kVec[d] * fview(i+nghost,j+nghost,k+nghost) * factor); + Ek = -(imag * kVec[d] * rho * factor); //Inverse Fourier transform when the lhs is real. Use when //we choose k \in [0 K) instead of from [-K/2+1 K/2] @@ -467,25 +457,16 @@ namespace ippl { // - Ek.imag() * Kokkos::Experimental::sin(arg)); Ek *= Sk * (Kokkos::Experimental::cos(arg) + imag * Kokkos::Experimental::sin(arg)); - //Ek *= Sk * (arg + imag * arg); Ex[d] = Ek.real(); } innerReduce += Ex; - //ExReduce += Ex[0]; - //EyReduce += Ex[1]; - //EzReduce += Ex[2]; }, Kokkos::Sum(reducedValue)); - //}, Kokkos::Sum(ExReducedValue), Kokkos::Sum(EyReducedValue), - //Kokkos::Sum(EzReducedValue)); teamMember.team_barrier(); if(teamMember.team_rank() == 0) { dview_m(idx) = reducedValue; - //dview_m(idx)[0] = ExReducedValue; - //dview_m(idx)[1] = EyReducedValue; - //dview_m(idx)[2] = EzReducedValue; } } diff --git a/src/Utility/IpplTimings.cpp b/src/Utility/IpplTimings.cpp index 7cc1079c4..c52c53b51 100644 --- a/src/Utility/IpplTimings.cpp +++ b/src/Utility/IpplTimings.cpp @@ -115,7 +115,7 @@ void Timing::print() { msg << level1 << "---------------------------------------------"; msg << "\n"; - msg << " Timing results for " << Ippl::Comm->getNodes() << " nodes:" << "\n"; + msg << " Timing results for " << Ippl::Comm->getNodes() << " ranks:" << "\n"; msg << "---------------------------------------------"; msg << "\n"; @@ -178,7 +178,7 @@ void Timing::print(const std::string &fn, const std::mapR.getView(); + auto Pview = P->P.getView(); + auto Eview = P->E.getView(); + double V0 = 30*rmax[2]; + Kokkos::parallel_for("Kick1", P->getLocalNum(), + KOKKOS_LAMBDA(const size_t j){ + double Eext_x = -(Rview(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); + + Eview(j)[0] += Eext_x; + Eview(j)[1] += Eext_y; + Eview(j)[2] += Eext_z; + + Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eview(j)[2]; + }); + IpplTimings::stopTimer(PTimer); + + 
//drift + IpplTimings::startTimer(RTimer); + P->R = P->R + dt * P->P; + IpplTimings::stopTimer(RTimer); + + //Apply particle BC + IpplTimings::startTimer(BCTimer); + PL.applyBC(P->R, PL.getRegionLayout().getDomain()); + IpplTimings::stopTimer(BCTimer); + + //scatter the charge onto the underlying grid + P->scatter(); + + // Solve for and gather E field + P->gather(); + + //kick + IpplTimings::startTimer(PTimer); + auto R2view = P->R.getView(); + auto P2view = P->P.getView(); + auto E2view = P->E.getView(); + Kokkos::parallel_for("Kick2", P->getLocalNum(), + KOKKOS_LAMBDA(const size_t j){ + double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); + double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); + + E2view(j)[0] += Eext_x; + E2view(j)[1] += Eext_y; + E2view(j)[2] += Eext_z; + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] + + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] + - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); + P2view(j)[2] += alpha * E2view(j)[2]; + }); + IpplTimings::stopTimer(PTimer); + + P->time_m += dt; + IpplTimings::startTimer(dumpDataTimer); + P->dumpEnergy(); + IpplTimings::stopTimer(dumpDataTimer); + msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; + } + + msg << TestName << " End." << endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + + return 0; +} From 06ca5e6a1e372626cdaf55e8e9505c35ea4774f3 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 16 Jan 2023 10:02:23 +0100 Subject: [PATCH 045/117] bug fixed for BumponTail and PenningTrap --- .../ElectrostaticPIF/BumponTailInstabilityPIF.cpp | 3 ++- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 ++ alpine/PinT/ChargedParticlesPinT.hpp | 15 ++++++++++----- alpine/PinT/LandauDampingPinT.cpp | 2 +- alpine/PinT/PenningTrapPinT.cpp | 11 ++++------- src/Particle/ParticleAttrib.hpp | 12 +++++++----- 6 files changed, 26 insertions(+), 19 deletions(-) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index 53537509d..2ac5b18f4 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -194,6 +194,8 @@ int main(int argc, char *argv[]){ } // create mesh and layout objects for this problem domain + Vector_t kw; + double sigma, muBulk, muBeam, epsilon, delta; if(std::strcmp(TestName,"TwoStreamInstabilityPIF") == 0) { // Parameters for two stream instability as in @@ -270,7 +272,6 @@ int main(int argc, char *argv[]){ size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP); size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP); size_type nloc = nlocBulk + nlocBeam; - size_type nloc = (size_type)(factor * totalP); size_type Total_particles = 0; MPI_Allreduce(&nloc, &Total_particles, 1, diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 55eb7127d..235df8b14 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -357,6 +357,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static 
IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); + static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); IpplTimings::startTimer(mainTimer); @@ -455,6 +456,7 @@ int main(int argc, char *argv[]){ Pend = std::make_unique(PL); Pcoarse->nr_m = nrPIC; + Pcoarse->nm_m = nmPIF; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index a0c88178e..52c19f614 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -596,9 +596,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void initializeShapeFunctionPIF() { + Inform m("initializeShape"); using mdrange_type = Kokkos::MDRangePolicy>; auto Skview = Sk_m.getView(); auto N = nm_m; + const int nghost = Sk_m.getNghost(); const Mesh_t& mesh = rhoPIF_m.get_mesh(); const Vector_t& dx = mesh.getMeshSpacing(); const Vector_t& Len = rmax_m - rmin_m; @@ -635,14 +637,17 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //Fourier transform of CIC Sk *= std::pow(arg, order); } - Skview(i, j, k) = Sk; + Skview(i+nghost, j+nghost, k+nghost) = Sk; }); + } else { throw IpplException("initializeShapeFunctionPIF", "Unrecognized shape function type"); } + double Sknorm = norm(Sk_m); + m << "Sknorm in initialize: " << Sknorm << endl; } @@ -836,8 +841,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); - dumpLandau(iter); - //dumpBumponTail(iter); + //dumpLandau(iter); + dumpBumponTail(iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); IpplTimings::stopTimer(dumpData); } @@ -870,8 +875,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; IpplTimings::startTimer(dumpData); - dumpLandau(iter); - //dumpBumponTail(iter); + //dumpLandau(iter); + dumpBumponTail(iter); dumpEnergy(this->getLocalNum(), iter, Ptemp); IpplTimings::stopTimer(dumpData); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 4c456eaa5..b23198bd8 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -502,7 +502,7 @@ int main(int argc, char *argv[]){ #endif - Pcoarse->q = Pcoarse->Q_m/totalP; + Pcoarse->q = Pcoarse->Q_m/nloc; IpplTimings::stopTimer(particleCreation); msg << "particles created and initial conditions assigned " << endl; diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 3a59dd439..9629dd0ab 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -316,7 +316,7 @@ double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { } -const char* TestName = "PenningTrap"; +const char* TestName = "PenningTrapPinT"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); @@ -344,6 +344,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); + static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); IpplTimings::startTimer(mainTimer); @@ -357,11 +358,6 @@ int main(int argc, char *argv[]){ const double tol = std::atof(argv[11]); const unsigned int maxIter = std::atoi(argv[12]); - msg << "dtSlice: " << dtSlice - << 
"dtSlice/dtFine: " << dtSlice / dtFine - << "(int)dtSlice/dtFine: " << (unsigned int)(dtSlice / dtFine) - << endl; - const double tStartMySlice = Ippl::Comm->rank() * dtSlice; //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; @@ -426,6 +422,7 @@ int main(int argc, char *argv[]){ Pend = std::make_unique(PL); Pcoarse->nr_m = nrPIC; + Pcoarse->nm_m = nmPIF; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); @@ -598,7 +595,7 @@ int main(int argc, char *argv[]){ tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() > 0) { + if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 687505bc1..156368dad 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -207,7 +207,6 @@ namespace ippl { const ParticleAttrib< Vector, Properties... >& pp) const { - //Inform msg("scatterPIF"); static IpplTimings::TimerRef scatterPIFTimer = IpplTimings::getTimer("ScatterPIF"); IpplTimings::startTimer(scatterPIFTimer); @@ -225,6 +224,7 @@ namespace ippl { vector_type Len; Vector N; + for (unsigned d=0; d < Dim; ++d) { N[d] = domain[d].length(); Len[d] = dx[d] * N[d]; @@ -270,7 +270,7 @@ namespace ippl { bool shift = (iVec[d] > (N[d]/2)); kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); } - auto Sk = Skview(i, j, k); + auto Sk = Skview(i+nghost, j+nghost, k+nghost); Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), [=](const size_t idx, FT& innerReduce) { @@ -280,7 +280,8 @@ namespace ippl { } const value_type& val = dview_m(idx); - innerReduce += Sk*(Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; + innerReduce += Sk * (Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; + //innerReduce += (Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { @@ -367,7 +368,6 @@ namespace ippl { const ParticleAttrib< Vector, Properties... 
>& pp) const { - //Inform msg("gatherPIF"); static IpplTimings::TimerRef gatherPIFTimer = IpplTimings::getTimer("GatherPIF"); IpplTimings::startTimer(gatherPIFTimer); @@ -389,6 +389,8 @@ namespace ippl { Len[d] = dx[d] * N[d]; } + + typedef Kokkos::TeamPolicy<> team_policy; typedef Kokkos::TeamPolicy<>::member_type member_type; @@ -437,7 +439,7 @@ namespace ippl { FT Ek = 0.0; value_type Ex = 0.0; auto rho = fview(i+nghost,j+nghost,k+nghost); - auto Sk = Skview(i,j,k); + auto Sk = Skview(i+nghost,j+nghost,k+nghost); for(size_t d = 0; d < Dim; ++d) { bool isNotZero = (Dr != 0.0); From a3151c653f4f5c093a69a2f386ae13b412e1b901 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 16 Jan 2023 10:17:11 +0100 Subject: [PATCH 046/117] few tweaks --- alpine/PinT/ChargedParticlesPinT.hpp | 3 --- src/Particle/ParticleAttrib.hpp | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 52c19f614..31976918f 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -596,7 +596,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void initializeShapeFunctionPIF() { - Inform m("initializeShape"); using mdrange_type = Kokkos::MDRangePolicy>; auto Skview = Sk_m.getView(); auto N = nm_m; @@ -646,8 +645,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { throw IpplException("initializeShapeFunctionPIF", "Unrecognized shape function type"); } - double Sknorm = norm(Sk_m); - m << "Sknorm in initialize: " << Sknorm << endl; } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 156368dad..452d0f923 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -280,8 +280,8 @@ namespace ippl { } const value_type& val = dview_m(idx); - innerReduce += Sk * (Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; - //innerReduce += (Kokkos::Experimental::cos(arg) - imag*Kokkos::Experimental::sin(arg))*val; + innerReduce += Sk * (Kokkos::Experimental::cos(arg) + - imag * Kokkos::Experimental::sin(arg)) * val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { From 62ca48ce4836c07c1446290fba7cabdbc7204746 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 20 Jan 2023 23:09:08 +0100 Subject: [PATCH 047/117] Position error calculation changed --- alpine/PinT/BumponTailInstabilityPinT.cpp | 125 ++++++++++++++++++---- alpine/PinT/ChargedParticlesPinT.hpp | 29 +++++ alpine/PinT/PenningTrapPinT.cpp | 3 + 3 files changed, 139 insertions(+), 18 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 235df8b14..af00d033c 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -180,10 +180,10 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; - double globaltemp = 0.0; - MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //double globaltemp = 0.0; + //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - double absError = std::sqrt(globaltemp); + //double absError = std::sqrt(globaltemp); //temp = 0.0; //Kokkos::parallel_reduce("Q norm", Q.size(), @@ -193,10 +193,10 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr // }, Kokkos::Sum(temp)); - globaltemp = 0.0; - MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //globaltemp = 0.0; + //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - double relError = absError / std::sqrt(globaltemp); + double relError = lError;//absError / std::sqrt(globaltemp); return relError; @@ -244,6 +244,74 @@ double computeLinfError(ParticleAttrib& Q, ParticleAttrib& Q } +double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + + myValError = std::sqrt(myValError); + + bool isIncluded = (myValError < 10.0); + + myValError *= isIncluded; + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + myValnorm *= isIncluded; + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + Kokkos::fence(); + lError = localError/localNorm; + + double relError = lError; + + return relError; + +} + +double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + + Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + Kokkos::fence(); + lError = localError/localNorm; + + double relError = lError; + + return relError; + +} double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { @@ -429,15 +497,16 @@ int main(int argc, char *argv[]){ delta = 0.01; } Vector_t rmin(0.0); - Vector_t rmax = 2 * pi / kw ; - double dxPIC = rmax[0] / nrPIC[0]; - double dyPIC = rmax[1] / nrPIC[1]; - double dzPIC = rmax[2] / nrPIC[2]; + Vector_t rmax = (2 * pi / kw); + Vector_t length = rmax - rmin; + double dxPIC = length[0] / nrPIC[0]; + double dyPIC = length[1] / nrPIC[1]; + double dzPIC = length[2] / nrPIC[2]; - double dxPIF = rmax[0] / nmPIF[0]; - double dyPIF = rmax[1] / nmPIF[1]; - double dzPIF = rmax[2] / nmPIF[2]; + double dxPIF = length[0] / nmPIF[0]; + double dyPIF = length[1] / nmPIF[1]; + double dzPIF = length[2] / nmPIF[2]; Vector_t hrPIC = {dxPIC, dyPIC, dzPIC}; Vector_t hrPIF = {dxPIF, dyPIF, dzPIF}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; @@ -450,7 +519,7 @@ int main(int argc, char *argv[]){ PLayout_t PL(FLPIC, meshPIC); //Q = -\int\int f dx dv - double Q = -rmax[0] * rmax[1] * rmax[2]; + double Q = -length[0] * length[1] * length[2]; Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -476,6 +545,8 @@ int main(int argc, char *argv[]){ maxU[d] = CDF(rmax[d], delta, kw[d], d); } + minU = minU; + maxU = maxU; double factorVelBulk = 1.0 - epsilon; double factorVelBeam = 1.0 - factorVelBulk; size_type nlocBulk = (size_type)(factorVelBulk * totalP); @@ -536,6 +607,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); #endif + Pcoarse->dumpParticleData(0, Pcoarse->R, Pcoarse->P, "Parareal"); msg << "Parareal " << TestName @@ -614,7 +686,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - + for (unsigned int it=0; itR = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); + //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); @@ -664,17 +739,28 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; - IpplTimings::startTimer(computeErrors); + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); + + + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); double localRerror, localPerror; - double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + //if(it > 0) { + IpplTimings::startTimer(computeErrors); + double Rerror = computeRLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = 
computePLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //Rerror = computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + //Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); IpplTimings::stopTimer(computeErrors); + //} if((Rerror <= tol) && (Perror <= tol)) { isConverged = true; } + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); IpplTimings::startTimer(timeCommunication); if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { @@ -697,6 +783,9 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, it+1); + //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { + Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); + //} IpplTimings::stopTimer(dumpData); if(isConverged && isPreviousDomainConverged) { diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 31976918f..f9474643d 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -504,6 +504,35 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } + + void dumpParticleData(const unsigned int& iter, ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const char* fname) { + + typename ParticleAttrib::HostMirror R_host = Rtemp.getHostMirror(); + typename ParticleAttrib::HostMirror P_host = Ptemp.getHostMirror(); + Kokkos::deep_copy(R_host, Rtemp.getView()); + Kokkos::deep_copy(P_host, Ptemp.getView()); + std::stringstream pname; + pname << "data/"; + pname << fname; + pname << "_rank_"; + pname << Ippl::Comm->rank(); + pname << "_iter_"; + pname << iter; + pname << ".csv"; + Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout.precision(10); + pcsvout.setf(std::ios::scientific, std::ios::floatfield); + pcsvout << "R_x, R_y, R_z, V_x, V_y, V_z" << endl; + for (size_type i = 0; i< this->getLocalNum(); i++) { + pcsvout << R_host(i)[0] << " " + << R_host(i)[1] << " " + << R_host(i)[2] << " " + << P_host(i)[0] << " " + << P_host(i)[1] << " " + << P_host(i)[2] << endl; + } + } + void writelocalError(double Rerror, double Perror, unsigned int iter) { std::stringstream fname; diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 9629dd0ab..3e6d0a879 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -656,6 +656,9 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, it+1); + if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { + Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P); + } IpplTimings::stopTimer(dumpData); if(isConverged && isPreviousDomainConverged) { From 3f1e8703b28070ef1d8732302cf72e1aa6b371e6 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 20 Jan 2023 23:12:58 +0100 Subject: [PATCH 048/117] unnecesary thing removed --- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index af00d033c..5b086c3f8 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -545,8 +545,6 @@ int main(int argc, char *argv[]){ maxU[d] = CDF(rmax[d], delta, 
kw[d], d); } - minU = minU; - maxU = maxU; double factorVelBulk = 1.0 - epsilon; double factorVelBeam = 1.0 - factorVelBulk; size_type nlocBulk = (size_type)(factorVelBulk * totalP); From a838a0caefecd8536c5c153c5bd4432fd5465c29 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 21 Jan 2023 09:08:46 +0100 Subject: [PATCH 049/117] modified stopping added to LandauDamping and BumponTail --- alpine/PinT/BumponTailInstabilityPinT.cpp | 63 +++++---------------- alpine/PinT/ChargedParticlesPinT.hpp | 8 +-- alpine/PinT/LandauDampingPinT.cpp | 67 +++++++++++++++++------ alpine/PinT/PenningTrapPinT.cpp | 1 + 4 files changed, 69 insertions(+), 70 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 5b086c3f8..63d978b92 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -202,58 +202,20 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr } -double computeLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - double myValError = dot(diff, diff).apply(); - myValError = std::sqrt(myValError); - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - if(myValnorm > valLnorm) valLnorm = myValnorm; - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - lError = localError/localNorm; - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; - - - //double globaltemp = 0.0; - //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - - //double absError = globaltemp; - - //globaltemp = 0.0; - //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); - - //double relError = absError / globaltemp; - double relError = lError; - - return relError; - -} double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, + unsigned int& notIncluded) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); double localError = 0.0; double localNorm = 0.0; + notIncluded = 0; Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm, + unsigned int& excluded){ Vector_t diff = Qview(i) - QprevIterView(i); double myValError = dot(diff, diff).apply(); @@ -261,6 +223,7 @@ double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& bool isIncluded = (myValError < 10.0); + myValError *= isIncluded; if(myValError > valLError) valLError = myValError; @@ -271,7 +234,10 @@ double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& myValnorm *= isIncluded; if(myValnorm > valLnorm) valLnorm = myValnorm; - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + excluded += (!isIncluded); + }, Kokkos::Max(localError), Kokkos::Max(localNorm), + Kokkos::Sum(notIncluded)); Kokkos::fence(); lError = localError/localNorm; @@ -742,13 +708,11 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); double localRerror, localPerror; + unsigned int excludedNp; - //if(it > 0) { IpplTimings::startTimer(computeErrors); - double Rerror = computeRLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Rerror = computeRLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, excludedNp); double Perror = computePLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //Rerror = computeLinfError(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - //Perror = computeLinfError(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); IpplTimings::stopTimer(computeErrors); //} @@ -757,8 +721,6 @@ int main(int argc, char *argv[]){ isConverged = true; } - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pend->R.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pend->P.getView()); IpplTimings::startTimer(timeCommunication); if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { @@ -776,6 +738,7 @@ int main(int argc, char *argv[]){ msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror + << " # Excluded: " << excludedNp << endl; IpplTimings::startTimer(dumpData); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index f9474643d..c827669a5 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -683,7 +683,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); - PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp); @@ -744,7 +744,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); - PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp); @@ -853,7 +853,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); - PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; scatterPIF(q, rhoPIF_m, Sk_m, Rtemp); @@ -917,7 +917,7 @@ class 
ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); - PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; scatterPIF(q, rhoPIF_m, Sk_m, Rtemp); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index b23198bd8..a22e325e8 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -187,42 +187,76 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr } -double computeLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, + unsigned int& notIncluded) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); double localError = 0.0; double localNorm = 0.0; + notIncluded = 0; Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm, + unsigned int& excluded){ Vector_t diff = Qview(i) - QprevIterView(i); double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + bool isIncluded = (myValError < 10.0); + + + myValError *= isIncluded; if(myValError > valLError) valLError = myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); myValnorm = std::sqrt(myValnorm); + + myValnorm *= isIncluded; if(myValnorm > valLnorm) valLnorm = myValnorm; - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); + + excluded += (!isIncluded); + }, Kokkos::Max(localError), Kokkos::Max(localNorm), + Kokkos::Sum(notIncluded)); Kokkos::fence(); lError = localError/localNorm; - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; - + + double relError = lError; + + return relError; - //double globaltemp = 0.0; - //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); +} - //double absError = globaltemp; +double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; - //globaltemp = 0.0; - //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - //double relError = absError / globaltemp; + Kokkos::fence(); + lError = localError/localNorm; + double relError = lError; return relError; @@ -617,10 +651,12 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); double localRerror, localPerror; - double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + unsigned int excludedNp; + double Rerror = computeRLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, excludedNp); + double Perror = computePLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); //double EfieldError = 0; //if(it > 0) { @@ -649,8 +685,7 @@ int main(int argc, char *argv[]){ msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror - //<< " Efield error: " << EfieldError - //<< " Rhofield error: " << EfieldError + << " # Excluded: " << excludedNp << endl; IpplTimings::startTimer(dumpData); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 3e6d0a879..c7b2f13ba 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -623,6 +623,7 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); double localRerror, localPerror; double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); From b8cd2d9b4c567e2c3f0e094777afa929956d5005 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 21 Jan 2023 12:22:57 +0100 Subject: [PATCH 050/117] bug in PenningTrapPinT fixed --- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 +- alpine/PinT/PenningTrapPinT.cpp | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 63d978b92..0ee2ee66e 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -745,7 +745,7 @@ int main(int argc, char *argv[]){ //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, it+1); //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { - Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); + //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); //} IpplTimings::stopTimer(dumpData); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index c7b2f13ba..7bf180a3d 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -657,9 +657,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(localRerror, localPerror, 
it+1); - if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { - Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P); - } + //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); if(isConverged && isPreviousDomainConverged) { From 77f7b679007c8766a866875e18b147f3561a4fcb Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 21 Jan 2023 19:40:54 +0100 Subject: [PATCH 051/117] Linf errors changed to L2 --- alpine/PinT/BumponTailInstabilityPinT.cpp | 85 ++++++++++++-------- alpine/PinT/LandauDampingPinT.cpp | 98 ++++++++++++++--------- alpine/PinT/PenningTrapPinT.cpp | 24 ++---- 3 files changed, 121 insertions(+), 86 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 0ee2ee66e..389f619d9 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -158,8 +158,9 @@ double CDF(const double& x, const double& delta, const double& k, return cdf; } -double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -169,6 +170,15 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::parallel_reduce("Abs. error and norm", Q.size(), KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + + for (unsigned d = 0; d < 3; ++d) { + bool isLeft = (diff[d] <= -10.0); + bool isRight = (diff[d] >= 10.0); + bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0)); + diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) + +(isRight * (diff[d] - length[d])); + } + double myValError = dot(diff, diff).apply(); valLError += myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); @@ -177,24 +187,32 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::fence(); lError = std::sqrt(localError)/std::sqrt(localNorm); - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; - - //double globaltemp = 0.0; - //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double relError = lError;//absError / std::sqrt(globaltemp); + + return relError; - //double absError = std::sqrt(globaltemp); +} - //temp = 0.0; - //Kokkos::parallel_reduce("Q norm", Q.size(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = dot(Qview(i), Qview(i)).apply(); - // valL += myVal; - // }, Kokkos::Sum(temp)); +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + Kokkos::parallel_reduce("Abs. 
error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + valLError += myValError; + double myValnorm = dot(Qview(i), Qview(i)).apply(); + valLnorm += myValnorm; + }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); - //globaltemp = 0.0; - //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + Kokkos::fence(); + lError = std::sqrt(localError)/std::sqrt(localNorm); double relError = lError;//absError / std::sqrt(globaltemp); @@ -202,42 +220,46 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr } - double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - unsigned int& notIncluded) { + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); double localError = 0.0; double localNorm = 0.0; - notIncluded = 0; Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm, - unsigned int& excluded){ + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + + for (unsigned d = 0; d < 3; ++d) { + bool isLeft = (diff[d] <= -10.0); + bool isRight = (diff[d] >= 10.0); + bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0)); + diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) + +(isRight * (diff[d] - length[d])); + } + double myValError = dot(diff, diff).apply(); myValError = std::sqrt(myValError); - bool isIncluded = (myValError < 10.0); + //bool isIncluded = (myValError < 10.0); - - myValError *= isIncluded; + //myValError *= isIncluded; if(myValError > valLError) valLError = myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); myValnorm = std::sqrt(myValnorm); - myValnorm *= isIncluded; + //myValnorm *= isIncluded; if(myValnorm > valLnorm) valLnorm = myValnorm; - excluded += (!isIncluded); - }, Kokkos::Max(localError), Kokkos::Max(localNorm), - Kokkos::Sum(notIncluded)); + //excluded += (!isIncluded); + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); Kokkos::fence(); lError = localError/localNorm; @@ -248,6 +270,7 @@ double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& } + double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { @@ -571,7 +594,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); #endif - Pcoarse->dumpParticleData(0, Pcoarse->R, Pcoarse->P, "Parareal"); + //Pcoarse->dumpParticleData(0, Pcoarse->R, Pcoarse->P, "Parareal"); msg << "Parareal " << TestName @@ -708,11 +731,10 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); double localRerror, localPerror; - unsigned int excludedNp; IpplTimings::startTimer(computeErrors); - double Rerror = computeRLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, excludedNp); - double Perror = computePLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); IpplTimings::stopTimer(computeErrors); //} @@ -738,7 +760,6 @@ int main(int argc, char *argv[]){ msg << "Finished iteration: " << it+1 << " Rerror: " 
<< Rerror << " Perror: " << Perror - << " # Excluded: " << excludedNp << endl; IpplTimings::startTimer(dumpData); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index a22e325e8..ea02ff985 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -143,8 +143,9 @@ double CDF(const double& x, const double& alpha, const double& k) { return cdf; } -double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -154,6 +155,15 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::parallel_reduce("Abs. error and norm", Q.size(), KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + + for (unsigned d = 0; d < 3; ++d) { + bool isLeft = (diff[d] <= -10.0); + bool isRight = (diff[d] >= 10.0); + bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0)); + diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) + +(isRight * (diff[d] - length[d])); + } + double myValError = dot(diff, diff).apply(); valLError += myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); @@ -162,26 +172,34 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::fence(); lError = std::sqrt(localError)/std::sqrt(localNorm); - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; - - double globaltemp = 0.0; - MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double relError = lError;//absError / std::sqrt(globaltemp); + + return relError; - double absError = std::sqrt(globaltemp); +} - //temp = 0.0; - //Kokkos::parallel_reduce("Q norm", Q.size(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = dot(Qview(i), Qview(i)).apply(); - // valL += myVal; - // }, Kokkos::Sum(temp)); +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; + Kokkos::parallel_reduce("Abs. error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + valLError += myValError; + double myValnorm = dot(Qview(i), Qview(i)).apply(); + valLnorm += myValnorm; + }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); - globaltemp = 0.0; - MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + Kokkos::fence(); + lError = std::sqrt(localError)/std::sqrt(localNorm); - double relError = absError / std::sqrt(globaltemp); + double relError = lError;//absError / std::sqrt(globaltemp); return relError; @@ -189,39 +207,44 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - unsigned int& notIncluded) { + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); double localError = 0.0; double localNorm = 0.0; - notIncluded = 0; Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm, - unsigned int& excluded){ + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + + for (unsigned d = 0; d < 3; ++d) { + bool isLeft = (diff[d] <= -10.0); + bool isRight = (diff[d] >= 10.0); + bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0)); + diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) + +(isRight * (diff[d] - length[d])); + } + double myValError = dot(diff, diff).apply(); myValError = std::sqrt(myValError); - bool isIncluded = (myValError < 10.0); - + //bool isIncluded = (myValError < 10.0); - myValError *= isIncluded; + //myValError *= isIncluded; if(myValError > valLError) valLError = myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); myValnorm = std::sqrt(myValnorm); - myValnorm *= isIncluded; + //myValnorm *= isIncluded; if(myValnorm > valLnorm) valLnorm = myValnorm; - excluded += (!isIncluded); - }, Kokkos::Max(localError), Kokkos::Max(localNorm), - Kokkos::Sum(notIncluded)); + //excluded += (!isIncluded); + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); Kokkos::fence(); lError = localError/localNorm; @@ -435,14 +458,15 @@ int main(int argc, char *argv[]){ Vector_t alpha = {0.05, 0.05, 0.05}; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; - double dxPIC = rmax[0] / nrPIC[0]; - double dyPIC = rmax[1] / nrPIC[1]; - double dzPIC = rmax[2] / nrPIC[2]; + Vector_t length = rmax - rmin; + double dxPIC = length[0] / nrPIC[0]; + double dyPIC = length[1] / nrPIC[1]; + double dzPIC = length[2] / nrPIC[2]; - double dxPIF = rmax[0] / nmPIF[0]; - double dyPIF = rmax[1] / nmPIF[1]; - double dzPIF = rmax[2] / nmPIF[2]; + double dxPIF = length[0] / nmPIF[0]; + double dyPIF = length[1] / nmPIF[1]; + double dzPIF = length[2] / nmPIF[2]; Vector_t hrPIC = {dxPIC, dyPIC, dzPIC}; Vector_t hrPIF = {dxPIF, dyPIF, dzPIF}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; @@ -455,7 +479,7 @@ int main(int argc, char *argv[]){ PLayout_t PL(FLPIC, meshPIC); //Q = -\int\int f dx dv - double Q = -rmax[0] * rmax[1] * rmax[2]; + double Q = -length[0] * length[1] * length[2]; Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -654,9 +678,8 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); double localRerror, localPerror; - unsigned int excludedNp; - double Rerror = computeRLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, excludedNp); - double Perror = computePLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); //double EfieldError = 0; //if(it > 0) { @@ -685,7 +708,6 @@ int main(int argc, char *argv[]){ msg << "Finished iteration: " << it+1 << " Rerror: " << Rerror << " Perror: " << Perror - << " # Excluded: " << excludedNp << endl; IpplTimings::startTimer(dumpData); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 7bf180a3d..e4b25794e 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -168,23 +168,15 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr //std::cout << "Rank: " << myrank << " Iter: " << iter << 
" Local. Error: " << lError << std::endl; - double globaltemp = 0.0; - MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - - double absError = std::sqrt(globaltemp); - - //temp = 0.0; - //Kokkos::parallel_reduce("Q norm", Q.size(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = dot(Qview(i), Qview(i)).apply(); - // valL += myVal; - // }, Kokkos::Sum(temp)); + //double globaltemp = 0.0; + //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //double absError = std::sqrt(globaltemp); - globaltemp = 0.0; - MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //globaltemp = 0.0; + //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - double relError = absError / std::sqrt(globaltemp); + double relError = lError;//absError / std::sqrt(globaltemp); return relError; @@ -626,8 +618,8 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); double localRerror, localPerror; - double Rerror = computeLinfError(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeLinfError(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); + double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); IpplTimings::stopTimer(computeErrors); From 9249627574799edae19bb77ac15104ccb447247e Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 31 Jan 2023 11:19:53 +0100 Subject: [PATCH 052/117] Grid, Pc change test done for all examples --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 2 +- alpine/ElectrostaticPIC/PenningTrap.cpp | 4 +- alpine/PinT/LandauDampingPinT.cpp | 15 ++- alpine/PinT/PenningTrapPinT.cpp | 134 ++++++++++++++----- 4 files changed, 115 insertions(+), 40 deletions(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 61730648d..67b8f738f 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -341,7 +341,7 @@ class ChargedParticles : public ippl::ParticleBase { rhoNorm_m = norm(rho_m); IpplTimings::stopTimer(sumTimer); - //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); //rho = rho_e - rho_i rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); diff --git a/alpine/ElectrostaticPIC/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp index 9ea440176..669634089 100644 --- a/alpine/ElectrostaticPIC/PenningTrap.cpp +++ b/alpine/ElectrostaticPIC/PenningTrap.cpp @@ -205,8 +205,8 @@ int main(int argc, char *argv[]){ } // create mesh and layout objects for this problem domain - Vector_t rmin(0.0); - Vector_t rmax(20.0); + Vector_t rmin = {0.0, 0.0, 0.0}; + Vector_t rmax = {20.0, 20.0, 20.0}; double dx = rmax[0] / nr[0]; double dy = rmax[1] / nr[1]; double dz = rmax[2] / nr[2]; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index ea02ff985..e80bed086 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -606,11 +606,6 @@ int main(int argc, char *argv[]){ msg << "Starting parareal iterations ..." 
<< endl; - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); - //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, 0); - //Ippl::Comm->barrier(); bool isConverged = false; bool isPreviousDomainConverged; if(Ippl::Comm->rank() == 0) { @@ -625,6 +620,14 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); + + + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); + //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); + //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, + // dtFine, isConverged, tStartMySlice, 0); + //Ippl::Comm->barrier(); + //unsigned int maxIterRank; for (unsigned int it=0; itR, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //double Rerror = computeRL2Error(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + //double Perror = computePL2Error(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); //double EfieldError = 0; //if(it > 0) { // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index e4b25794e..016351a19 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -145,9 +145,9 @@ double CDF(const double& x, const double& mu, const double& sigma) { return cdf; } - -double computeL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -157,6 +157,15 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::parallel_reduce("Abs. error and norm", Q.size(), KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + + for (unsigned d = 0; d < 3; ++d) { + bool isLeft = (diff[d] <= -22.0); + bool isRight = (diff[d] >= 22.0); + bool isInside = ((diff[d] > -22.0) && (diff[d] < 22.0)); + diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) + +(isRight * (diff[d] - length[d])); + } + double myValError = dot(diff, diff).apply(); valLError += myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); @@ -165,16 +174,32 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr Kokkos::fence(); lError = std::sqrt(localError)/std::sqrt(localNorm); - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. 
Error: " << lError << std::endl; + double relError = lError;//absError / std::sqrt(globaltemp); + + return relError; - //double globaltemp = 0.0; - //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); +} - //double absError = std::sqrt(globaltemp); +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; - //globaltemp = 0.0; - //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + Kokkos::parallel_reduce("Abs. error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + valLError += myValError; + double myValnorm = dot(Qview(i), Qview(i)).apply(); + valLnorm += myValnorm; + }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); + + Kokkos::fence(); + lError = std::sqrt(localError)/std::sqrt(localNorm); double relError = lError;//absError / std::sqrt(globaltemp); @@ -182,8 +207,9 @@ double computeL2Error(ParticleAttrib& Q, ParticleAttrib& Qpr } -double computeLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -193,31 +219,69 @@ double computeLinfError(ParticleAttrib& Q, ParticleAttrib& Q Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + + for (unsigned d = 0; d < 3; ++d) { + bool isLeft = (diff[d] <= -22.0); + bool isRight = (diff[d] >= 22.0); + bool isInside = ((diff[d] > -22.0) && (diff[d] < 22.0)); + diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) + +(isRight * (diff[d] - length[d])); + } + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + //bool isIncluded = (myValError < 10.0); + + //myValError *= isIncluded; if(myValError > valLError) valLError = myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); myValnorm = std::sqrt(myValnorm); + + //myValnorm *= isIncluded; if(myValnorm > valLnorm) valLnorm = myValnorm; + + //excluded += (!isIncluded); }, Kokkos::Max(localError), Kokkos::Max(localNorm)); Kokkos::fence(); lError = localError/localNorm; - //std::cout << "Rank: " << myrank << " Iter: " << iter << " Local. Error: " << lError << std::endl; - + + double relError = lError; + + return relError; - //double globaltemp = 0.0; - //MPI_Allreduce(&localError, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); +} - //double absError = globaltemp; +double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, + const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + + auto Qview = Q.getView(); + auto QprevIterView = QprevIter.getView(); + double localError = 0.0; + double localNorm = 0.0; - //globaltemp = 0.0; - //MPI_Allreduce(&localNorm, &globaltemp, 1, MPI_DOUBLE, MPI_MAX, Ippl::getComm()); + Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), + KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ + Vector_t diff = Qview(i) - QprevIterView(i); + double myValError = dot(diff, diff).apply(); + myValError = std::sqrt(myValError); + + if(myValError > valLError) valLError = myValError; + + double myValnorm = dot(Qview(i), Qview(i)).apply(); + myValnorm = std::sqrt(myValnorm); + + if(myValnorm > valLnorm) valLnorm = myValnorm; + }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - //double relError = absError / globaltemp; + Kokkos::fence(); + lError = localError/localNorm; + double relError = lError; return relError; @@ -376,26 +440,26 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t rmin(0.0); - Vector_t rmax(20.0); - double dxPIC = rmax[0] / nrPIC[0]; - double dyPIC = rmax[1] / nrPIC[1]; - double dzPIC = rmax[2] / nrPIC[2]; - + Vector_t rmax(25.0); Vector_t length = rmax - rmin; + double dxPIC = length[0] / nrPIC[0]; + double dyPIC = length[1] / nrPIC[1]; + double dzPIC = length[2] / nrPIC[2]; + Vector_t mu, sd; for (unsigned d = 0; dR = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); + //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); + + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); @@ -615,11 +683,13 @@ int main(int argc, char *argv[]){ Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); double localRerror, localPerror; - double Rerror = computeL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror); - double Perror = computeL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); IpplTimings::stopTimer(computeErrors); From 55bc603edb5c163f903bb71b1518032d83ae50cf Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 31 Jan 2023 18:56:02 +0100 Subject: [PATCH 053/117] dumpVTK commented --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 67b8f738f..61730648d 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -341,7 +341,7 @@ class ChargedParticles : public ippl::ParticleBase { rhoNorm_m = norm(rho_m); IpplTimings::stopTimer(sumTimer); - dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); //rho = rho_e - rho_i rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); From 31720f9115b24098fc01b0ba7cf52437db8b1061 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 10 Feb 2023 14:07:07 +0100 Subject: [PATCH 054/117] cuFINUFFT interface made. 
Need to create a test and see if it works --- src/FFT/FFT.h | 77 ++++ src/FFT/FFT.hpp | 1036 ++++++++++++++++++++++++++++------------------- 2 files changed, 687 insertions(+), 426 deletions(-) diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 8a13e3b45..703fc9373 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -64,6 +65,10 @@ namespace ippl { Tag classes for Cosine transforms */ class CosTransform {}; + /** + Tag classes for Non-uniform type of Fourier transforms + */ + class NUFFTransform {}; enum FFTComm { a2av = 0, @@ -110,6 +115,29 @@ namespace ippl { using backendCos = heffte::backend::stock_cos; }; #endif +#endif + +#ifdef KOKKOS_ENABLE_CUDA + template + struct CufinufftType {}; + + template <> + struct Cufinufft { + using makeplan = cufinufftf_makeplan; + using setpts = cufinufftf_setpts; + using transform = cufinufftf_execute; + using destroy = cufinufftf_destroy; + using plan_t = cufinufftf_plan; + }; + + template <> + struct Cufinufft { + using makeplan = cufinufft_makeplan; + using setpts = cufinufft_setpts; + using transform = cufinufft_execute; + using destroy = cufinufft_destroy; + using plan_t = cufinufft_plan; + }; #endif } @@ -296,6 +324,55 @@ namespace ippl { }; + /** + Non-uniform FFT class + */ + template + class FFT { + + public: + + typedef FieldLayout Layout_t; + typedef std::complex Complex_t; + typedef Field ComplexField_t; + + using makeplan = detail::Cufinufft::makeplan; + using setpts = detail::Cufinufft::setpts; + using transform = detail::Cufinufft::transform; + using destroy = detail::Cufinufft::destroy; + using plan_t = detail::Cufinufft::plan_t; + + /** Create a new FFT object with the layout for the input Field, type + * (1 or 2) for the NUFFT and parameters for cuFINUFFT. + */ + FFT(const Layout_t& layout, int type, const ParameterList& params); + + // Destructor + ~FFT(); + + /** Do the NUFFT. + */ + template + void transform(const ParticleAttrib< Vector, Properties... >& R, + ParticleAttrib& Q, ComplexField_t& f); + + + private: + + /** + setup performs the initialization necessary. + */ + void setup(const std::array& nmodes, + const ParameterList& params); + + plan_t plan_m; + int ier_m; + T tol_m; + int type_m; + + }; + + } #include "FFT/FFT.hpp" #endif // IPPL_FFT_FFT_H diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 853651858..f804413b5 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -57,8 +57,8 @@ namespace ippl { * 1D FFTs we just have to make the length in other * dimensions to be 1. 
*/ - std::array low; - std::array high; + std::array low; + std::array high; const NDIndex& lDom = layout.getLocalNDIndex(); @@ -88,45 +88,45 @@ namespace ippl { const ParameterList& params) { - heffte::box3d inbox = {low, high}; - heffte::box3d outbox = {low, high}; + heffte::box3d inbox = {low, high}; + heffte::box3d outbox = {low, high}; - heffte::plan_options heffteOptions = - heffte::default_options(); + heffte::plan_options heffteOptions = + heffte::default_options(); - if(!params.get("use_heffte_defaults")) { - heffteOptions.use_pencils = params.get("use_pencils"); - heffteOptions.use_reorder = params.get("use_reorder"); + if(!params.get("use_heffte_defaults")) { + heffteOptions.use_pencils = params.get("use_pencils"); + heffteOptions.use_reorder = params.get("use_reorder"); #ifdef Heffte_ENABLE_GPU - heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); + heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); #endif - switch (params.get("comm")) { - - case a2a: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; - break; - case a2av: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; - break; - case p2p: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p; - break; - case p2p_pl: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; - break; - default: - throw IpplException("FFT::setup", - "Unrecognized heffte communication type"); - } - } - - heffte_m = std::make_shared> - (inbox, outbox, Ippl::getComm(), heffteOptions); - - //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); - if(workspace_m.size() < heffte_m->size_workspace()) - workspace_m = workspace_t(heffte_m->size_workspace()); + switch (params.get("comm")) { + + case a2a: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; + break; + case a2av: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; + break; + case p2p: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p; + break; + case p2p_pl: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; + break; + default: + throw IpplException("FFT::setup", + "Unrecognized heffte communication type"); + } + } + + heffte_m = std::make_shared> + (inbox, outbox, Ippl::getComm(), heffteOptions); + + //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); + if(workspace_m.size() < heffte_m->size_workspace()) + workspace_m = workspace_t(heffte_m->size_workspace()); } @@ -138,74 +138,74 @@ namespace ippl { int direction, typename FFT::ComplexField_t& f) { - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); - - using mdrange_type = Kokkos::MDRangePolicy>; - - Kokkos::parallel_for("copy from Kokkos FFT", - mdrange_type({nghost, nghost, nghost}, - {fview.extent(0) - nghost, - fview.extent(1) - nghost, - fview.extent(2) - nghost - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - tempField(i-nghost, j-nghost, k-nghost).real( - fview(i, j, k).real()); - tempField(i-nghost, j-nghost, k-nghost).imag( - fview(i, j, k).imag()); - }); - - - - - if ( direction == 1 ) - { - heffte_m->forward(tempField.data(), 
tempField.data(), workspace_m.data(), - heffte::scale::full); - } - else if ( direction == -1 ) - { - heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(), - heffte::scale::none); - } - else - { - throw std::logic_error( - "Only 1:forward and -1:backward are allowed as directions"); - } - - - Kokkos::parallel_for("copy to Kokkos FFT", - mdrange_type({nghost, nghost, nghost}, - {fview.extent(0) - nghost, - fview.extent(1) - nghost, - fview.extent(2) - nghost - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - fview(i, j, k).real() = - tempField(i-nghost, j-nghost, k-nghost).real(); - fview(i, j, k).imag() = - tempField(i-nghost, j-nghost, k-nghost).imag(); - }); + auto fview = f.getView(); + const int nghost = f.getNghost(); + + /** + *This copy to a temporary Kokkos view is needed because of following + *reasons: + *1) heffte wants the input and output fields without ghost layers + *2) heffte accepts data in layout left (by default) eventhough this + *can be changed during heffte box creation + */ + Kokkos::View + tempField("tempField", fview.extent(0) - 2*nghost, + fview.extent(1) - 2*nghost, + fview.extent(2) - 2*nghost); + + using mdrange_type = Kokkos::MDRangePolicy>; + + Kokkos::parallel_for("copy from Kokkos FFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + tempField(i-nghost, j-nghost, k-nghost).real( + fview(i, j, k).real()); + tempField(i-nghost, j-nghost, k-nghost).imag( + fview(i, j, k).imag()); + }); + + + + + if ( direction == 1 ) + { + heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(), + heffte::scale::full); + } + else if ( direction == -1 ) + { + heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(), + heffte::scale::none); + } + else + { + throw std::logic_error( + "Only 1:forward and -1:backward are allowed as directions"); + } + + + Kokkos::parallel_for("copy to Kokkos FFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + fview(i, j, k).real() = + tempField(i-nghost, j-nghost, k-nghost).real(); + fview(i, j, k).imag() = + tempField(i-nghost, j-nghost, k-nghost).imag(); + }); } @@ -275,46 +275,46 @@ namespace ippl { const ParameterList& params) { - heffte::box3d inbox = {lowInput, highInput}; - heffte::box3d outbox = {lowOutput, highOutput}; + heffte::box3d inbox = {lowInput, highInput}; + heffte::box3d outbox = {lowOutput, highOutput}; - heffte::plan_options heffteOptions = - heffte::default_options(); + heffte::plan_options heffteOptions = + heffte::default_options(); - if(!params.get("use_heffte_defaults")) { - heffteOptions.use_pencils = params.get("use_pencils"); - heffteOptions.use_reorder = params.get("use_reorder"); + if(!params.get("use_heffte_defaults")) { + heffteOptions.use_pencils = params.get("use_pencils"); + heffteOptions.use_reorder = params.get("use_reorder"); #ifdef Heffte_ENABLE_GPU - heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); + heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); #endif - switch (params.get("comm")) { - - case a2a: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; - break; - case a2av: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; - break; - case p2p: - heffteOptions.algorithm 
= heffte::reshape_algorithm::p2p; - break; - case p2p_pl: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; - break; - default: - throw IpplException("FFT::setup", - "Unrecognized heffte communication type"); - } - } - - heffte_m = std::make_shared> - (inbox, outbox, params.get("r2c_direction"), Ippl::getComm(), - heffteOptions); + switch (params.get("comm")) { + + case a2a: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; + break; + case a2av: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; + break; + case p2p: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p; + break; + case p2p_pl: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; + break; + default: + throw IpplException("FFT::setup", + "Unrecognized heffte communication type"); + } + } + + heffte_m = std::make_shared> + (inbox, outbox, params.get("r2c_direction"), Ippl::getComm(), + heffteOptions); - //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); - if(workspace_m.size() < heffte_m->size_workspace()) - workspace_m = workspace_t(heffte_m->size_workspace()); + //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); + if(workspace_m.size() < heffte_m->size_workspace()) + workspace_m = workspace_t(heffte_m->size_workspace()); } @@ -325,104 +325,104 @@ namespace ippl { typename FFT::RealField_t& f, typename FFT::ComplexField_t& g) { - auto fview = f.getView(); - auto gview = g.getView(); - const int nghostf = f.getNghost(); - const int nghostg = g.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - Kokkos::View - tempFieldf("tempFieldf", fview.extent(0) - 2*nghostf, - fview.extent(1) - 2*nghostf, - fview.extent(2) - 2*nghostf); - - Kokkos::View - tempFieldg("tempFieldg", gview.extent(0) - 2*nghostg, - gview.extent(1) - 2*nghostg, - gview.extent(2) - 2*nghostg); - - using mdrange_type = Kokkos::MDRangePolicy>; - - Kokkos::parallel_for("copy from Kokkos f field in FFT", - mdrange_type({nghostf, nghostf, nghostf}, - {fview.extent(0) - nghostf, - fview.extent(1) - nghostf, - fview.extent(2) - nghostf - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - tempFieldf(i-nghostf, j-nghostf, k-nghostf) = fview(i, j, k); - }); - Kokkos::parallel_for("copy from Kokkos g field in FFT", - mdrange_type({nghostg, nghostg, nghostg}, - {gview.extent(0) - nghostg, - gview.extent(1) - nghostg, - gview.extent(2) - nghostg - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - tempFieldg(i-nghostg, j-nghostg, k-nghostg).real( - gview(i, j, k).real()); - tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag( - gview(i, j, k).imag()); - }); + auto fview = f.getView(); + auto gview = g.getView(); + const int nghostf = f.getNghost(); + const int nghostg = g.getNghost(); + + /** + *This copy to a temporary Kokkos view is needed because of following + *reasons: + *1) heffte wants the input and output fields without ghost layers + *2) heffte accepts data in layout left (by default) eventhough this + *can be changed during heffte box creation + */ + Kokkos::View + tempFieldf("tempFieldf", fview.extent(0) - 2*nghostf, + fview.extent(1) - 2*nghostf, + fview.extent(2) - 2*nghostf); + + Kokkos::View + tempFieldg("tempFieldg", gview.extent(0) - 2*nghostg, + 
gview.extent(1) - 2*nghostg, + gview.extent(2) - 2*nghostg); + + using mdrange_type = Kokkos::MDRangePolicy>; + + Kokkos::parallel_for("copy from Kokkos f field in FFT", + mdrange_type({nghostf, nghostf, nghostf}, + {fview.extent(0) - nghostf, + fview.extent(1) - nghostf, + fview.extent(2) - nghostf + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + tempFieldf(i-nghostf, j-nghostf, k-nghostf) = fview(i, j, k); + }); + Kokkos::parallel_for("copy from Kokkos g field in FFT", + mdrange_type({nghostg, nghostg, nghostg}, + {gview.extent(0) - nghostg, + gview.extent(1) - nghostg, + gview.extent(2) - nghostg + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + tempFieldg(i-nghostg, j-nghostg, k-nghostg).real( + gview(i, j, k).real()); + tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag( + gview(i, j, k).imag()); + }); - if ( direction == 1 ) - { - heffte_m->forward( tempFieldf.data(), tempFieldg.data(), workspace_m.data(), - heffte::scale::full ); - } - else if ( direction == -1 ) - { - heffte_m->backward( tempFieldg.data(), tempFieldf.data(), workspace_m.data(), - heffte::scale::none ); - } - else - { - throw std::logic_error( - "Only 1:forward and -1:backward are allowed as directions"); - } - - - Kokkos::parallel_for("copy to Kokkos f field FFT", - mdrange_type({nghostf, nghostf, nghostf}, - {fview.extent(0) - nghostf, - fview.extent(1) - nghostf, - fview.extent(2) - nghostf - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - fview(i, j, k) = tempFieldf(i-nghostf, j-nghostf, k-nghostf); - }); - - Kokkos::parallel_for("copy to Kokkos g field FFT", - mdrange_type({nghostg, nghostg, nghostg}, - {gview.extent(0) - nghostg, - gview.extent(1) - nghostg, - gview.extent(2) - nghostg - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - gview(i, j, k).real() = - tempFieldg(i-nghostg, j-nghostg, k-nghostg).real(); - gview(i, j, k).imag() = - tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag(); - }); + if ( direction == 1 ) + { + heffte_m->forward( tempFieldf.data(), tempFieldg.data(), workspace_m.data(), + heffte::scale::full ); + } + else if ( direction == -1 ) + { + heffte_m->backward( tempFieldg.data(), tempFieldf.data(), workspace_m.data(), + heffte::scale::none ); + } + else + { + throw std::logic_error( + "Only 1:forward and -1:backward are allowed as directions"); + } + + + Kokkos::parallel_for("copy to Kokkos f field FFT", + mdrange_type({nghostf, nghostf, nghostf}, + {fview.extent(0) - nghostf, + fview.extent(1) - nghostf, + fview.extent(2) - nghostf + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + fview(i, j, k) = tempFieldf(i-nghostf, j-nghostf, k-nghostf); + }); + + Kokkos::parallel_for("copy to Kokkos g field FFT", + mdrange_type({nghostg, nghostg, nghostg}, + {gview.extent(0) - nghostg, + gview.extent(1) - nghostg, + gview.extent(2) - nghostg + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + gview(i, j, k).real() = + tempFieldg(i-nghostg, j-nghostg, k-nghostg).real(); + gview(i, j, k).imag() = + tempFieldg(i-nghostg, j-nghostg, k-nghostg).imag(); + }); } @@ -446,8 +446,8 @@ namespace ippl { * 1D FFTs we just have to make the length in other * dimensions to be 1. 
*/ - std::array low; - std::array high; + std::array low; + std::array high; const NDIndex& lDom = layout.getLocalNDIndex(); @@ -477,44 +477,44 @@ namespace ippl { const ParameterList& params) { - heffte::box3d inbox = {low, high}; - heffte::box3d outbox = {low, high}; + heffte::box3d inbox = {low, high}; + heffte::box3d outbox = {low, high}; - heffte::plan_options heffteOptions = - heffte::default_options(); + heffte::plan_options heffteOptions = + heffte::default_options(); - if(!params.get("use_heffte_defaults")) { - heffteOptions.use_pencils = params.get("use_pencils"); - heffteOptions.use_reorder = params.get("use_reorder"); + if(!params.get("use_heffte_defaults")) { + heffteOptions.use_pencils = params.get("use_pencils"); + heffteOptions.use_reorder = params.get("use_reorder"); #ifdef Heffte_ENABLE_GPU - heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); + heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); #endif - switch (params.get("comm")) { - - case a2a: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; - break; - case a2av: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; - break; - case p2p: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p; - break; - case p2p_pl: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; - break; - default: - throw IpplException("FFT::setup", - "Unrecognized heffte communication type"); - } - } - - heffte_m = std::make_shared> - (inbox, outbox, Ippl::getComm(), heffteOptions); - - //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); - if(workspace_m.size() < heffte_m->size_workspace()) - workspace_m = workspace_t(heffte_m->size_workspace()); + switch (params.get("comm")) { + + case a2a: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; + break; + case a2av: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; + break; + case p2p: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p; + break; + case p2p_pl: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; + break; + default: + throw IpplException("FFT::setup", + "Unrecognized heffte communication type"); + } + } + + heffte_m = std::make_shared> + (inbox, outbox, Ippl::getComm(), heffteOptions); + + //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); + if(workspace_m.size() < heffte_m->size_workspace()) + workspace_m = workspace_t(heffte_m->size_workspace()); } @@ -524,66 +524,66 @@ namespace ippl { int direction, typename FFT::Field_t& f) { - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); - - using mdrange_type = Kokkos::MDRangePolicy>; - - Kokkos::parallel_for("copy from Kokkos FFT", - mdrange_type({nghost, nghost, nghost}, - {fview.extent(0) - nghost, - fview.extent(1) - nghost, - fview.extent(2) - nghost - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - tempField(i-nghost, j-nghost, k-nghost) = - fview(i, j, k); - }); - - if ( direction == 1 ) - { - heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(), - heffte::scale::full); - } - else if ( direction == -1 ) - { 
- heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(), - heffte::scale::none); - } - else - { - throw std::logic_error( - "Only 1:forward and -1:backward are allowed as directions"); - } - - Kokkos::parallel_for("copy to Kokkos FFT", - mdrange_type({nghost, nghost, nghost}, - {fview.extent(0) - nghost, - fview.extent(1) - nghost, - fview.extent(2) - nghost - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - fview(i, j, k) = - tempField(i-nghost, j-nghost, k-nghost); - }); + auto fview = f.getView(); + const int nghost = f.getNghost(); + + /** + *This copy to a temporary Kokkos view is needed because of following + *reasons: + *1) heffte wants the input and output fields without ghost layers + *2) heffte accepts data in layout left (by default) eventhough this + *can be changed during heffte box creation + */ + Kokkos::View + tempField("tempField", fview.extent(0) - 2*nghost, + fview.extent(1) - 2*nghost, + fview.extent(2) - 2*nghost); + + using mdrange_type = Kokkos::MDRangePolicy>; + + Kokkos::parallel_for("copy from Kokkos FFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + tempField(i-nghost, j-nghost, k-nghost) = + fview(i, j, k); + }); + + if ( direction == 1 ) + { + heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(), + heffte::scale::full); + } + else if ( direction == -1 ) + { + heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(), + heffte::scale::none); + } + else + { + throw std::logic_error( + "Only 1:forward and -1:backward are allowed as directions"); + } + + Kokkos::parallel_for("copy to Kokkos FFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + fview(i, j, k) = + tempField(i-nghost, j-nghost, k-nghost); + }); } @@ -607,8 +607,8 @@ namespace ippl { * 1D FFTs we just have to make the length in other * dimensions to be 1. 
*/ - std::array low; - std::array high; + std::array low; + std::array high; const NDIndex& lDom = layout.getLocalNDIndex(); @@ -638,44 +638,44 @@ namespace ippl { const ParameterList& params) { - heffte::box3d inbox = {low, high}; - heffte::box3d outbox = {low, high}; + heffte::box3d inbox = {low, high}; + heffte::box3d outbox = {low, high}; - heffte::plan_options heffteOptions = - heffte::default_options(); + heffte::plan_options heffteOptions = + heffte::default_options(); - if(!params.get("use_heffte_defaults")) { - heffteOptions.use_pencils = params.get("use_pencils"); - heffteOptions.use_reorder = params.get("use_reorder"); + if(!params.get("use_heffte_defaults")) { + heffteOptions.use_pencils = params.get("use_pencils"); + heffteOptions.use_reorder = params.get("use_reorder"); #ifdef Heffte_ENABLE_GPU - heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); + heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); #endif - switch (params.get("comm")) { - - case a2a: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; - break; - case a2av: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; - break; - case p2p: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p; - break; - case p2p_pl: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; - break; - default: - throw IpplException("FFT::setup", - "Unrecognized heffte communication type"); - } - } - - heffte_m = std::make_shared> - (inbox, outbox, Ippl::getComm(), heffteOptions); - - //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); - if(workspace_m.size() < heffte_m->size_workspace()) - workspace_m = workspace_t(heffte_m->size_workspace()); + switch (params.get("comm")) { + + case a2a: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; + break; + case a2av: + heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; + break; + case p2p: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p; + break; + case p2p_pl: + heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; + break; + default: + throw IpplException("FFT::setup", + "Unrecognized heffte communication type"); + } + } + + heffte_m = std::make_shared> + (inbox, outbox, Ippl::getComm(), heffteOptions); + + //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); + if(workspace_m.size() < heffte_m->size_workspace()) + workspace_m = workspace_t(heffte_m->size_workspace()); } @@ -686,66 +686,250 @@ namespace ippl { int direction, typename FFT::Field_t& f) { - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); - - using mdrange_type = Kokkos::MDRangePolicy>; - - Kokkos::parallel_for("copy from Kokkos FFT", - mdrange_type({nghost, nghost, nghost}, - {fview.extent(0) - nghost, - fview.extent(1) - nghost, - fview.extent(2) - nghost - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - tempField(i-nghost, j-nghost, k-nghost) = - fview(i, j, k); - }); - - if ( direction == 1 ) - { - heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(), - heffte::scale::full); - } - else if ( direction == -1 ) - { 
- heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(), - heffte::scale::none); - } - else - { - throw std::logic_error( - "Only 1:forward and -1:backward are allowed as directions"); - } - - Kokkos::parallel_for("copy to Kokkos FFT", - mdrange_type({nghost, nghost, nghost}, - {fview.extent(0) - nghost, - fview.extent(1) - nghost, - fview.extent(2) - nghost - }), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) - { - fview(i, j, k) = - tempField(i-nghost, j-nghost, k-nghost); - }); + auto fview = f.getView(); + const int nghost = f.getNghost(); + + /** + *This copy to a temporary Kokkos view is needed because of following + *reasons: + *1) heffte wants the input and output fields without ghost layers + *2) heffte accepts data in layout left (by default) eventhough this + *can be changed during heffte box creation + */ + Kokkos::View + tempField("tempField", fview.extent(0) - 2*nghost, + fview.extent(1) - 2*nghost, + fview.extent(2) - 2*nghost); + + using mdrange_type = Kokkos::MDRangePolicy>; + + Kokkos::parallel_for("copy from Kokkos FFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + tempField(i-nghost, j-nghost, k-nghost) = + fview(i, j, k); + }); + + if ( direction == 1 ) + { + heffte_m->forward(tempField.data(), tempField.data(), workspace_m.data(), + heffte::scale::full); + } + else if ( direction == -1 ) + { + heffte_m->backward(tempField.data(), tempField.data(), workspace_m.data(), + heffte::scale::none); + } + else + { + throw std::logic_error( + "Only 1:forward and -1:backward are allowed as directions"); + } + + Kokkos::parallel_for("copy to Kokkos FFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + fview(i, j, k) = + tempField(i-nghost, j-nghost, k-nghost); + }); + + } + + + //========================================================================= + // FFT NUFFTransform Constructors + //========================================================================= + + /** + Create a new FFT object of type NUFFTransform, with a + given layout and cuFINUFFT parameters. + */ + + template + FFT::FFT(const Layout_t& layout, + int type, + const ParameterList& params) + { + + + /** + * cuFINUFFT requires to pass a 3D array even for 2D and + * 1D FFTs we just have to fill in other + * dimensions to be 1. Note this is different from Heffte + * where we fill 0. + */ + + std::array nmodes; + + const NDIndex& lDom = layout.getLocalNDIndex(); + + nmodes.fill(1); + + for(size_t d = 0; d < Dim; ++d) { + nmodes[d] = lDom[d].length();; + } + + type_m = type; + setup(nmodes, params); + } + + + /** + setup performs the initialization necessary. + */ + template + void + FFT::setup(const std::array& nmodes, + const ParameterList& params) + { + + cufinufft_opts opts; + ier = cufinufft_default_opts(type, Dim, &opts); + + if(!params.get("use_cufinufft_defaults")) { + tol = params.get("tolerance"); + opts.gpu_method = params.get("gpu_method"); + opts.gpu_sort = params.get("gpu_sort"); + opts.gpu_kerevalmeth = params.get("gpu_kerevalmeth"); + } + + int maxbatchsize = 0; //default option. 
ignored for ntransf = 1 which + // is our case + + int iflag; + + if(type_m == 1) { + iflag = -1; + } + else if(type_m == 2) { + iflag = 1; + } + else { + throw std::logic_error("Only type 1 and type 2 NUFFT are allowed now"); + } + + ier = makeplan(type_m, Dim, nmodes, iflag, 1, tol, + maxbatchsize, &plan, &opts); + + } + + + + template + template + void + FFT::transform(const ParticleAttrib< Vector, Properties... >& R, + ParticleAttrib& Q, + typename FFT::ComplexField_t& f) + { + auto fview = f.getView(); + auto Rview = R.getView(); + auto Qview = Q.getView(); + const int nghost = f.getNghost(); + + auto localNp = R.getParticleCount(); + + /** + * cuFINUFFT's layout is left, hence we allocate the temporary + * Kokkos views with the same layout + */ + Kokkos::View + tempField("tempField", fview.extent(0) - 2*nghost, + fview.extent(1) - 2*nghost, + fview.extent(2) - 2*nghost); + + + std::array, 3> tempR; + + tempR.fill(NULL); + + for(size_t d = 0; d < Dim; ++d) { + Kokkos::realloc(tempR[d], localNp); + } + + Kokkos::View*,Kokkos::LayoutLeft> tempQ("tempQ", localNp); + + using mdrange_type = Kokkos::MDRangePolicy>; + + Kokkos::parallel_for("copy from field data NUFFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + tempField(i-nghost, j-nghost, k-nghost).real( + fview(i, j, k).real()); + tempField(i-nghost, j-nghost, k-nghost).imag( + fview(i, j, k).imag()); + }); + + + Kokkos::parallel_for("copy from particle data NUFFT", + localNp, + KOKKOS_LAMBDA(const size_t i) + { + for(size_t d = 0; d < Dim; ++d) { + temp[R](i) = Rview(i)[d]; + } + tempQ(i).real(Qview(i)); + tempQ(i).imag(0.0); + }); + + ier = setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, + NULL, NULL, NULL, plan); + + ier = transform(tempQ.data(), tempField.data(), plan); + + + if(type_m == 1) { + Kokkos::parallel_for("copy to field data NUFFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost + }), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + fview(i, j, k).real() = + tempField(i-nghost, j-nghost, k-nghost).real(); + fview(i, j, k).imag() = + tempField(i-nghost, j-nghost, k-nghost).imag(); + }); + } + else if(type_m == 2) { + Kokkos::parallel_for("copy to particle data NUFFT", + localNp, + KOKKOS_LAMBDA(const size_t i) + { + Qview(i) = tempQ(i).real(); + }); + } + } + + template + FFT::~FFT() { + + ier = destroy(plan); } } From 64d3df9980d1476d3fc1903260194e84366cf365 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 13 Feb 2023 16:54:07 +0100 Subject: [PATCH 055/117] Interface made and test done but have some compilation issues --- CMakeLists.txt | 7 +++++ src/CMakeLists.txt | 8 +++++- src/FFT/FFT.h | 60 ++++++++++++++++++++++++++--------------- src/FFT/FFT.hpp | 52 ++++++++++++++++++----------------- test/FFT/CMakeLists.txt | 6 +++++ 5 files changed, 87 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91e072bf8..8f15ec370 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,13 @@ if (ENABLE_FFT) message (STATUS "Found Heffte_DIR: ${Heffte_DIR}") endif () +option (ENABLE_NUFFT "Enable NUFFT transform" OFF) +if (ENABLE_NUFFT) + add_definitions (-DENABLE_NUFFT) + find_package(CUFINUFFT REQUIRED) + message (STATUS "Found CUFINUFFT_DIR: ${CUFINUFFT_DIR}") +endif () + option (ENABLE_SOLVERS 
"Enable IPPL solvers" OFF) add_subdirectory (src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ad4b1f186..b4c04d6c5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,7 +94,13 @@ include_directories ( add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} ) -target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}) + +if (ENABLE_NUFFT) + target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR}/libcufinufft.a) +else() + target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}) +endif() + install (TARGETS ippl DESTINATION lib) install (FILES ${IPPL_BASEDIR_HDRS} DESTINATION include) diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 703fc9373..16fab61f3 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -37,6 +37,7 @@ #include "FieldLayout/FieldLayout.h" #include "Field/Field.h" +#include "Particle/ParticleAttrib.h" #include "Utility/ParameterList.h" #include "Utility/IpplException.h" @@ -122,21 +123,33 @@ namespace ippl { struct CufinufftType {}; template <> - struct Cufinufft { - using makeplan = cufinufftf_makeplan; - using setpts = cufinufftf_setpts; - using transform = cufinufftf_execute; - using destroy = cufinufftf_destroy; - using plan_t = cufinufftf_plan; + struct CufinufftType { + //using makeplan = typename cufinufftf_makeplan; + //using setpts = typename cufinufftf_setpts; + //using execute = typename cufinufftf_execute; + //using destroy = typename cufinufftf_destroy; + //using plan_t = typename cufinufftf_plan; + + + //typedef typename cufinufftf_makeplan makeplan; + //typedef typename cufinufftf_setpts setpts; + //typedef typename cufinufftf_execute execute; + //typedef typename cufinufftf_destroy destroy; + //typedef typename cufinufftf_plan plan_t; }; template <> - struct Cufinufft { - using makeplan = cufinufft_makeplan; - using setpts = cufinufft_setpts; - using transform = cufinufft_execute; - using destroy = cufinufft_destroy; - using plan_t = cufinufft_plan; + struct CufinufftType { + //using makeplan = typename cufinufft_makeplan; + //using setpts = typename cufinufft_setpts; + //using execute = typename cufinufft_execute; + //using destroy = typename cufinufft_destroy; + //using plan_t = typename cufinufft_plan; + //typedef typename cufinufft_makeplan makeplan; + //typedef typename cufinufft_setpts setpts; + //typedef typename cufinufft_execute execute; + //typedef typename cufinufft_destroy destroy; + //typedef typename cufinufft_plan plan_t; }; #endif } @@ -333,14 +346,15 @@ namespace ippl { public: typedef FieldLayout Layout_t; - typedef std::complex Complex_t; - typedef Field ComplexField_t; + typedef std::complex StdComplex_t; + typedef Kokkos::complex KokkosComplex_t; + typedef Field ComplexField_t; - using makeplan = detail::Cufinufft::makeplan; - using setpts = detail::Cufinufft::setpts; - using transform = detail::Cufinufft::transform; - using destroy = detail::Cufinufft::destroy; - using plan_t = detail::Cufinufft::plan_t; + //using makeplan = typename detail::CufinufftType::makeplan; + //using setpts = typename detail::CufinufftType::setpts; + //using execute = typename detail::CufinufftType::execute; + //using destroy = typename detail::CufinufftType::destroy; + //using plan_t = typename detail::CufinufftType::plan_t; /** Create a new FFT object with the layout for the input Field, type * (1 or 2) for the NUFFT and parameters for cuFINUFFT. @@ -355,6 +369,9 @@ namespace ippl { template void transform(const ParticleAttrib< Vector, Properties... 
>& R, ParticleAttrib& Q, ComplexField_t& f); + //template + //void transform(const ParticleAttrib< Vector>& R, + // ParticleAttrib>& Q, ComplexField_t& f); private: @@ -362,10 +379,11 @@ namespace ippl { /** setup performs the initialization necessary. */ - void setup(const std::array& nmodes, + void setup(std::array& nmodes, const ParameterList& params); - plan_t plan_m; + //plan_t plan_m; + cufinufft_plan plan_m; int ier_m; T tol_m; int type_m; diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index f804413b5..59abf7184 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -793,15 +793,15 @@ namespace ippl { */ template void - FFT::setup(const std::array& nmodes, + FFT::setup(std::array& nmodes, const ParameterList& params) { cufinufft_opts opts; - ier = cufinufft_default_opts(type, Dim, &opts); + ier_m = cufinufft_default_opts(type_m, Dim, &opts); if(!params.get("use_cufinufft_defaults")) { - tol = params.get("tolerance"); + tol_m = params.get("tolerance"); opts.gpu_method = params.get("gpu_method"); opts.gpu_sort = params.get("gpu_sort"); opts.gpu_kerevalmeth = params.get("gpu_kerevalmeth"); @@ -821,9 +821,10 @@ namespace ippl { else { throw std::logic_error("Only type 1 and type 2 NUFFT are allowed now"); } - - ier = makeplan(type_m, Dim, nmodes, iflag, 1, tol, - maxbatchsize, &plan, &opts); + + int dim = static_cast(Dim); + ier_m = cufinufft_makeplan(type_m, dim, nmodes.data(), iflag, 1, tol_m, + maxbatchsize, &plan_m, &opts); } @@ -835,6 +836,9 @@ namespace ippl { FFT::transform(const ParticleAttrib< Vector, Properties... >& R, ParticleAttrib& Q, typename FFT::ComplexField_t& f) + //FFT::transform(const ParticleAttrib< Vector>& R, + // ParticleAttrib>& Q, + // typename FFT::ComplexField_t& f) { auto fview = f.getView(); auto Rview = R.getView(); @@ -847,21 +851,21 @@ namespace ippl { * cuFINUFFT's layout is left, hence we allocate the temporary * Kokkos views with the same layout */ - Kokkos::View + Kokkos::View tempField("tempField", fview.extent(0) - 2*nghost, fview.extent(1) - 2*nghost, fview.extent(2) - 2*nghost); - std::array, 3> tempR; + Vector, 3> tempR; - tempR.fill(NULL); for(size_t d = 0; d < Dim; ++d) { Kokkos::realloc(tempR[d], localNp); } + - Kokkos::View*,Kokkos::LayoutLeft> tempQ("tempQ", localNp); + Kokkos::View tempQ("tempQ", localNp); using mdrange_type = Kokkos::MDRangePolicy>; @@ -875,10 +879,10 @@ namespace ippl { const size_t j, const size_t k) { - tempField(i-nghost, j-nghost, k-nghost).real( - fview(i, j, k).real()); - tempField(i-nghost, j-nghost, k-nghost).imag( - fview(i, j, k).imag()); + tempField(i-nghost, j-nghost, k-nghost).x = + fview(i, j, k).real(); + tempField(i-nghost, j-nghost, k-nghost).y = + fview(i, j, k).imag(); }); @@ -887,16 +891,16 @@ namespace ippl { KOKKOS_LAMBDA(const size_t i) { for(size_t d = 0; d < Dim; ++d) { - temp[R](i) = Rview(i)[d]; + tempR[d](i) = Rview(i)[d]; } - tempQ(i).real(Qview(i)); - tempQ(i).imag(0.0); + tempQ(i).x = Qview(i).real(); + tempQ(i).y = Qview(i).imag(); }); - ier = setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, - NULL, NULL, NULL, plan); + ier_m = cufinufft_setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, + NULL, NULL, NULL, plan_m); - ier = transform(tempQ.data(), tempField.data(), plan); + ier_m = cufinufft_execute(tempQ.data(), tempField.data(), plan_m); if(type_m == 1) { @@ -911,9 +915,9 @@ namespace ippl { const size_t k) { fview(i, j, k).real() = - tempField(i-nghost, j-nghost, k-nghost).real(); + tempField(i-nghost, j-nghost, k-nghost).x; fview(i, j, k).imag() 
= - tempField(i-nghost, j-nghost, k-nghost).imag(); + tempField(i-nghost, j-nghost, k-nghost).y; }); } else if(type_m == 2) { @@ -921,7 +925,7 @@ namespace ippl { localNp, KOKKOS_LAMBDA(const size_t i) { - Qview(i) = tempQ(i).real(); + Qview(i) = tempQ(i).x; }); } } @@ -929,7 +933,7 @@ namespace ippl { template FFT::~FFT() { - ier = destroy(plan); + ier_m = cufinufft_destroy(plan_m); } } diff --git a/test/FFT/CMakeLists.txt b/test/FFT/CMakeLists.txt index 5d0332166..7b7ecfdde 100644 --- a/test/FFT/CMakeLists.txt +++ b/test/FFT/CMakeLists.txt @@ -39,6 +39,12 @@ target_link_libraries ( ${IPPL_LIBS} ${MPI_CXX_LIBRARIES} ) +add_executable (TestNUFFT1 TestNUFFT1.cpp) +target_link_libraries ( + TestNUFFT1 + ${IPPL_LIBS} + ${MPI_CXX_LIBRARIES} +) # vi: set et ts=4 sw=4 sts=4: # Local Variables: From e1ab9d118a0b2bdaaabba8b24831e56ac496fa96 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 13 Feb 2023 16:55:04 +0100 Subject: [PATCH 056/117] test file added --- test/FFT/TestNUFFT1.cpp | 199 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 test/FFT/TestNUFFT1.cpp diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp new file mode 100644 index 000000000..822236627 --- /dev/null +++ b/test/FFT/TestNUFFT1.cpp @@ -0,0 +1,199 @@ +#include "Ippl.h" +#include "Utility/ParameterList.h" + +#include +#include +#include +#include +#include + +template +struct Bunch : public ippl::ParticleBase +{ + + Bunch(PLayout& playout) + : ippl::ParticleBase(playout) + { + this->addAttribute(Q); + } + + ~Bunch(){ } + + typedef ippl::ParticleAttrib> charge_container_type; + charge_container_type Q; + +}; + +template +struct generate_random { + + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + using view_type_complex = typename ippl::detail::ViewType, 1>::view_type; + // Output View for the random numbers + view_type x; + + view_type_complex Q; + + // The GeneratorPool + GeneratorPool rand_pool; + + T minU, maxU; + + // Initialize all members + generate_random(view_type x_,view_type_complex Q_, GeneratorPool rand_pool_, + T& minU_, T& maxU_) + : x(x_), Q(Q_), rand_pool(rand_pool_), + minU(minU_), maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + for (unsigned d = 0; d < Dim; ++d) { + x(i)[d] = rand_gen.drand(minU[d], maxU[d]); + } + Q(i).real() = rand_gen.drand(0.0, 1.0); + Q(i).imag() = rand_gen.drand(0.0, 1.0); + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + + +int main(int argc, char *argv[]) { + + Ippl ippl(argc,argv); + + constexpr unsigned int dim = 3; + const double pi = std::acos(-1.0); + + typedef ippl::ParticleSpatialLayout playout_type; + typedef Bunch bunch_type; + + + std::array pt = {32, 32, 32}; + ippl::Index I(pt[0]); + ippl::Index J(pt[1]); + ippl::Index K(pt[2]); + ippl::NDIndex owned(I, J, K); + + ippl::e_dim_tag decomp[dim]; // Specifies SERIAL, PARALLEL dims + for (unsigned int d=0; d layout(owned, decomp); + + std::array dx = { + 2.0 * pi / double(pt[0]), + 2.0 * pi / double(pt[1]), + 2.0 * pi / double(pt[2]), + }; + + typedef ippl::Vector Vector_t; + + Vector_t hx = {dx[0], dx[1], dx[2]}; + Vector_t origin = {-pi, -pi, -pi}; + ippl::UniformCartesian mesh(owned, hx, origin); + + playout_type pl(layout, mesh); + + bunch_type 
bunch(pl); + bunch.setParticleBC(ippl::BC::PERIODIC); + + using size_type = ippl::detail::size_type; + + + size_type Np = std::pow(32,3) * 10; + + typedef ippl::Field, dim> field_type; + + field_type field(mesh, layout); + + ippl::ParameterList fftParams; + + fftParams.add("use_cufinufft_defaults", true); + + typedef ippl::FFT FFT_type; + + std::unique_ptr fft; + + int type = 1; + + fft = std::make_unique(layout, type, fftParams); + + Vector_t minU = {-pi, -pi, -pi}; + Vector_t maxU = {pi, pi, pi}; + + + size_type nloc = Np/Ippl::Comm->size(); + + bunch.create(nloc); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42)); + Kokkos::parallel_for(nloc, + generate_random, dim>( + bunch.R.getView(), bunch.Q.getView(), rand_pool64, minU, maxU)); + + + fft->transform(bunch.R, bunch.Q, field); + + auto field_result = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), field.getView()); + + Kokkos::complex max_error_abs(0.0, 0.0); + Kokkos::complex max_error_rel(0.0, 0.0); + + //Pick some mode to check. We choose it same as cuFINUFFT testcase example2d1many.cpp in + //the first 2 dimensions + ippl::Vector kVec; + kVec[0] = (int)(0.37 * pt[0]); + kVec[1] = (int)(0.26 * pt[1]); + kVec[2] = (int)(0.20 * pt[2]); + + //Linearize based on LayoutLeft and the results from cuFINUFFT are already fftshifted + //int it = (pt[0]/2 + kVec[0]) + (pt[0] * (pt[1]/2 + kVec[1])) + + // (pt[0] * pt[1] * (pt[2]/2 + kVec[2])); + + int iInd = (pt[0]/2 + kVec[0]); + int jInd = (pt[1]/2 + kVec[1]); + int kInd = (pt[2]/2 + kVec[2]); + + + Kokkos::complex reducedValue(0.0, 0.0); + + auto Rview = bunch.R.getView(); + auto Qview = bunch.Q.getView(); + + Kokkos::complex imag = {0.0, 1.0}; + + Kokkos::parallel_reduce("NUDFT type1", nloc, + KOKKOS_LAMBDA(const size_t idx, Kokkos::complex& valL) { + + double arg = 0.0; + for(size_t d = 0; d < dim; ++d) { + arg += kVec[d]*Rview(idx)[d]; + } + + valL += (Kokkos::Experimental::cos(arg) + - imag * Kokkos::Experimental::sin(arg)) * Qview(idx); + }, Kokkos::Sum>(reducedValue)); + + double abs_error_real = std::fabs(reducedValue.real() - field_result(iInd, jInd, kInd).real()); + double rel_error_real = std::fabs(reducedValue.real() - field_result(iInd, jInd, kInd).real()) /std::fabs(reducedValue.real()); + double abs_error_imag = std::fabs(reducedValue.imag() - field_result(iInd, jInd, kInd).imag()); + double rel_error_imag = std::fabs(reducedValue.imag() - field_result(iInd, jInd, kInd).imag()) /std::fabs(reducedValue.imag()); + + std::cout << "Abs Error in real part: " << std::setprecision(16) + << abs_error_real << "Rel. error: " << std::setprecision(16) << rel_error_real << std::endl; + std::cout << "Abs Error in imag part: " << std::setprecision(16) + << abs_error_imag << "Rel. 
error: " << std::setprecision(16) << rel_error_imag << std::endl; + + + //Kokkos::complex max_error(0.0, 0.0); + //MPI_Reduce(&max_error_local, &max_error, 1, + // MPI_C_DOUBLE_COMPLEX, MPI_MAX, 0, Ippl::getComm()); + + return 0; +} From 9f7534f129b3e31118d898686605043bb2a33bdb Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 14 Feb 2023 08:56:56 +0100 Subject: [PATCH 057/117] Find cmake file added for cufinufft --- CMakeModules/FindCUFINUFFT.cmake | 31 +++++++++++++++++++++++++++++++ src/CMakeLists.txt | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 CMakeModules/FindCUFINUFFT.cmake diff --git a/CMakeModules/FindCUFINUFFT.cmake b/CMakeModules/FindCUFINUFFT.cmake new file mode 100644 index 000000000..755062d33 --- /dev/null +++ b/CMakeModules/FindCUFINUFFT.cmake @@ -0,0 +1,31 @@ +# +# Find CUFINUFFT includes and library +# +# CUFINUFFT_INCLUDE_DIR - where to find cufinufft.h +# CUFINUFFT_LIBRARY - libcufinufft.a path +# CUFINUFFT_FOUND - do not attempt to use if "no" or undefined. + +FIND_PATH(CUFINUFFT_INCLUDE_DIR cufinufft.h + HINTS $ENV{CUFINUFFT_INCLUDE_PATH} $ENV{CUFINUFFT_INCLUDE_DIR} $ENV{CUFINUFFT_PREFIX}/include $ENV{CUFINUFFT_DIR}/include ${PROJECT_SOURCE_DIR}/include + PATHS ENV C_INCLUDE_PATH +) + +FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.a + HINTS $ENV{CUFINUFFT_LIBRARY_PATH} $ENV{CUFINUFFT_LIBRARY_DIR} $ENV{CUFINUFFT_PREFIX}/lib-static $ENV{CUFINUFFT_DIR}/lib-static $ENV{CUFINUFFT}/lib-static ${PROJECT_SOURCE_DIR}/lib-static + PATHS ENV LIBRARY_PATH +) + +IF(CUFINUFFT_INCLUDE_DIR AND CUFINUFFT_LIBRARY_DIR) + SET( CUFINUFFT_FOUND "YES" ) +ENDIF() + +IF (CUFINUFFT_FOUND) + IF (NOT CUFINUFFT_FIND_QUIETLY) + MESSAGE(STATUS "Found cufinufft library dir: ${CUFINUFFT_LIBRARY_DIR}") + MESSAGE(STATUS "Found cufinufft include dir: ${CUFINUFFT_INCLUDE_DIR}") + ENDIF (NOT CUFINUFFT_FIND_QUIETLY) +ELSE (CUFINUFFT_FOUND) + IF (CUFINUFFT_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find CUFINUFFT!") + ENDIF (CUFINUFFT_FIND_REQUIRED) +ENDIF (CUFINUFFT_FOUND) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b4c04d6c5..8c96a6bc7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,7 +96,7 @@ add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} ) if (ENABLE_NUFFT) - target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR}/libcufinufft.a) + target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR}) else() target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}) endif() From dccf8cac5b049465c2ec25486b1b4e7a4e94d594 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 14 Feb 2023 13:27:34 +0100 Subject: [PATCH 058/117] Test for type 2 NUFFT also added --- CMakeModules/FindCUFINUFFT.cmake | 6 +- src/FFT/FFT.hpp | 31 +++-- test/FFT/CMakeLists.txt | 6 + test/FFT/TestNUFFT1.cpp | 30 ++-- test/FFT/TestNUFFT2.cpp | 229 +++++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+), 29 deletions(-) create mode 100644 test/FFT/TestNUFFT2.cpp diff --git a/CMakeModules/FindCUFINUFFT.cmake b/CMakeModules/FindCUFINUFFT.cmake index 755062d33..691eb510f 100644 --- a/CMakeModules/FindCUFINUFFT.cmake +++ b/CMakeModules/FindCUFINUFFT.cmake @@ -7,11 +7,11 @@ FIND_PATH(CUFINUFFT_INCLUDE_DIR cufinufft.h HINTS $ENV{CUFINUFFT_INCLUDE_PATH} $ENV{CUFINUFFT_INCLUDE_DIR} $ENV{CUFINUFFT_PREFIX}/include $ENV{CUFINUFFT_DIR}/include ${PROJECT_SOURCE_DIR}/include - PATHS ENV C_INCLUDE_PATH + PATHS ENV CPP_INCLUDE_PATH ) 
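(Illustrative consolidation, not part of any patch in this series.) The NUFFT build wiring introduced across these commits is spread over the top-level CMakeLists.txt (the ENABLE_NUFFT option and find_package call), this find module, and src/CMakeLists.txt (include paths and linking). Assuming the cuFINUFFT install prefix is exported as CUFINUFFT_DIR so the HINTS above can resolve it, the pieces are expected to combine roughly as sketched below. Note that CUFINUFFT_LIBRARY_DIR, despite its name, holds the full library path returned by FIND_LIBRARY, which is why it is passed straight to target_link_libraries.

    option (ENABLE_NUFFT "Enable NUFFT transform" OFF)
    if (ENABLE_NUFFT)
        add_definitions (-DENABLE_NUFFT)
        find_package (CUFINUFFT REQUIRED)   # resolved by FindCUFINUFFT.cmake above
        target_include_directories (ippl PUBLIC ${CUFINUFFT_INCLUDE_DIR})
        target_link_libraries (ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}
                                           ${CUFINUFFT_LIBRARY_DIR})
    else ()
        target_link_libraries (ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY})
    endif ()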
-FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.a - HINTS $ENV{CUFINUFFT_LIBRARY_PATH} $ENV{CUFINUFFT_LIBRARY_DIR} $ENV{CUFINUFFT_PREFIX}/lib-static $ENV{CUFINUFFT_DIR}/lib-static $ENV{CUFINUFFT}/lib-static ${PROJECT_SOURCE_DIR}/lib-static +FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.so + HINTS $ENV{CUFINUFFT_LIBRARY_PATH} $ENV{CUFINUFFT_LIBRARY_DIR} $ENV{CUFINUFFT_PREFIX}/lib $ENV{CUFINUFFT_DIR}/lib $ENV{CUFINUFFT}/lib ${PROJECT_SOURCE_DIR}/lib PATHS ENV LIBRARY_PATH ) diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 59abf7184..4ef372730 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -799,6 +799,7 @@ namespace ippl { cufinufft_opts opts; ier_m = cufinufft_default_opts(type_m, Dim, &opts); + tol_m = 1e-6; if(!params.get("use_cufinufft_defaults")) { tol_m = params.get("tolerance"); @@ -836,9 +837,6 @@ namespace ippl { FFT::transform(const ParticleAttrib< Vector, Properties... >& R, ParticleAttrib& Q, typename FFT::ComplexField_t& f) - //FFT::transform(const ParticleAttrib< Vector>& R, - // ParticleAttrib>& Q, - // typename FFT::ComplexField_t& f) { auto fview = f.getView(); auto Rview = R.getView(); @@ -857,12 +855,15 @@ namespace ippl { fview.extent(2) - 2*nghost); - Vector, 3> tempR; + //Vector, 3> tempR; + Kokkos::View tempRx("tempRx", localNp); + Kokkos::View tempRy("tempRy", localNp); + Kokkos::View tempRz("tempRz", localNp); - for(size_t d = 0; d < Dim; ++d) { - Kokkos::realloc(tempR[d], localNp); - } + //for(size_t d = 0; d < Dim; ++d) { + // Kokkos::realloc(tempR[d], localNp); + //} Kokkos::View tempQ("tempQ", localNp); @@ -890,14 +891,19 @@ namespace ippl { localNp, KOKKOS_LAMBDA(const size_t i) { - for(size_t d = 0; d < Dim; ++d) { - tempR[d](i) = Rview(i)[d]; - } + //for(size_t d = 0; d < Dim; ++d) { + // tempR[d](i) = Rview(i)[d]; + //} + tempRx(i) = Rview(i)[0]; + tempRy(i) = Rview(i)[1]; + tempRz(i) = Rview(i)[2]; tempQ(i).x = Qview(i).real(); tempQ(i).y = Qview(i).imag(); }); - ier_m = cufinufft_setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, + //ier_m = cufinufft_setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, + // NULL, NULL, NULL, plan_m); + ier_m = cufinufft_setpts(localNp, tempRx.data(), tempRy.data(), tempRz.data(), 0, NULL, NULL, NULL, plan_m); ier_m = cufinufft_execute(tempQ.data(), tempField.data(), plan_m); @@ -925,7 +931,8 @@ namespace ippl { localNp, KOKKOS_LAMBDA(const size_t i) { - Qview(i) = tempQ(i).x; + Qview(i).real() = tempQ(i).x; + Qview(i).imag() = tempQ(i).y; }); } } diff --git a/test/FFT/CMakeLists.txt b/test/FFT/CMakeLists.txt index 7b7ecfdde..4d3e5fe90 100644 --- a/test/FFT/CMakeLists.txt +++ b/test/FFT/CMakeLists.txt @@ -45,6 +45,12 @@ target_link_libraries ( ${IPPL_LIBS} ${MPI_CXX_LIBRARIES} ) +add_executable (TestNUFFT2 TestNUFFT2.cpp) +target_link_libraries ( + TestNUFFT2 + ${IPPL_LIBS} + ${MPI_CXX_LIBRARIES} +) # vi: set et ts=4 sw=4 sts=4: # Local Variables: diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index 822236627..06ac71234 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -74,7 +74,7 @@ int main(int argc, char *argv[]) { typedef Bunch bunch_type; - std::array pt = {32, 32, 32}; + std::array pt = {256, 256, 256}; ippl::Index I(pt[0]); ippl::Index J(pt[1]); ippl::Index K(pt[2]); @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) { using size_type = ippl::detail::size_type; - size_type Np = std::pow(32,3) * 10; + size_type Np = std::pow(256,3) * 8; typedef ippl::Field, dim> field_type; @@ -114,7 +114,12 @@ int main(int argc, char *argv[]) { 
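// Illustrative note, not part of the patch: with "use_cufinufft_defaults" set
// to false, FFT<NUFFTransform>::setup() (see the FFT.hpp hunks earlier in this
// series) forwards the keys added below into the cuFINUFFT plan options,
// roughly as:
//
//     cufinufft_opts opts;
//     cufinufft_default_opts(type_m, Dim, &opts);  // start from library defaults
//     tol_m                = params.get<double>("tolerance");
//     opts.gpu_method      = params.get<int>("gpu_method");
//     opts.gpu_sort        = params.get<int>("gpu_sort");
//     opts.gpu_kerevalmeth = params.get<int>("gpu_kerevalmeth");
//
// When the defaults are kept, the tolerance falls back to 1e-6. The templated
// ParameterList::get<T>() spelling is an assumption here; the setup() code
// shown above is authoritative.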
ippl::ParameterList fftParams; - fftParams.add("use_cufinufft_defaults", true); + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-6); + + fftParams.add("use_cufinufft_defaults", false); typedef ippl::FFT FFT_type; @@ -145,20 +150,17 @@ int main(int argc, char *argv[]) { Kokkos::complex max_error_abs(0.0, 0.0); Kokkos::complex max_error_rel(0.0, 0.0); - //Pick some mode to check. We choose it same as cuFINUFFT testcase example2d1many.cpp in - //the first 2 dimensions + //Pick some mode to check. We choose it same as cuFINUFFT testcase cufinufft3d1_test.cu ippl::Vector kVec; kVec[0] = (int)(0.37 * pt[0]); kVec[1] = (int)(0.26 * pt[1]); - kVec[2] = (int)(0.20 * pt[2]); + kVec[2] = (int)(0.13 * pt[2]); - //Linearize based on LayoutLeft and the results from cuFINUFFT are already fftshifted - //int it = (pt[0]/2 + kVec[0]) + (pt[0] * (pt[1]/2 + kVec[1])) + - // (pt[0] * pt[1] * (pt[2]/2 + kVec[2])); + const int nghost = field.getNghost(); - int iInd = (pt[0]/2 + kVec[0]); - int jInd = (pt[1]/2 + kVec[1]); - int kInd = (pt[2]/2 + kVec[2]); + int iInd = (pt[0]/2 + kVec[0] + nghost); + int jInd = (pt[1]/2 + kVec[1] + nghost); + int kInd = (pt[2]/2 + kVec[2] + nghost); Kokkos::complex reducedValue(0.0, 0.0); @@ -186,9 +188,9 @@ int main(int argc, char *argv[]) { double rel_error_imag = std::fabs(reducedValue.imag() - field_result(iInd, jInd, kInd).imag()) /std::fabs(reducedValue.imag()); std::cout << "Abs Error in real part: " << std::setprecision(16) - << abs_error_real << "Rel. error: " << std::setprecision(16) << rel_error_real << std::endl; + << abs_error_real << " Rel. error in real part: " << std::setprecision(16) << rel_error_real << std::endl; std::cout << "Abs Error in imag part: " << std::setprecision(16) - << abs_error_imag << "Rel. error: " << std::setprecision(16) << rel_error_imag << std::endl; + << abs_error_imag << " Rel. 
error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl; //Kokkos::complex max_error(0.0, 0.0); diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp new file mode 100644 index 000000000..147c2ba74 --- /dev/null +++ b/test/FFT/TestNUFFT2.cpp @@ -0,0 +1,229 @@ +#include "Ippl.h" +#include "Utility/ParameterList.h" + +#include +#include +#include +#include +#include + +template +struct Bunch : public ippl::ParticleBase +{ + + Bunch(PLayout& playout) + : ippl::ParticleBase(playout) + { + this->addAttribute(Q); + } + + ~Bunch(){ } + + typedef ippl::ParticleAttrib> charge_container_type; + charge_container_type Q; + +}; + +template +struct generate_random_particles { + + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + // Output View for the random numbers + view_type x; + + // The GeneratorPool + GeneratorPool rand_pool; + + T minU, maxU; + + // Initialize all members + generate_random_particles(view_type x_, GeneratorPool rand_pool_, + T& minU_, T& maxU_) + : x(x_), rand_pool(rand_pool_), + minU(minU_), maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + for (unsigned d = 0; d < Dim; ++d) { + x(i)[d] = rand_gen.drand(minU[d], maxU[d]); + } + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +template +struct generate_random_field { + + using view_type = typename ippl::detail::ViewType::view_type; + view_type f; + + // The GeneratorPool + GeneratorPool rand_pool; + + // Initialize all members + generate_random_field(view_type f_, GeneratorPool rand_pool_) + : f(f_), rand_pool(rand_pool_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i, const size_t j, const size_t k) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + f(i, j, k).real() = rand_gen.drand(0.0, 1.0); + f(i, j, k).imag() = rand_gen.drand(0.0, 1.0); + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +int main(int argc, char *argv[]) { + + Ippl ippl(argc,argv); + + constexpr unsigned int dim = 3; + const double pi = std::acos(-1.0); + + typedef ippl::ParticleSpatialLayout playout_type; + typedef Bunch bunch_type; + + + ippl::Vector pt = {32, 32, 32}; + ippl::Index I(pt[0]); + ippl::Index J(pt[1]); + ippl::Index K(pt[2]); + ippl::NDIndex owned(I, J, K); + + ippl::e_dim_tag decomp[dim]; // Specifies SERIAL, PARALLEL dims + for (unsigned int d=0; d layout(owned, decomp); + + std::array dx = { + 2.0 * pi / double(pt[0]), + 2.0 * pi / double(pt[1]), + 2.0 * pi / double(pt[2]), + }; + + typedef ippl::Vector Vector_t; + //typedef ippl::Vector, 3> CxVector_t; + + Vector_t hx = {dx[0], dx[1], dx[2]}; + Vector_t origin = {-pi, -pi, -pi}; + ippl::UniformCartesian mesh(owned, hx, origin); + + playout_type pl(layout, mesh); + + bunch_type bunch(pl); + bunch.setParticleBC(ippl::BC::PERIODIC); + + using size_type = ippl::detail::size_type; + + + size_type Np = std::pow(32,3) * 10; + + typedef ippl::Field, dim> field_type; + + field_type field(mesh, layout); + + ippl::ParameterList fftParams; + + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-12); + + 
fftParams.add("use_cufinufft_defaults", false); + + typedef ippl::FFT FFT_type; + + std::unique_ptr fft; + + int type = 2; + + fft = std::make_unique(layout, type, fftParams); + + Vector_t minU = {-pi, -pi, -pi}; + Vector_t maxU = {pi, pi, pi}; + + + size_type nloc = Np/Ippl::Comm->size(); + + const int nghost = field.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + auto fview = field.getView(); + bunch.create(nloc); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42)); + Kokkos::parallel_for(nloc, + generate_random_particles, dim>( + bunch.R.getView(), rand_pool64, minU, maxU)); + + Kokkos::parallel_for(mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost}), + generate_random_field, Kokkos::Random_XorShift64_Pool<>, dim>( + field.getView(), rand_pool64)); + + fft->transform(bunch.R, bunch.Q, field); + + auto Q_result = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), bunch.Q.getView()); + + Kokkos::complex max_error_abs(0.0, 0.0); + Kokkos::complex max_error_rel(0.0, 0.0); + + //Pick some target point to check. We choose it same as cuFINUFFT testcase cufinufft3d2_test.cu + + int idx = nloc/2; + + Kokkos::complex reducedValue(0.0, 0.0); + + auto Rview = bunch.R.getView(); + + Kokkos::complex imag = {0.0, 1.0}; + + Kokkos::parallel_reduce("NUDFT type2", + mdrange_type({0, 0, 0}, + {fview.extent(0) - 2 * nghost, + fview.extent(1) - 2 * nghost, + fview.extent(2) - 2 * nghost}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + Kokkos::complex& valL) + { + ippl::Vector iVec = {i, j, k}; + double arg = 0.0; + for(size_t d = 0; d < dim; ++d) { + arg += (iVec[d] - (pt[d]/2)) * Rview(idx)[d]; + } + + valL += (Kokkos::Experimental::cos(arg) + + imag * Kokkos::Experimental::sin(arg)) * fview(i + nghost, j + nghost, k + nghost); + }, Kokkos::Sum>(reducedValue)); + + double abs_error_real = std::fabs(reducedValue.real() - Q_result(idx).real()); + double rel_error_real = std::fabs(reducedValue.real() - Q_result(idx).real()) /std::fabs(reducedValue.real()); + double abs_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()); + double rel_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()) /std::fabs(reducedValue.imag()); + + std::cout << "Abs Error in real part: " << std::setprecision(16) + << abs_error_real << " Rel. error in real part: " << std::setprecision(16) << rel_error_real << std::endl; + std::cout << "Abs Error in imag part: " << std::setprecision(16) + << abs_error_imag << " Rel. 
error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl; + + + //Kokkos::complex max_error(0.0, 0.0); + //MPI_Reduce(&max_error_local, &max_error, 1, + // MPI_C_DOUBLE_COMPLEX, MPI_MAX, 0, Ippl::getComm()); + + return 0; +} From e526befeb7066ee98c5058029d3b711778149d60 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 14 Feb 2023 14:04:55 +0100 Subject: [PATCH 059/117] few tweaks and cleanups but still lot of things need to be generalized --- CMakeModules/FindCUFINUFFT.cmake | 4 ++-- src/FFT/FFT.h | 8 ++++---- src/FFT/FFT.hpp | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CMakeModules/FindCUFINUFFT.cmake b/CMakeModules/FindCUFINUFFT.cmake index 691eb510f..9098a6e7a 100644 --- a/CMakeModules/FindCUFINUFFT.cmake +++ b/CMakeModules/FindCUFINUFFT.cmake @@ -2,14 +2,14 @@ # Find CUFINUFFT includes and library # # CUFINUFFT_INCLUDE_DIR - where to find cufinufft.h -# CUFINUFFT_LIBRARY - libcufinufft.a path +# CUFINUFFT_LIBRARY - libcufinufft.so path # CUFINUFFT_FOUND - do not attempt to use if "no" or undefined. FIND_PATH(CUFINUFFT_INCLUDE_DIR cufinufft.h HINTS $ENV{CUFINUFFT_INCLUDE_PATH} $ENV{CUFINUFFT_INCLUDE_DIR} $ENV{CUFINUFFT_PREFIX}/include $ENV{CUFINUFFT_DIR}/include ${PROJECT_SOURCE_DIR}/include PATHS ENV CPP_INCLUDE_PATH ) - +#Static library has some issues and gives a cuda error at the end of compilation FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.so HINTS $ENV{CUFINUFFT_LIBRARY_PATH} $ENV{CUFINUFFT_LIBRARY_DIR} $ENV{CUFINUFFT_PREFIX}/lib $ENV{CUFINUFFT_DIR}/lib $ENV{CUFINUFFT}/lib ${PROJECT_SOURCE_DIR}/lib PATHS ENV LIBRARY_PATH diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 16fab61f3..cec240b8f 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -66,10 +66,12 @@ namespace ippl { Tag classes for Cosine transforms */ class CosTransform {}; +#ifdef KOKKOS_ENABLE_CUDA /** Tag classes for Non-uniform type of Fourier transforms */ class NUFFTransform {}; +#endif enum FFTComm { a2av = 0, @@ -337,6 +339,7 @@ namespace ippl { }; +#ifdef KOKKOS_ENABLE_CUDA /** Non-uniform FFT class */ @@ -346,7 +349,6 @@ namespace ippl { public: typedef FieldLayout Layout_t; - typedef std::complex StdComplex_t; typedef Kokkos::complex KokkosComplex_t; typedef Field ComplexField_t; @@ -369,9 +371,6 @@ namespace ippl { template void transform(const ParticleAttrib< Vector, Properties... 
>& R, ParticleAttrib& Q, ComplexField_t& f); - //template - //void transform(const ParticleAttrib< Vector>& R, - // ParticleAttrib>& Q, ComplexField_t& f); private: @@ -392,6 +391,7 @@ namespace ippl { } +#endif #include "FFT/FFT.hpp" #endif // IPPL_FFT_FFT_H diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 4ef372730..c79b247a7 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -750,6 +750,7 @@ namespace ippl { } +#ifdef KOKKOS_ENABLE_CUDA //========================================================================= // FFT NUFFTransform Constructors //========================================================================= @@ -764,8 +765,6 @@ namespace ippl { int type, const ParameterList& params) { - - /** * cuFINUFFT requires to pass a 3D array even for 2D and * 1D FFTs we just have to fill in other @@ -823,6 +822,7 @@ namespace ippl { throw std::logic_error("Only type 1 and type 2 NUFFT are allowed now"); } + //dim in cufinufft is int int dim = static_cast(Dim); ier_m = cufinufft_makeplan(type_m, dim, nmodes.data(), iflag, 1, tol_m, maxbatchsize, &plan_m, &opts); @@ -943,6 +943,7 @@ namespace ippl { ier_m = cufinufft_destroy(plan_m); } +#endif } // vi: set et ts=4 sw=4 sts=4: From 9d7204ed3cc919f552cad5df6a6dc15b430eef4b Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 14 Feb 2023 16:11:30 +0100 Subject: [PATCH 060/117] In the middle of changes --- CMakeModules/FindCUFINUFFT.cmake | 1 + src/CMakeLists.txt | 3 + src/Particle/ParticleAttrib.hpp | 271 +++++++++++++++++++++++++++++-- 3 files changed, 261 insertions(+), 14 deletions(-) diff --git a/CMakeModules/FindCUFINUFFT.cmake b/CMakeModules/FindCUFINUFFT.cmake index 9098a6e7a..ce40536f1 100644 --- a/CMakeModules/FindCUFINUFFT.cmake +++ b/CMakeModules/FindCUFINUFFT.cmake @@ -17,6 +17,7 @@ FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.so IF(CUFINUFFT_INCLUDE_DIR AND CUFINUFFT_LIBRARY_DIR) SET( CUFINUFFT_FOUND "YES" ) + SET( CUFINUFFT_DIR $ENV{CUFINUFFT_DIR} ) ENDIF() IF (CUFINUFFT_FOUND) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8c96a6bc7..8b4330823 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,6 +96,9 @@ add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} ) if (ENABLE_NUFFT) + include_directories ( + BEFORE ${CUFINUFFT_INCLUDE_DIR} + ) target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR}) else() target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}) diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 4d3919df1..56030879f 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -204,13 +204,13 @@ namespace ippl { template template - void ParticleAttrib::scatterPIF(Field& f, Field& Sk, + void ParticleAttrib::scatterPIFNUDFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... 
>& pp) const { - static IpplTimings::TimerRef scatterPIFTimer = IpplTimings::getTimer("ScatterPIF"); - IpplTimings::startTimer(scatterPIFTimer); + static IpplTimings::TimerRef scatterPIFNUDFTTimer = IpplTimings::getTimer("ScatterPIFNUDFT"); + IpplTimings::startTimer(scatterPIFNUDFTTimer); using view_type = typename Field::view_type; using vector_type = typename M::vector_type; @@ -246,7 +246,7 @@ namespace ippl { size_t flatN = N[0]*N[1]*N[2]; - Kokkos::parallel_for("ParticleAttrib::scatterPIF compute", + Kokkos::parallel_for("ParticleAttrib::scatterPIFNUDFT compute", team_policy(flatN, Kokkos::AUTO), KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { const size_t flatIndex = teamMember.league_rank(); @@ -293,7 +293,7 @@ namespace ippl { } ); - IpplTimings::stopTimer(scatterPIFTimer); + IpplTimings::stopTimer(scatterPIFNUDFTTimer); //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); //IpplTimings::startTimer(scatterAllReduceTimer); @@ -366,12 +366,12 @@ namespace ippl { template template - void ParticleAttrib::gatherPIF(Field& f, Field& Sk, + void ParticleAttrib::gatherPIFNUDFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp) const { - static IpplTimings::TimerRef gatherPIFTimer = IpplTimings::getTimer("GatherPIF"); - IpplTimings::startTimer(gatherPIFTimer); + static IpplTimings::TimerRef gatherPIFNUDFTTimer = IpplTimings::getTimer("GatherPIFNUDFT"); + IpplTimings::startTimer(gatherPIFNUDFTTimer); using view_type = typename Field::view_type; using vector_type = typename M::vector_type; @@ -403,7 +403,7 @@ namespace ippl { size_t flatN = N[0]*N[1]*N[2]; - Kokkos::parallel_for("ParticleAttrib::gatherPIF", + Kokkos::parallel_for("ParticleAttrib::gatherPIFNUDFT", team_policy(Np, Kokkos::AUTO), KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { const size_t idx = teamMember.league_rank(); @@ -470,10 +470,253 @@ namespace ippl { ); - IpplTimings::stopTimer(gatherPIFTimer); + IpplTimings::stopTimer(gatherPIFNUDFTTimer); } +#ifdef KOKKOS_ENABLE_CUDA + + template + template + void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib< Vector, Properties... 
>& pp) + const + { + + static IpplTimings::TimerRef scatterPIFNUFFTTimer = IpplTimings::getTimer("ScatterPIFNUFFT"); + IpplTimings::startTimer(scatterPIFNUFFTTimer); + + using view_type = typename Field::view_type; + using vector_type = typename M::vector_type; + using value_type = typename ParticleAttrib::value_type; + view_type fview = f.getView(); + typename Field::view_type Skview = Sk.getView(); + const int nghost = f.getNghost(); + const FieldLayout& layout = f.getLayout(); + const M& mesh = f.get_mesh(); + const vector_type& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + vector_type Len; + Vector N; + + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + typedef Kokkos::TeamPolicy<> team_policy; + typedef Kokkos::TeamPolicy<>::member_type member_type; + + + //using view_type_temp = typename detail::ViewType::view_type; + + //view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); + + double pi = std::acos(-1.0); + Kokkos::complex imag = {0.0, 1.0}; + + size_t Np = *(this->localNum_mp); + + size_t flatN = N[0]*N[1]*N[2]; + + Kokkos::parallel_for("ParticleAttrib::scatterPIFNUFFT compute", + team_policy(flatN, Kokkos::AUTO), + KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { + const size_t flatIndex = teamMember.league_rank(); + +#ifdef KOKKOS_ENABLE_CUDA + const int k = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (k * N[0] * N[1]); + const int i = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); +#else + + const int i = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (i * N[0] * N[1]); + const int k = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); +#endif + + FT reducedValue = 0.0; + Vector iVec = {i, j, k}; + vector_type kVec; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + } + auto Sk = Skview(i+nghost, j+nghost, k+nghost); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), + [=](const size_t idx, FT& innerReduce) + { + double arg = 0.0; + for(size_t d = 0; d < Dim; ++d) { + arg += kVec[d]*pp(idx)[d]; + } + const value_type& val = dview_m(idx); + + innerReduce += Sk * (Kokkos::Experimental::cos(arg) + - imag * Kokkos::Experimental::sin(arg)) * val; + }, Kokkos::Sum(reducedValue)); + + if(teamMember.team_rank() == 0) { + //viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; + fview(i+nghost,j+nghost,k+nghost) = reducedValue; + } + + } + ); + + IpplTimings::stopTimer(scatterPIFNUFFTTimer); + + //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); + //IpplTimings::startTimer(scatterAllReduceTimer); + //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); + //MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, + // MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); + //IpplTimings::stopTimer(scatterAllReduceTimer); + + } + + + template + template + void ParticleAttrib::gatherPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib< Vector, Properties... 
>& pp) + const + { + static IpplTimings::TimerRef gatherPIFNUFFTTimer = IpplTimings::getTimer("GatherPIFNUFFT"); + IpplTimings::startTimer(gatherPIFNUFFTTimer); + + using view_type = typename Field::view_type; + using vector_type = typename M::vector_type; + using value_type = typename ParticleAttrib::value_type; + view_type fview = f.getView(); + typename Field::view_type Skview = Sk.getView(); + const int nghost = f.getNghost(); + const FieldLayout& layout = f.getLayout(); + const M& mesh = f.get_mesh(); + const vector_type& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + vector_type Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + + + typedef Kokkos::TeamPolicy<> team_policy; + typedef Kokkos::TeamPolicy<>::member_type member_type; + + double pi = std::acos(-1.0); + Kokkos::complex imag = {0.0, 1.0}; + + size_t Np = *(this->localNum_mp); + + size_t flatN = N[0]*N[1]*N[2]; + + Kokkos::parallel_for("ParticleAttrib::gatherPIFNUFFT", + team_policy(Np, Kokkos::AUTO), + KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { + const size_t idx = teamMember.league_rank(); + + value_type reducedValue = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), + [=](const size_t flatIndex, value_type& innerReduce) + { + +#ifdef KOKKOS_ENABLE_CUDA + const int k = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (k * N[0] * N[1]); + const int i = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); +#else + const int i = (int)(flatIndex / (N[0] * N[1])); + const int flatIndex2D = flatIndex - (i * N[0] * N[1]); + const int k = flatIndex2D % N[0]; + const int j = (int)(flatIndex2D / N[0]); +#endif + + Vector iVec = {i, j, k}; + vector_type kVec; + double Dr = 0.0, arg = 0.0; + for(size_t d = 0; d < Dim; ++d) { + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * iVec[d]; + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); + Dr += kVec[d] * kVec[d]; + arg += kVec[d]*pp(idx)[d]; + } + + + FT Ek = 0.0; + value_type Ex = 0.0; + auto rho = fview(i+nghost,j+nghost,k+nghost); + auto Sk = Skview(i+nghost,j+nghost,k+nghost); + for(size_t d = 0; d < Dim; ++d) { + + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[d] * rho * factor); + + //Inverse Fourier transform when the lhs is real. Use when + //we choose k \in [0 K) instead of from [-K/2+1 K/2] + //Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) + // - Ek.imag() * Kokkos::Experimental::sin(arg)); + Ek *= Sk * (Kokkos::Experimental::cos(arg) + + imag * Kokkos::Experimental::sin(arg)); + Ex[d] = Ek.real(); + } + + innerReduce += Ex; + }, Kokkos::Sum(reducedValue)); + + teamMember.team_barrier(); + + if(teamMember.team_rank() == 0) { + dview_m(idx) = reducedValue; + } + + } + ); + + + IpplTimings::stopTimer(gatherPIFNUFFTTimer); + + } +#endif + + template + inline + void scatterPIFNUFFT(const ParticleAttrib& attrib, Field& f, + Field& Sk, const ParticleAttrib, Properties...>& pp) + { +#ifdef KOKKOS_ENABLE_CUDA + attrib.scatterPIFNUFFT(f, Sk, pp); +#else + throw IpplException("scatterPIFNUFFT", + "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to + be compiled with CUDA. 
Otherwise use scatterPIFNUDFT."); +#endif + } + + template + inline + void gatherPIFNUFFT(const ParticleAttrib& attrib, Field& f, + Field& Sk, const ParticleAttrib, Properties...>& pp) + { +#ifdef KOKKOS_ENABLE_CUDA + attrib.gatherPIFNUFFT(f, Sk, pp); +#else + throw IpplException("gatherPIFNUFFT", + "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to + be compiled with CUDA. Otherwise use gatherPIFNUDFT."); +#endif + } /* * Non-class function @@ -491,10 +734,10 @@ namespace ippl { template inline - void scatterPIF(const ParticleAttrib& attrib, Field& f, + void scatterPIFNUDFT(const ParticleAttrib& attrib, Field& f, Field& Sk, const ParticleAttrib, Properties...>& pp) { - attrib.scatterPIF(f, Sk, pp); + attrib.scatterPIFNUDFT(f, Sk, pp); } @@ -509,10 +752,10 @@ namespace ippl { template inline - void gatherPIF(const ParticleAttrib& attrib, Field& f, + void gatherPIFNUDFT(const ParticleAttrib& attrib, Field& f, Field& Sk, const ParticleAttrib, Properties...>& pp) { - attrib.gatherPIF(f, Sk, pp); + attrib.gatherPIFNUDFT(f, Sk, pp); } From 613a80a0da18c0da71a734ae877dcf7ce4b29495 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 14 Feb 2023 16:25:18 +0100 Subject: [PATCH 061/117] include directories added --- CMakeModules/FindCUFINUFFT.cmake | 1 + src/CMakeLists.txt | 3 +++ test/FFT/CMakeLists.txt | 1 + 3 files changed, 5 insertions(+) diff --git a/CMakeModules/FindCUFINUFFT.cmake b/CMakeModules/FindCUFINUFFT.cmake index 9098a6e7a..202a044a3 100644 --- a/CMakeModules/FindCUFINUFFT.cmake +++ b/CMakeModules/FindCUFINUFFT.cmake @@ -17,6 +17,7 @@ FIND_LIBRARY(CUFINUFFT_LIBRARY_DIR libcufinufft.so IF(CUFINUFFT_INCLUDE_DIR AND CUFINUFFT_LIBRARY_DIR) SET( CUFINUFFT_FOUND "YES" ) + SET( CUFINUFFT_DIR $ENV{CUFINUFFT_DIR} ) ENDIF() IF (CUFINUFFT_FOUND) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8c96a6bc7..8b4330823 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,6 +96,9 @@ add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} ) if (ENABLE_NUFFT) + include_directories ( + BEFORE ${CUFINUFFT_INCLUDE_DIR} + ) target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR}) else() target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}) diff --git a/test/FFT/CMakeLists.txt b/test/FFT/CMakeLists.txt index 4d3e5fe90..834e9c762 100644 --- a/test/FFT/CMakeLists.txt +++ b/test/FFT/CMakeLists.txt @@ -3,6 +3,7 @@ message (STATUS "Adding test FFT found in ${_relPath}") include_directories ( ${CMAKE_SOURCE_DIR}/src + ${CUFINUFFT_INCLUDE_DIR} ) link_directories ( From 7198e2f7c13e536fa55acbbd24fae4e56f3c3005 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 16 Feb 2023 08:48:40 +0100 Subject: [PATCH 062/117] target_include_directories seems to work --- src/CMakeLists.txt | 4 +--- test/FFT/CMakeLists.txt | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8b4330823..bd6a2205b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,9 +96,7 @@ add_library ( ippl ${IPPL_SRCS} ${IPPL_SRCS_FORT} ) if (ENABLE_NUFFT) - include_directories ( - BEFORE ${CUFINUFFT_INCLUDE_DIR} - ) + target_include_directories(ippl PUBLIC ${CUFINUFFT_INCLUDE_DIR}) target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY} ${CUFINUFFT_LIBRARY_DIR}) else() target_link_libraries(ippl PUBLIC Kokkos::kokkos ${HEFFTE_LIBRARY}) diff --git a/test/FFT/CMakeLists.txt b/test/FFT/CMakeLists.txt index 834e9c762..4d3e5fe90 100644 --- 
a/test/FFT/CMakeLists.txt +++ b/test/FFT/CMakeLists.txt @@ -3,7 +3,6 @@ message (STATUS "Adding test FFT found in ${_relPath}") include_directories ( ${CMAKE_SOURCE_DIR}/src - ${CUFINUFFT_INCLUDE_DIR} ) link_directories ( From 47442818f95c9fea04193639b98f7dd69819ed76 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 16 Feb 2023 15:36:55 +0100 Subject: [PATCH 063/117] Almost done but have some compilation errors --- alpine/PinT/ChargedParticlesPinT.hpp | 4 +- alpine/PinT/LandauDampingPinT.cpp | 6 +- src/FFT/FFT.hpp | 28 +-- src/Particle/ParticleAttrib.h | 38 +++- src/Particle/ParticleAttrib.hpp | 260 ++++++++++----------------- 5 files changed, 143 insertions(+), 193 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index c827669a5..492df0c68 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -861,7 +861,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp); + gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); time_m = tStartMySlice; @@ -893,7 +893,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp); + gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); //kick Ptemp = Ptemp - 0.5 * dt * E; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index e80bed086..5662b2619 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -456,8 +456,10 @@ int main(int argc, char *argv[]){ Vector_t kw = {0.5, 0.5, 0.5}; //double alpha = 0.05; Vector_t alpha = {0.05, 0.05, 0.05}; - Vector_t rmin(0.0); - Vector_t rmax = 2 * pi / kw ; + //Vector_t rmin(0.0); + //Vector_t rmax = 2 * pi / kw ; + Vector_t rmin(-2.0 * pi); + Vector_t rmax = 2 * pi; Vector_t length = rmax - rmin; double dxPIC = length[0] / nrPIC[0]; double dyPIC = length[1] / nrPIC[1]; diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index c87b10f0e..6a04da61b 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -855,15 +855,16 @@ namespace ippl { fview.extent(2) - 2*nghost); - //Vector, 3> tempR; + Kokkos::View tempR[3]; + //tempR = {NULL, NULL, NULL}; Kokkos::View tempRx("tempRx", localNp); Kokkos::View tempRy("tempRy", localNp); Kokkos::View tempRz("tempRz", localNp); - //for(size_t d = 0; d < Dim; ++d) { - // Kokkos::realloc(tempR[d], localNp); - //} + for(size_t d = 0; d < Dim; ++d) { + Kokkos::realloc(tempR[d], localNp); + } Kokkos::View tempQ("tempQ", localNp); @@ -891,20 +892,21 @@ namespace ippl { localNp, KOKKOS_LAMBDA(const size_t i) { - //for(size_t d = 0; d < Dim; ++d) { - // tempR[d](i) = Rview(i)[d]; - //} - tempRx(i) = Rview(i)[0]; - tempRy(i) = Rview(i)[1]; - tempRz(i) = Rview(i)[2]; + for(size_t d = 0; d < Dim; ++d) { + tempR[d](i) = Rview(i)[d]; + } + //tempRx(i) = Rview(i)[0]; + //tempRy(i) = Rview(i)[1]; + //tempRz(i) = Rview(i)[2]; tempQ(i).x = Qview(i).real(); + //tempQ(i).y = 0.0; tempQ(i).y = Qview(i).imag(); }); - //ier_m = cufinufft_setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, - // NULL, NULL, NULL, plan_m); - ier_m = cufinufft_setpts(localNp, tempRx.data(), tempRy.data(), tempRz.data(), 0, + ier_m = cufinufft_setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, NULL, 
NULL, NULL, plan_m); + //ier_m = cufinufft_setpts(localNp, tempRx.data(), tempRy.data(), tempRz.data(), 0, + // NULL, NULL, NULL, plan_m); ier_m = cufinufft_execute(tempQ.data(), tempField.data(), plan_m); diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 9b66e18c3..b33761d61 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -59,6 +59,12 @@ namespace ippl { using size_type = detail::size_type; +#ifdef KOKKOS_ENABLE_CUDA + //TODO: Remove hard-coded dimension by having Dim as template + //parameter. Does this need to be in CUDA ifdefs? + using FFT_t = FFT; +#endif + // Create storage for M particle attributes. The storage is uninitialized. // New items are appended to the end of the array. void create(size_type) override; @@ -156,20 +162,36 @@ namespace ippl { scatter(Field& f, const ParticleAttrib, Properties... >& pp) const; - template + template void - scatterPIF(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + scatterPIFNUDFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp) const; template void gather(Field& f, const ParticleAttrib, Properties...>& pp); - template + template + void + gatherPIFNUDFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp) const; + +#ifdef KOKKOS_ENABLE_CUDA + template + void initializeNUFFT(FieldLayout& layout, ParameterList& fftParams); + + template + void + scatterPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp) const; + + template void - gatherPIF(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + gatherPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp, + ParticleAttrib& q) const; +#endif T sum(); T max(); @@ -178,6 +200,10 @@ namespace ippl { private: view_type dview_m; +#ifdef KOKKOS_ENABLE_CUDA + std::shared_ptr fftType1_mp; + std::shared_ptr fftType2_mp; +#endif }; } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 56030879f..48d1a4f53 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -203,8 +203,8 @@ namespace ippl { template - template - void ParticleAttrib::scatterPIFNUDFT(Field& f, Field& Sk, + template + void ParticleAttrib::scatterPIFNUDFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp) const { @@ -365,8 +365,8 @@ namespace ippl { } template - template - void ParticleAttrib::gatherPIFNUDFT(Field& f, Field& Sk, + template + void ParticleAttrib::gatherPIFNUDFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp) const { @@ -477,95 +477,44 @@ namespace ippl { #ifdef KOKKOS_ENABLE_CUDA template - template - void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, + template + void initializeNUFFT(FieldLayout& layout, ParameterList& fftParams) { + + fftType1_mp = std::make_shared>(layout, 1, fftParams); + fftType2_mp = std::make_shared>(layout, 2, fftParams); + } + + + + template + template + void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... 
>& pp) const { static IpplTimings::TimerRef scatterPIFNUFFTTimer = IpplTimings::getTimer("ScatterPIFNUFFT"); IpplTimings::startTimer(scatterPIFNUFFTTimer); + + fftType1_mp->transform(pp, *this, f); using view_type = typename Field::view_type; - using vector_type = typename M::vector_type; - using value_type = typename ParticleAttrib::value_type; view_type fview = f.getView(); typename Field::view_type Skview = Sk.getView(); const int nghost = f.getNghost(); - const FieldLayout& layout = f.getLayout(); - const M& mesh = f.get_mesh(); - const vector_type& dx = mesh.getMeshSpacing(); - const auto& domain = layout.getDomain(); - vector_type Len; - Vector N; - - - for (unsigned d=0; d < Dim; ++d) { - N[d] = domain[d].length(); - Len[d] = dx[d] * N[d]; - } - typedef Kokkos::TeamPolicy<> team_policy; - typedef Kokkos::TeamPolicy<>::member_type member_type; - - - //using view_type_temp = typename detail::ViewType::view_type; - - //view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); - - double pi = std::acos(-1.0); - Kokkos::complex imag = {0.0, 1.0}; - - size_t Np = *(this->localNum_mp); - - size_t flatN = N[0]*N[1]*N[2]; - - Kokkos::parallel_for("ParticleAttrib::scatterPIFNUFFT compute", - team_policy(flatN, Kokkos::AUTO), - KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { - const size_t flatIndex = teamMember.league_rank(); - -#ifdef KOKKOS_ENABLE_CUDA - const int k = (int)(flatIndex / (N[0] * N[1])); - const int flatIndex2D = flatIndex - (k * N[0] * N[1]); - const int i = flatIndex2D % N[0]; - const int j = (int)(flatIndex2D / N[0]); -#else - - const int i = (int)(flatIndex / (N[0] * N[1])); - const int flatIndex2D = flatIndex - (i * N[0] * N[1]); - const int k = flatIndex2D % N[0]; - const int j = (int)(flatIndex2D / N[0]); -#endif - - FT reducedValue = 0.0; - Vector iVec = {i, j, k}; - vector_type kVec; - for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - } - auto Sk = Skview(i+nghost, j+nghost, k+nghost); - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), - [=](const size_t idx, FT& innerReduce) - { - double arg = 0.0; - for(size_t d = 0; d < Dim; ++d) { - arg += kVec[d]*pp(idx)[d]; - } - const value_type& val = dview_m(idx); - - innerReduce += Sk * (Kokkos::Experimental::cos(arg) - - imag * Kokkos::Experimental::sin(arg)) * val; - }, Kokkos::Sum(reducedValue)); - - if(teamMember.team_rank() == 0) { - //viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; - fview(i+nghost,j+nghost,k+nghost) = reducedValue; - } - - } - ); + using mdrange_type = Kokkos::MDRangePolicy>; + Kokkos::parallel_for("Multiply with shape functions", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost}), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + fview(i, j, k) *= Skview(i, j, k); + }); IpplTimings::stopTimer(scatterPIFNUFFTTimer); @@ -580,22 +529,29 @@ namespace ippl { template - template - void ParticleAttrib::gatherPIFNUFFT(Field& f, Field& Sk, - const ParticleAttrib< Vector, Properties... >& pp) + template + void ParticleAttrib::gatherPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib< Vector, Properties... 
>& pp, + ParticleAttrib& q) const { static IpplTimings::TimerRef gatherPIFNUFFTTimer = IpplTimings::getTimer("GatherPIFNUFFT"); IpplTimings::startTimer(gatherPIFNUFFTTimer); + + Field tempField; + + const FieldLayout& layout = f.getLayout(); + const M& mesh = f.get_mesh(); + + tempField.initialize(mesh, layout); using view_type = typename Field::view_type; using vector_type = typename M::vector_type; - using value_type = typename ParticleAttrib::value_type; view_type fview = f.getView(); + view_type tempview = tempField.getView(); + auto qview = q.getView(); typename Field::view_type Skview = Sk.getView(); const int nghost = f.getNghost(); - const FieldLayout& layout = f.getLayout(); - const M& mesh = f.get_mesh(); const vector_type& dx = mesh.getMeshSpacing(); const auto& domain = layout.getDomain(); vector_type Len; @@ -607,82 +563,46 @@ namespace ippl { } - - typedef Kokkos::TeamPolicy<> team_policy; - typedef Kokkos::TeamPolicy<>::member_type member_type; - double pi = std::acos(-1.0); Kokkos::complex imag = {0.0, 1.0}; - size_t Np = *(this->localNum_mp); - size_t flatN = N[0]*N[1]*N[2]; - - Kokkos::parallel_for("ParticleAttrib::gatherPIFNUFFT", - team_policy(Np, Kokkos::AUTO), - KOKKOS_CLASS_LAMBDA(const member_type& teamMember) { - const size_t idx = teamMember.league_rank(); - - value_type reducedValue = 0.0; - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, flatN), - [=](const size_t flatIndex, value_type& innerReduce) - { - -#ifdef KOKKOS_ENABLE_CUDA - const int k = (int)(flatIndex / (N[0] * N[1])); - const int flatIndex2D = flatIndex - (k * N[0] * N[1]); - const int i = flatIndex2D % N[0]; - const int j = (int)(flatIndex2D / N[0]); -#else - const int i = (int)(flatIndex / (N[0] * N[1])); - const int flatIndex2D = flatIndex - (i * N[0] * N[1]); - const int k = flatIndex2D % N[0]; - const int j = (int)(flatIndex2D / N[0]); -#endif - - Vector iVec = {i, j, k}; - vector_type kVec; - double Dr = 0.0, arg = 0.0; - for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - //kVec[d] = 2 * pi / Len[d] * iVec[d]; - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); - Dr += kVec[d] * kVec[d]; - arg += kVec[d]*pp(idx)[d]; - } - + for(size_t gd = 0; gd < Dim; ++gd) { + Kokkos::parallel_for("Gather NUFFT", + mdrange_type({nghost, nghost, nghost}, + {fview.extent(0) - nghost, + fview.extent(1) - nghost, + fview.extent(2) - nghost}), + KOKKOS_LAMBDA(const size_t i, + const size_t j, + const size_t k) + { + Vector iVec = {i, j, k}; + Vector_t kVec; - FT Ek = 0.0; - value_type Ex = 0.0; - auto rho = fview(i+nghost,j+nghost,k+nghost); - auto Sk = Skview(i+nghost,j+nghost,k+nghost); - for(size_t d = 0; d < Dim; ++d) { - - bool isNotZero = (Dr != 0.0); - double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); - Ek = -(imag * kVec[d] * rho * factor); - - //Inverse Fourier transform when the lhs is real. 
Use when - //we choose k \in [0 K) instead of from [-K/2+1 K/2] - //Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) - // - Ek.imag() * Kokkos::Experimental::sin(arg)); - Ek *= Sk * (Kokkos::Experimental::cos(arg) - + imag * Kokkos::Experimental::sin(arg)); - Ex[d] = Ek.real(); - } - - innerReduce += Ex; - }, Kokkos::Sum(reducedValue)); + double Dr = 0.0; + for(size_t d = 0; d < Dim; ++d) { + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + Dr += kVec[d] * kVec[d]; + } - teamMember.team_barrier(); + tempview(i, j, k) = fview(i, j, k); + + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + + tempview(i, j, k) *= -Skview(i, j, k) * (imag * kVec[gd] * factor); + }); - if(teamMember.team_rank() == 0) { - dview_m(idx) = reducedValue; - } + fftType2_mp->transform(pp, q, tempField); - } - ); + Kokkos::parallel_for("Assign E gather NUFFT", + Np, + KOKKOS_CLASS_LAMBDA(const size_t i) + { + dview_m(i)[gd] = qview(i); + }); + } IpplTimings::stopTimer(gatherPIFNUFFTTimer); @@ -690,31 +610,31 @@ namespace ippl { } #endif - template + template inline void scatterPIFNUFFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp) { #ifdef KOKKOS_ENABLE_CUDA attrib.scatterPIFNUFFT(f, Sk, pp); #else - throw IpplException("scatterPIFNUFFT", - "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to - be compiled with CUDA. Otherwise use scatterPIFNUDFT."); + //throw IpplException("scatterPIFNUFFT", "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to + // be compiled with CUDA. Otherwise use scatterPIFNUDFT."); #endif } - template + template inline void gatherPIFNUFFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp, + ParticleAttrib& q) { #ifdef KOKKOS_ENABLE_CUDA - attrib.gatherPIFNUFFT(f, Sk, pp); + attrib.gatherPIFNUFFT(f, Sk, pp, q); #else - throw IpplException("gatherPIFNUFFT", - "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to - be compiled with CUDA. Otherwise use gatherPIFNUDFT."); + //throw IpplException("gatherPIFNUFFT", + // "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to + // be compiled with CUDA. 
Otherwise use gatherPIFNUDFT."); #endif } @@ -732,10 +652,10 @@ namespace ippl { attrib.scatter(f, pp); } - template + template inline void scatterPIFNUDFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp) { attrib.scatterPIFNUDFT(f, Sk, pp); } @@ -750,10 +670,10 @@ namespace ippl { attrib.gather(f, pp); } - template + template inline void gatherPIFNUDFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp) { attrib.gatherPIFNUDFT(f, Sk, pp); } From c4f9d714e168679c6f337510b448a44dad6ce949 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 17 Feb 2023 09:34:29 +0100 Subject: [PATCH 064/117] Function pointers and C-style arrays introduced to solve the type and dimension issues --- src/FFT/FFT.h | 52 ++++++++++++++++++++++--------------------------- src/FFT/FFT.hpp | 36 ++++++++++++++-------------------- 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index cec240b8f..6544b54f2 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "FieldLayout/FieldLayout.h" @@ -126,32 +127,28 @@ namespace ippl { template <> struct CufinufftType { - //using makeplan = typename cufinufftf_makeplan; - //using setpts = typename cufinufftf_setpts; - //using execute = typename cufinufftf_execute; - //using destroy = typename cufinufftf_destroy; - //using plan_t = typename cufinufftf_plan; - - - //typedef typename cufinufftf_makeplan makeplan; - //typedef typename cufinufftf_setpts setpts; - //typedef typename cufinufftf_execute execute; - //typedef typename cufinufftf_destroy destroy; - //typedef typename cufinufftf_plan plan_t; + std::function makeplan = cufinufftf_makeplan; + std::function setpts = cufinufftf_setpts; + std::function execute = cufinufftf_execute; + std::function destroy = cufinufftf_destroy; + + using complexType = cuFloatComplex; + using plan_t = cufinufftf_plan; }; template <> struct CufinufftType { - //using makeplan = typename cufinufft_makeplan; - //using setpts = typename cufinufft_setpts; - //using execute = typename cufinufft_execute; - //using destroy = typename cufinufft_destroy; - //using plan_t = typename cufinufft_plan; - //typedef typename cufinufft_makeplan makeplan; - //typedef typename cufinufft_setpts setpts; - //typedef typename cufinufft_execute execute; - //typedef typename cufinufft_destroy destroy; - //typedef typename cufinufft_plan plan_t; + std::function makeplan = cufinufft_makeplan; + std::function setpts = cufinufft_setpts; + std::function execute = cufinufft_execute; + std::function destroy = cufinufft_destroy; + + using complexType = cuDoubleComplex; + using plan_t = cufinufft_plan; }; #endif } @@ -352,11 +349,8 @@ namespace ippl { typedef Kokkos::complex KokkosComplex_t; typedef Field ComplexField_t; - //using makeplan = typename detail::CufinufftType::makeplan; - //using setpts = typename detail::CufinufftType::setpts; - //using execute = typename detail::CufinufftType::execute; - //using destroy = typename detail::CufinufftType::destroy; - //using plan_t = typename detail::CufinufftType::plan_t; + using complexType = typename detail::CufinufftType::complexType; + using plan_t = typename detail::CufinufftType::plan_t; /** Create a new FFT object with the layout for the input Field, type * (1 or 2) for the NUFFT and parameters for cuFINUFFT. 
@@ -381,8 +375,8 @@ namespace ippl { void setup(std::array& nmodes, const ParameterList& params); - //plan_t plan_m; - cufinufft_plan plan_m; + detail::CufinufftType nufft_m; + plan_t plan_m; int ier_m; T tol_m; int type_m; diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index c79b247a7..985e50ab4 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -824,7 +824,7 @@ namespace ippl { //dim in cufinufft is int int dim = static_cast(Dim); - ier_m = cufinufft_makeplan(type_m, dim, nmodes.data(), iflag, 1, tol_m, + ier_m = nufft_m.makeplan(type_m, dim, nmodes.data(), iflag, 1, tol_m, maxbatchsize, &plan_m, &opts); } @@ -849,24 +849,23 @@ namespace ippl { * cuFINUFFT's layout is left, hence we allocate the temporary * Kokkos views with the same layout */ - Kokkos::View + Kokkos::View tempField("tempField", fview.extent(0) - 2*nghost, fview.extent(1) - 2*nghost, fview.extent(2) - 2*nghost); - //Vector, 3> tempR; - Kokkos::View tempRx("tempRx", localNp); - Kokkos::View tempRy("tempRy", localNp); - Kokkos::View tempRz("tempRz", localNp); + //Initialize the pointers to NULL and fill only relevant dimensions + //CUFINUFFT requires the input like this. + Kokkos::View tempR[3] = {}; - //for(size_t d = 0; d < Dim; ++d) { - // Kokkos::realloc(tempR[d], localNp); - //} + for(size_t d = 0; d < Dim; ++d) { + Kokkos::realloc(tempR[d], localNp); + } - Kokkos::View tempQ("tempQ", localNp); + Kokkos::View tempQ("tempQ", localNp); using mdrange_type = Kokkos::MDRangePolicy>; @@ -891,22 +890,17 @@ namespace ippl { localNp, KOKKOS_LAMBDA(const size_t i) { - //for(size_t d = 0; d < Dim; ++d) { - // tempR[d](i) = Rview(i)[d]; - //} - tempRx(i) = Rview(i)[0]; - tempRy(i) = Rview(i)[1]; - tempRz(i) = Rview(i)[2]; + for(size_t d = 0; d < Dim; ++d) { + tempR[d](i) = Rview(i)[d]; + } tempQ(i).x = Qview(i).real(); tempQ(i).y = Qview(i).imag(); }); - //ier_m = cufinufft_setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, - // NULL, NULL, NULL, plan_m); - ier_m = cufinufft_setpts(localNp, tempRx.data(), tempRy.data(), tempRz.data(), 0, + ier_m = nufft_m.setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, NULL, NULL, NULL, plan_m); - ier_m = cufinufft_execute(tempQ.data(), tempField.data(), plan_m); + ier_m = nufft_m.execute(tempQ.data(), tempField.data(), plan_m); if(type_m == 1) { @@ -940,7 +934,7 @@ namespace ippl { template FFT::~FFT() { - ier_m = cufinufft_destroy(plan_m); + ier_m = nufft_m.destroy(plan_m); } #endif From 70cd2c4a9cf8da3508f4481e46ccc40b498555a8 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 17 Feb 2023 17:18:59 +0100 Subject: [PATCH 065/117] Code compiles and runs but the results are wrong. Need to see. 
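
For the record, the driver sequence this commit is trying to set up on the application
side is sketched below (P is the ChargedParticlesPIF bunch and FL the FieldLayout from
main() in LandauDampingPIF.cpp; the same calls appear in the hunks that follow). This is
a usage sketch only, not a verified interface: the split into a type-1 plan owned by the
charge attribute and a type-2 plan owned by the E attribute, and the reuse of q as a
scratch buffer during gather, are exactly the parts still being debugged here.

    // C++ usage sketch (IPPL application code, mirrors the hunks below)
    ippl::ParameterList fftParams;
    fftParams.add("gpu_method", 1);              // cuFINUFFT options as set in this patch
    fftParams.add("gpu_sort", 1);
    fftParams.add("gpu_kerevalmeth", 1);
    fftParams.add("tolerance", 1e-10);
    fftParams.add("use_cufinufft_defaults", false);

    P->q.initializeNUFFT(FL, 1, fftParams);      // type-1 plan: particles -> Fourier modes
    P->E.initializeNUFFT(FL, 2, fftParams);      // type-2 plan: Fourier modes -> particles

    P->scatter();   // scatterPIFNUFFT: type-1 NUFFT of q, then multiply modes by Sk
    P->gather();    // gatherPIFNUFFT: per component -i k_d Sk rho_k / |k|^2, then type-2 NUFFT;
                    // q serves as the output buffer and is reset to Q_m / Np_m afterwards
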
--- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 25 ++++--- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 28 ++++++-- alpine/PinT/ChargedParticlesPinT.hpp | 4 +- src/FFT/FFT.h | 4 +- src/FFT/FFT.hpp | 8 +-- src/Particle/ParticleAttrib.h | 40 ++++++----- src/Particle/ParticleAttrib.hpp | 66 ++++++++++--------- test/FFT/TestNUFFT1.cpp | 13 ++-- test/FFT/TestNUFFT2.cpp | 16 ++--- 9 files changed, 110 insertions(+), 94 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index b76152f3b..465180ef6 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -63,6 +63,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double Q_m; + size_type Np_m; + double time_m; double rhoNorm_m; @@ -95,12 +97,14 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector_t rmin, Vector_t rmax, ippl::e_dim_tag decomp[Dim], - double Q) + double Q, + size_type Np) : ippl::ParticleBase(pl) , hr_m(hr) , rmin_m(rmin) , rmax_m(rmax) , Q_m(Q) + , Np_m(Np) { // register the particle attributes this->addAttribute(q); @@ -119,7 +123,11 @@ class ChargedParticlesPIF : public ippl::ParticleBase { void gather() { - gatherPIF(this->E, rho_m, Sk_m, this->R); + gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); + + //Set the charge back to original as we used this view as a + //temporary buffer during gather + this->q = Q_m / Np_m; } @@ -127,7 +135,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Inform m("scatter "); rho_m = {0.0, 0.0}; - scatterPIF(q, rho_m, Sk_m, this->R); + scatterPIFNUFFT(q, rho_m, Sk_m, this->R); rho_m = rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -390,18 +398,17 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - //kVec[d] = 2 * pi / Len[d] * iVec[d]; + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } Kokkos::complex Ek = {0.0, 0.0}; double myVal = 0.0; + auto rho = rhoview(i+nghost,j+nghost,k+nghost); for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) { - Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - } + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[d] * rho * factor); myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); } diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 5bbcfd57b..2baa8eef4 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -193,11 +193,12 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; double alpha = 0.05; - Vector_t rmin(0.0); - Vector_t rmax = 2 * pi / kw ; - double dx = rmax[0] / nr[0]; - double dy = rmax[1] / nr[1]; - double dz = rmax[2] / nr[2]; + Vector_t rmin(-2.0 * pi); + Vector_t rmax = 2 * pi; + Vector_t length = rmax - rmin; + double dx = length[0] / nr[0]; + double dy = length[1] / nr[1]; + double dz = length[2] / nr[2]; Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; @@ -208,8 +209,8 @@ int main(int argc, char *argv[]){ PLayout_t PL(FL, mesh); //Q = -\int\int f dx dv - double Q = -rmax[0] * rmax[1] * rmax[2]; - P = 
std::make_unique(PL,hr,rmin,rmax,decomp,Q); + double Q = -length[0] * length[1] * length[2]; + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); P->nr_m = nr; @@ -263,6 +264,19 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(initializeShapeFunctionPIF); + ippl::ParameterList fftParams; + + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-10); + + fftParams.add("use_cufinufft_defaults", false); + + P->q.initializeNUFFT(FL, 1, fftParams); + P->E.initializeNUFFT(FL, 2, fftParams); + + P->scatter(); P->gather(); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 492df0c68..6bd2360a4 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -925,7 +925,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp); + gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); time_m = tStartMySlice; @@ -982,7 +982,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp); + gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); //kick auto R2view = Rtemp.getView(); diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 6544b54f2..2b41a9495 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -38,7 +38,7 @@ #include "FieldLayout/FieldLayout.h" #include "Field/Field.h" -#include "Particle/ParticleAttrib.h" +//#include "Particle/ParticleAttrib.h" #include "Utility/ParameterList.h" #include "Utility/IpplException.h" @@ -51,6 +51,8 @@ namespace heffte { namespace ippl { + template class ParticleAttrib; + /** Tag classes for CC type of Fourier transforms */ diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 4214d4b13..59f8d3ca6 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -893,9 +893,8 @@ namespace ippl { for(size_t d = 0; d < Dim; ++d) { tempR[d](i) = Rview(i)[d]; } - tempQ(i).x = Qview(i).real(); - //tempQ(i).y = 0.0; - tempQ(i).y = Qview(i).imag(); + tempQ(i).x = Qview(i); + tempQ(i).y = 0.0; }); ier_m = nufft_m.setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, @@ -926,8 +925,7 @@ namespace ippl { localNp, KOKKOS_LAMBDA(const size_t i) { - Qview(i).real() = tempQ(i).x; - Qview(i).imag() = tempQ(i).y; + Qview(i) = tempQ(i).x; }); } } diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index b33761d61..a50bb9007 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -31,6 +31,8 @@ #include "Expression/IpplExpressions.h" #include "Particle/ParticleAttribBase.h" +#include "FFT/FFT.h" +#include "Utility/ParameterList.h" namespace Kokkos { //reduction identity must be defined in Kokkos namespace template<> @@ -59,11 +61,6 @@ namespace ippl { using size_type = detail::size_type; -#ifdef KOKKOS_ENABLE_CUDA - //TODO: Remove hard-coded dimension by having Dim as template - //parameter. Does this need to be in CUDA ifdefs? - using FFT_t = FFT; -#endif // Create storage for M particle attributes. The storage is uninitialized. // New items are appended to the end of the array. @@ -162,35 +159,35 @@ namespace ippl { scatter(Field& f, const ParticleAttrib, Properties... 
>& pp) const; - template + template void - scatterPIFNUDFT(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + scatterPIFNUDFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp) const; template void gather(Field& f, const ParticleAttrib, Properties...>& pp); - template + template void - gatherPIFNUDFT(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + gatherPIFNUDFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp); #ifdef KOKKOS_ENABLE_CUDA template - void initializeNUFFT(FieldLayout& layout, ParameterList& fftParams); + void initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams); - template + template void - scatterPIFNUFFT(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + scatterPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp) const; - template + template void - gatherPIFNUFFT(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp, - ParticleAttrib& q) const; + gatherPIFNUFFT(Field& f, Field& Sk, + const ParticleAttrib, Properties... >& pp, + ParticleAttrib& q); #endif T sum(); @@ -201,8 +198,9 @@ namespace ippl { private: view_type dview_m; #ifdef KOKKOS_ENABLE_CUDA - std::shared_ptr fftType1_mp; - std::shared_ptr fftType2_mp; + //TODO: Remove hard-coded dimension by having Dim as template + //parameter. Does this need to be in CUDA ifdefs? + std::shared_ptr> fftType_mp; #endif }; } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 48d1a4f53..fc6fe1430 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -203,8 +203,8 @@ namespace ippl { template - template - void ParticleAttrib::scatterPIFNUDFT(Field& f, Field& Sk, + template + void ParticleAttrib::scatterPIFNUDFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp) const { @@ -365,10 +365,9 @@ namespace ippl { } template - template - void ParticleAttrib::gatherPIFNUDFT(Field& f, Field& Sk, + template + void ParticleAttrib::gatherPIFNUDFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp) - const { static IpplTimings::TimerRef gatherPIFNUDFTTimer = IpplTimings::getTimer("GatherPIFNUDFT"); IpplTimings::startTimer(gatherPIFNUDFTTimer); @@ -478,17 +477,16 @@ namespace ippl { template template - void initializeNUFFT(FieldLayout& layout, ParameterList& fftParams) { + void ParticleAttrib::initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams) { - fftType1_mp = std::make_shared>(layout, 1, fftParams); - fftType2_mp = std::make_shared>(layout, 2, fftParams); + fftType_mp = std::make_shared>(layout, type, fftParams); } template - template - void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, + template + void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp) const { @@ -496,7 +494,9 @@ namespace ippl { static IpplTimings::TimerRef scatterPIFNUFFTTimer = IpplTimings::getTimer("ScatterPIFNUFFT"); IpplTimings::startTimer(scatterPIFNUFFTTimer); - fftType1_mp->transform(pp, *this, f); + auto q = *this; + + fftType_mp->transform(pp, q, f); using view_type = typename Field::view_type; view_type fview = f.getView(); @@ -529,19 +529,18 @@ namespace ippl { template - template - void ParticleAttrib::gatherPIFNUFFT(Field& f, Field& Sk, + template + void ParticleAttrib::gatherPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... 
>& pp, ParticleAttrib& q) - const { static IpplTimings::TimerRef gatherPIFNUFFTTimer = IpplTimings::getTimer("GatherPIFNUFFT"); IpplTimings::startTimer(gatherPIFNUFFTTimer); Field tempField; - const FieldLayout& layout = f.getLayout(); - const M& mesh = f.get_mesh(); + FieldLayout& layout = f.getLayout(); + M& mesh = f.get_mesh(); tempField.initialize(mesh, layout); @@ -567,18 +566,21 @@ namespace ippl { Kokkos::complex imag = {0.0, 1.0}; size_t Np = *(this->localNum_mp); + + using mdrange_type = Kokkos::MDRangePolicy>; + for(size_t gd = 0; gd < Dim; ++gd) { Kokkos::parallel_for("Gather NUFFT", mdrange_type({nghost, nghost, nghost}, {fview.extent(0) - nghost, fview.extent(1) - nghost, fview.extent(2) - nghost}), - KOKKOS_LAMBDA(const size_t i, - const size_t j, - const size_t k) + KOKKOS_LAMBDA(const int i, + const int j, + const int k) { Vector iVec = {i, j, k}; - Vector_t kVec; + Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { @@ -594,7 +596,7 @@ namespace ippl { tempview(i, j, k) *= -Skview(i, j, k) * (imag * kVec[gd] * factor); }); - fftType2_mp->transform(pp, q, tempField); + fftType_mp->transform(pp, q, tempField); Kokkos::parallel_for("Assign E gather NUFFT", Np, @@ -610,10 +612,10 @@ namespace ippl { } #endif - template + template inline void scatterPIFNUFFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp) { #ifdef KOKKOS_ENABLE_CUDA attrib.scatterPIFNUFFT(f, Sk, pp); @@ -623,11 +625,11 @@ namespace ippl { #endif } - template + template inline - void gatherPIFNUFFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp, - ParticleAttrib& q) + void gatherPIFNUFFT(ParticleAttrib& attrib, Field& f, + Field& Sk, const ParticleAttrib, Properties...>& pp, + ParticleAttrib& q) { #ifdef KOKKOS_ENABLE_CUDA attrib.gatherPIFNUFFT(f, Sk, pp, q); @@ -652,10 +654,10 @@ namespace ippl { attrib.scatter(f, pp); } - template + template inline void scatterPIFNUDFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp) { attrib.scatterPIFNUDFT(f, Sk, pp); } @@ -670,10 +672,10 @@ namespace ippl { attrib.gather(f, pp); } - template + template inline - void gatherPIFNUDFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + void gatherPIFNUDFT(ParticleAttrib& attrib, Field& f, + Field& Sk, const ParticleAttrib, Properties...>& pp) { attrib.gatherPIFNUDFT(f, Sk, pp); } diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index 06ac71234..73629a0e0 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -19,7 +19,7 @@ struct Bunch : public ippl::ParticleBase ~Bunch(){ } - typedef ippl::ParticleAttrib> charge_container_type; + typedef ippl::ParticleAttrib charge_container_type; charge_container_type Q; }; @@ -29,11 +29,11 @@ struct generate_random { using view_type = typename ippl::detail::ViewType::view_type; using value_type = typename T::value_type; - using view_type_complex = typename ippl::detail::ViewType, 1>::view_type; + using view_type_scalar = typename ippl::detail::ViewType::view_type; // Output View for the random numbers view_type x; - view_type_complex Q; + view_type_scalar Q; // The GeneratorPool GeneratorPool rand_pool; @@ -41,7 +41,7 @@ struct generate_random { T minU, maxU; // Initialize all members - generate_random(view_type x_,view_type_complex Q_, GeneratorPool rand_pool_, 
+ generate_random(view_type x_,view_type_scalar Q_, GeneratorPool rand_pool_, T& minU_, T& maxU_) : x(x_), Q(Q_), rand_pool(rand_pool_), minU(minU_), maxU(maxU_) {} @@ -54,8 +54,7 @@ struct generate_random { for (unsigned d = 0; d < Dim; ++d) { x(i)[d] = rand_gen.drand(minU[d], maxU[d]); } - Q(i).real() = rand_gen.drand(0.0, 1.0); - Q(i).imag() = rand_gen.drand(0.0, 1.0); + Q(i) = rand_gen.drand(0.0, 1.0); // Give the state back, which will allow another thread to acquire it rand_pool.free_state(rand_gen); @@ -147,8 +146,6 @@ int main(int argc, char *argv[]) { auto field_result = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), field.getView()); - Kokkos::complex max_error_abs(0.0, 0.0); - Kokkos::complex max_error_rel(0.0, 0.0); //Pick some mode to check. We choose it same as cuFINUFFT testcase cufinufft3d1_test.cu ippl::Vector kVec; diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp index 147c2ba74..f5063b84c 100644 --- a/test/FFT/TestNUFFT2.cpp +++ b/test/FFT/TestNUFFT2.cpp @@ -19,7 +19,7 @@ struct Bunch : public ippl::ParticleBase ~Bunch(){ } - typedef ippl::ParticleAttrib> charge_container_type; + typedef ippl::ParticleAttrib charge_container_type; charge_container_type Q; }; @@ -177,8 +177,6 @@ int main(int argc, char *argv[]) { auto Q_result = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), bunch.Q.getView()); - Kokkos::complex max_error_abs(0.0, 0.0); - Kokkos::complex max_error_rel(0.0, 0.0); //Pick some target point to check. We choose it same as cuFINUFFT testcase cufinufft3d2_test.cu @@ -210,15 +208,15 @@ int main(int argc, char *argv[]) { + imag * Kokkos::Experimental::sin(arg)) * fview(i + nghost, j + nghost, k + nghost); }, Kokkos::Sum>(reducedValue)); - double abs_error_real = std::fabs(reducedValue.real() - Q_result(idx).real()); - double rel_error_real = std::fabs(reducedValue.real() - Q_result(idx).real()) /std::fabs(reducedValue.real()); - double abs_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()); - double rel_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()) /std::fabs(reducedValue.imag()); + double abs_error_real = std::fabs(reducedValue.real() - Q_result(idx)); + double rel_error_real = std::fabs(reducedValue.real() - Q_result(idx)) /std::fabs(reducedValue.real()); + //double abs_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()); + //double rel_error_imag = std::fabs(reducedValue.imag() - Q_result(idx).imag()) /std::fabs(reducedValue.imag()); std::cout << "Abs Error in real part: " << std::setprecision(16) << abs_error_real << " Rel. error in real part: " << std::setprecision(16) << rel_error_real << std::endl; - std::cout << "Abs Error in imag part: " << std::setprecision(16) - << abs_error_imag << " Rel. error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl; + //std::cout << "Abs Error in imag part: " << std::setprecision(16) + // << abs_error_imag << " Rel. 
error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl; //Kokkos::complex max_error(0.0, 0.0); From f06bb87e22e5db0a07020436db3ac0e51c0a32d8 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 18 Feb 2023 12:41:55 +0100 Subject: [PATCH 066/117] some more modifications --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 18 ++++++++++++------ alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 6 ++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 465180ef6..27d0b4133 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -123,7 +123,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { void gather() { - gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); + //gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); + gatherPIFNUDFT(this->E, rho_m, Sk_m, this->R); //Set the charge back to original as we used this view as a //temporary buffer during gather @@ -135,7 +136,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Inform m("scatter "); rho_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rho_m, Sk_m, this->R); + //scatterPIFNUFFT(q, rho_m, Sk_m, this->R); + scatterPIFNUDFT(q, rho_m, Sk_m, this->R); rho_m = rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -185,13 +187,15 @@ class ChargedParticlesPIF : public ippl::ParticleBase { for(size_t d = 0; d < Dim; ++d) { bool shift = (iVec[d] > (N[d]/2)); kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } Kokkos::complex Ek = {0.0, 0.0}; - if(Dr != 0.0) { - Ek = -(imag * kVec[0] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - } + auto rho = rhoview(i+nghost,j+nghost,k+nghost); + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[0] * rho * factor); double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); tlSum += myVal; @@ -398,7 +402,9 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + bool shift = (iVec[d] > (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 2baa8eef4..7c3c5714e 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -193,8 +193,10 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; double alpha = 0.05; - Vector_t rmin(-2.0 * pi); - Vector_t rmax = 2 * pi; + //Vector_t rmin(-2.0 * pi); + //Vector_t rmax = 2 * pi; + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw; Vector_t length = rmax - rmin; double dx = length[0] / nr[0]; double dy = length[1] / nr[1]; From f1934f150dd52461e6a0911168b733c140260341 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 21 Feb 2023 15:58:19 +0100 Subject: [PATCH 067/117] PIF with NUFFT now seems to be working on 1 GPU. 
Need to test more --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 79 +++++++++++++--- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 16 +++- src/FFT/FFT.hpp | 16 +++- src/Particle/ParticleAttrib.hpp | 16 ++-- test/FFT/TestNUFFT1.cpp | 92 ++++++++++++++++++- test/FFT/TestNUFFT2.cpp | 8 +- 6 files changed, 194 insertions(+), 33 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 27d0b4133..a3a797823 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -42,6 +42,8 @@ typedef Field Field_t; typedef Field, Dim> CxField_t; typedef Field VField_t; +typedef ippl::FFT FFT_type; + const double pi = std::acos(-1.0); // Test programs have to define this variable for VTK dump purposes @@ -51,6 +53,7 @@ template class ChargedParticlesPIF : public ippl::ParticleBase { public: CxField_t rho_m; + CxField_t rhoDFT_m; Field_t Sk_m; Vector nr_m; @@ -73,6 +76,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { int shapedegree_m; + std::shared_ptr fft; + public: ParticleAttrib q; // charge typename ippl::ParticleBase::particle_position_type P; // particle velocity @@ -123,8 +128,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { void gather() { - //gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); - gatherPIFNUDFT(this->E, rho_m, Sk_m, this->R); + gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); + //gatherPIFNUDFT(this->E, rho_m, Sk_m, this->R); //Set the charge back to original as we used this view as a //temporary buffer during gather @@ -136,10 +141,15 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Inform m("scatter "); rho_m = {0.0, 0.0}; - //scatterPIFNUFFT(q, rho_m, Sk_m, this->R); - scatterPIFNUDFT(q, rho_m, Sk_m, this->R); + scatterPIFNUFFT(q, rho_m, Sk_m, this->R); + //fft->transform(this->R, q, rho_m); + //rhoDFT_m = {0.0, 0.0}; + //scatterPIFNUDFT(q, rho_m, Sk_m, this->R); + + //dumpFieldData(); rho_m = rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); + //rhoDFT_m = rhoDFT_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); } @@ -185,9 +195,9 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -402,9 +412,9 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -483,6 +493,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { using mdrange_type = Kokkos::MDRangePolicy>; auto Skview = Sk_m.getView(); auto N = nr_m; + const int nghost = Sk_m.getNghost(); const Mesh_t& mesh = rho_m.get_mesh(); const Vector_t& dx = mesh.getMeshSpacing(); const Vector_t& Len = rmax_m - rmin_m; @@ -508,8 +519,9 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector 
kVec; double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); double kh = kVec[d] * dx[d]; bool isNotZero = (kh != 0.0); double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); @@ -518,7 +530,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { //Fourier transform of CIC Sk *= std::pow(arg, order); } - Skview(i, j, k) = Sk; + Skview(i+nghost, j+nghost, k+nghost) = Sk; }); } @@ -592,6 +604,47 @@ class ChargedParticlesPIF : public ippl::ParticleBase { // Ippl::Comm->barrier(); //} + void dumpFieldData() { + + typename CxField_t::HostMirror rhoNUFFT_host = rho_m.getHostMirror(); + typename CxField_t::HostMirror rhoNUDFT_host = rhoDFT_m.getHostMirror(); + Kokkos::deep_copy(rhoNUFFT_host, rho_m.getView()); + Kokkos::deep_copy(rhoNUDFT_host, rhoDFT_m.getView()); + const int nghost = rho_m.getNghost(); + std::stringstream pname; + pname << "data/FieldFFT_"; + pname << Ippl::Comm->rank(); + pname << ".csv"; + Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout.precision(10); + pcsvout.setf(std::ios::scientific, std::ios::floatfield); + pcsvout << "rho" << endl; + for (int i = 0; i< nr_m[0]; i++) { + for (int j = 0; j< nr_m[1]; j++) { + for (int k = 0; k< nr_m[2]; k++) { + pcsvout << rhoNUFFT_host(i+nghost,j+nghost, k+nghost) << endl; + } + } + } + std::stringstream pname2; + pname2 << "data/FieldDFT_"; + pname2 << Ippl::Comm->rank(); + pname2 << ".csv"; + Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout2.precision(10); + pcsvout2.setf(std::ios::scientific, std::ios::floatfield); + pcsvout2 << "rho" << endl; + for (int i = 0; i< nr_m[0]; i++) { + for (int j = 0; j< nr_m[1]; j++) { + for (int k = 0; k< nr_m[2]; k++) { + pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; + } + } + } + Ippl::Comm->barrier(); + } + + //void dumpParticleData() { // typename ParticleAttrib::HostMirror R_host = this->R.getHostMirror(); diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 7c3c5714e..08e187ec8 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -192,9 +192,10 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; + //Vector_t kw = {1.0, 1.0, 1.0}; double alpha = 0.05; - //Vector_t rmin(-2.0 * pi); - //Vector_t rmax = 2 * pi; + //Vector_t rmin(-pi); + //Vector_t rmax(pi); Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw; Vector_t length = rmax - rmin; @@ -212,11 +213,13 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; + //double Q = -64.0 * pi * pi * pi; P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); P->nr_m = nr; P->rho_m.initialize(mesh, FL); + P->rhoDFT_m.initialize(mesh, FL); P->Sk_m.initialize(mesh, FL); P->time_m = 0.0; @@ -232,8 +235,10 @@ int main(int argc, char *argv[]){ Vector_t minU, maxU; //int myRank = Ippl::Comm->rank(); for (unsigned d = 0; d size(); @@ -275,6 +280,9 @@ int main(int argc, char *argv[]){ fftParams.add("use_cufinufft_defaults", false); + + P->fft = std::make_shared(FL, 1, fftParams); + P->q.initializeNUFFT(FL, 1, fftParams); P->E.initializeNUFFT(FL, 2, fftParams); diff --git a/src/FFT/FFT.hpp 
b/src/FFT/FFT.hpp index 59f8d3ca6..b28196de7 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -845,6 +845,20 @@ namespace ippl { auto localNp = R.getParticleCount(); + const Layout_t& layout = f.getLayout(); + const UniformCartesian& mesh = f.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d=0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + const double pi = std::acos(-1.0); + /** * cuFINUFFT's layout is left, hence we allocate the temporary * Kokkos views with the same layout @@ -891,7 +905,7 @@ namespace ippl { KOKKOS_LAMBDA(const size_t i) { for(size_t d = 0; d < Dim; ++d) { - tempR[d](i) = Rview(i)[d]; + tempR[d](i) = Rview(i)[d] * (2.0 * pi / Len[d]); } tempQ(i).x = Qview(i); tempQ(i).y = 0.0; diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index fc6fe1430..429758b36 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -268,8 +268,9 @@ namespace ippl { Vector iVec = {i, j, k}; vector_type kVec; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); } auto Sk = Skview(i+nghost, j+nghost, k+nghost); Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, Np), @@ -428,10 +429,10 @@ namespace ippl { vector_type kVec; double Dr = 0.0, arg = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); //kVec[d] = 2 * pi / Len[d] * iVec[d]; - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d]/2)); Dr += kVec[d] * kVec[d]; arg += kVec[d]*pp(idx)[d]; } @@ -497,6 +498,8 @@ namespace ippl { auto q = *this; fftType_mp->transform(pp, q, f); + + //std::cout << "NUFFT transform done" << std::endl; using view_type = typename Field::view_type; view_type fview = f.getView(); @@ -579,12 +582,13 @@ namespace ippl { const int j, const int k) { - Vector iVec = {i, j, k}; + Vector iVec = {i-nghost, j-nghost, k-nghost}; Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + //kVec[d] = (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index 73629a0e0..c2cf66d09 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -73,7 +73,7 @@ int main(int argc, char *argv[]) { typedef Bunch bunch_type; - std::array pt = {256, 256, 256}; + ippl::Vector pt = {32, 32, 32}; ippl::Index I(pt[0]); ippl::Index J(pt[1]); ippl::Index K(pt[2]); @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) { typedef ippl::Vector Vector_t; Vector_t hx = {dx[0], dx[1], dx[2]}; - Vector_t origin = {-pi, -pi, -pi}; + Vector_t origin = {-2.0 * pi, -2.0 * pi, -2.0 * pi}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); @@ -105,18 +105,19 @@ int main(int argc, char *argv[]) { using size_type = ippl::detail::size_type; - size_type Np = std::pow(256,3) * 8; + size_type Np = std::pow(32,3) * 20; typedef ippl::Field, dim> field_type; field_type field(mesh, layout); + field_type field_dft(mesh, layout); ippl::ParameterList fftParams; 
fftParams.add("gpu_method", 1); fftParams.add("gpu_sort", 1); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-6); + fftParams.add("tolerance", 1e-10); fftParams.add("use_cufinufft_defaults", false); @@ -166,7 +167,85 @@ int main(int argc, char *argv[]) { auto Qview = bunch.Q.getView(); Kokkos::complex imag = {0.0, 1.0}; - + size_t flatN = pt[0] * pt[1] * pt[2]; + auto fview = field_dft.getView(); + + + + typedef Kokkos::TeamPolicy<> team_policy; + typedef Kokkos::TeamPolicy<>::member_type member_type; + + Kokkos::parallel_for("NUDFT type 1", + team_policy(flatN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + const size_t flatIndex = teamMember.league_rank(); + + const int k = (int)(flatIndex / (pt[0] * pt[1])); + const int flatIndex2D = flatIndex - (k * pt[0] * pt[1]); + const int i = flatIndex2D % pt[0]; + const int j = (int)(flatIndex2D / pt[0]); + + Kokkos::complex reducedValue = 0.0; + ippl::Vector iVec = {i, j, k}; + ippl::VectorkVec; + for(size_t d = 0; d < 3; ++d) { + kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - (pt[d] / 2)); + } + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nloc), + [=](const size_t idx, Kokkos::complex& innerReduce) + { + double arg = 0.0; + for(size_t d = 0; d < 3; ++d) { + arg += kVec[d]*Rview(idx)[d]; + } + const double& val = Qview(idx); + + innerReduce += (Kokkos::Experimental::cos(arg) + - imag * Kokkos::Experimental::sin(arg)) * val; + }, Kokkos::Sum>(reducedValue)); + + if(teamMember.team_rank() == 0) { + fview(i+nghost,j+nghost,k+nghost) = reducedValue; + } + + }); + + typename field_type::HostMirror rhoNUDFT_host = field_dft.getHostMirror(); + Kokkos::deep_copy(rhoNUDFT_host, field_dft.getView()); + std::stringstream pname; + pname << "data/FieldFFT_"; + pname << Ippl::Comm->rank(); + pname << ".csv"; + Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout.precision(10); + pcsvout.setf(std::ios::scientific, std::ios::floatfield); + pcsvout << "rho" << endl; + for (int i = 0; i< pt[0]; i++) { + for (int j = 0; j< pt[1]; j++) { + for (int k = 0; k< pt[2]; k++) { + pcsvout << field_result(i+nghost,j+nghost, k+nghost) << endl; + } + } + } + std::stringstream pname2; + pname2 << "data/FieldDFT_"; + pname2 << Ippl::Comm->rank(); + pname2 << ".csv"; + Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout2.precision(10); + pcsvout2.setf(std::ios::scientific, std::ios::floatfield); + pcsvout2 << "rho" << endl; + for (int i = 0; i< pt[0]; i++) { + for (int j = 0; j< pt[1]; j++) { + for (int k = 0; k< pt[2]; k++) { + pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; + } + } + } + Ippl::Comm->barrier(); + + + Kokkos::parallel_reduce("NUDFT type1", nloc, KOKKOS_LAMBDA(const size_t idx, Kokkos::complex& valL) { @@ -188,6 +267,9 @@ int main(int argc, char *argv[]) { << abs_error_real << " Rel. error in real part: " << std::setprecision(16) << rel_error_real << std::endl; std::cout << "Abs Error in imag part: " << std::setprecision(16) << abs_error_imag << " Rel. 
error in imag part: " << std::setprecision(16) << rel_error_imag << std::endl; + std::cout << "Field result: " << std::setprecision(16) + << field_result(iInd,jInd,kInd).real() << " " << std::setprecision(16) << field_result(iInd,jInd,kInd).imag() + << "index: " << iInd << "," << jInd << "," << kInd << std::endl; //Kokkos::complex max_error(0.0, 0.0); diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp index f5063b84c..56ac68622 100644 --- a/test/FFT/TestNUFFT2.cpp +++ b/test/FFT/TestNUFFT2.cpp @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) { //typedef ippl::Vector, 3> CxVector_t; Vector_t hx = {dx[0], dx[1], dx[2]}; - Vector_t origin = {-pi, -pi, -pi}; + Vector_t origin = {-2.0 * pi, -2.0 * pi, -2.0 * pi}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) { using size_type = ippl::detail::size_type; - size_type Np = std::pow(32,3) * 10; + size_type Np = std::pow(32,3) * 20; typedef ippl::Field, dim> field_type; @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) { fftParams.add("gpu_method", 1); fftParams.add("gpu_sort", 1); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-12); + fftParams.add("tolerance", 1e-10); fftParams.add("use_cufinufft_defaults", false); @@ -150,10 +150,10 @@ int main(int argc, char *argv[]) { fft = std::make_unique(layout, type, fftParams); + Vector_t minU = {-pi, -pi, -pi}; Vector_t maxU = {pi, pi, pi}; - size_type nloc = Np/Ippl::Comm->size(); const int nghost = field.getNghost(); From 8778100258e149ddd8013f39b0b1879575a3a8ce Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 21 Feb 2023 16:03:21 +0100 Subject: [PATCH 068/117] bugs in the origin corrected in TestNUFFT1 and TestNUFFT2 --- test/FFT/TestNUFFT1.cpp | 2 +- test/FFT/TestNUFFT2.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index c2cf66d09..a244e7816 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) { typedef ippl::Vector Vector_t; Vector_t hx = {dx[0], dx[1], dx[2]}; - Vector_t origin = {-2.0 * pi, -2.0 * pi, -2.0 * pi}; + Vector_t origin = {-pi, -pi, -pi}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp index 56ac68622..d48abe9fd 100644 --- a/test/FFT/TestNUFFT2.cpp +++ b/test/FFT/TestNUFFT2.cpp @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) { //typedef ippl::Vector, 3> CxVector_t; Vector_t hx = {dx[0], dx[1], dx[2]}; - Vector_t origin = {-2.0 * pi, -2.0 * pi, -2.0 * pi}; + Vector_t origin = {-pi, -pi, -pi}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); From 65664ff5c2df9c4c3f1c7ded9b814094759b542f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 22 Feb 2023 10:34:39 +0100 Subject: [PATCH 069/117] PinT also works with NUFFT. Need to do space-time parallel now. 
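This switches the PinT mini-apps from the direct scatterPIF/gatherPIF kernels to the cuFINUFFT-backed scatterPIFNUFFT/gatherPIFNUFFT, and resets the per-particle charge to Q_m / Np_m after each field gather. The NUFFT path uses the centered mode ordering, kVec[d] = 2*pi/Len[d] * (i - N[d]/2) for i = 0 .. N[d]-1, together with particle coordinates rescaled by 2*pi/Len[d] before the transform, as introduced in the FFT.hpp and ParticleAttrib.hpp changes and mirrored in the ChargedParticlesPinT.hpp hunks below.

The following is a minimal standalone sketch of that convention for a single dimension. It is illustration only, not code from this repository; N, Len and pos are assumed placeholder values.

    // Sketch only: centered NUFFT mode ordering and coordinate rescaling
    // for one dimension. N, Len and pos are hypothetical values.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const double pi  = std::acos(-1.0);
        const int    N   = 8;    // number of Fourier modes (assumed)
        const double Len = 4.0;  // physical domain length (assumed)

        // Modes run from -N/2 to N/2-1 in increasing order,
        // mirroring kVec[d] = 2*pi/Len[d] * (iVec[d] - N[d]/2).
        std::vector<double> k(N);
        for (int i = 0; i < N; ++i) {
            k[i] = 2.0 * pi / Len * (i - N / 2);
        }

        // Positions in [0, Len) are mapped to [0, 2*pi) before the NUFFT,
        // mirroring tempR[d](i) = Rview(i)[d] * (2.0 * pi / Len[d]).
        std::vector<double> pos = {0.5, 1.25, 3.9};
        for (double& x : pos) {
            x *= 2.0 * pi / Len;
        }

        for (int i = 0; i < N; ++i) {
            std::printf("mode %3d  ->  k = % .6f\n", i - N / 2, k[i]);
        }
        for (double x : pos) {
            std::printf("scaled position: %.6f\n", x);
        }
        return 0;
    }

The same mapping is what the direct NUDFT check in TestNUFFT1.cpp evaluates, via kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - pt[d]/2).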
--- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 2 +- alpine/PinT/ChargedParticlesPinT.hpp | 53 +++++++++++++------- alpine/PinT/LandauDampingPinT.cpp | 22 +++++--- src/Particle/ParticleAttrib.hpp | 29 +++++++---- 4 files changed, 71 insertions(+), 35 deletions(-) diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 08e187ec8..74d921320 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -276,7 +276,7 @@ int main(int argc, char *argv[]){ fftParams.add("gpu_method", 1); fftParams.add("gpu_sort", 1); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-10); + fftParams.add("tolerance", 1e-2); fftParams.add("use_cufinufft_defaults", false); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 6bd2360a4..91d835be0 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -69,6 +69,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Q_m; + size_type Np_m; + std::shared_ptr solver_mp; double time_m; @@ -110,12 +112,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector_t rmin, Vector_t rmax, ippl::e_dim_tag decomp[Dim], - double Q) + double Q, + size_type Np) : ippl::ParticleBase(pl) , hr_m(hr) , rmin_m(rmin) , rmax_m(rmax) , Q_m(Q) + , Np_m(Np) { // register the particle attributes this->addAttribute(q); @@ -262,8 +266,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -346,8 +351,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -432,8 +438,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); //kVec[d] = 2 * pi / Len[d] * iVec[d]; Dr += kVec[d] * kVec[d]; } @@ -655,8 +662,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //bool shift = (iVec[d] > (N[d]/2)); + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); double kh = kVec[d] * dx[d]; bool isNotZero = (kh != 0.0); double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); @@ -856,12 +864,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; - scatterPIF(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * 
(rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + + q = Q_m / Np_m; time_m = tStartMySlice; @@ -888,13 +898,15 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; - scatterPIF(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); - + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + + q = Q_m / Np_m; + //kick Ptemp = Ptemp - 0.5 * dt * E; @@ -920,13 +932,15 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; - scatterPIF(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); - + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + + q = Q_m / Np_m; + time_m = tStartMySlice; if((time_m == 0.0)) { @@ -977,13 +991,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; - scatterPIF(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIF(E, rhoPIF_m, Sk_m, Rtemp, q); + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + q = Q_m / Np_m; //kick auto R2view = Rtemp.getView(); auto P2view = Ptemp.getView(); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 5662b2619..985c2c3c3 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -456,10 +456,8 @@ int main(int argc, char *argv[]){ Vector_t kw = {0.5, 0.5, 0.5}; //double alpha = 0.05; Vector_t alpha = {0.05, 0.05, 0.05}; - //Vector_t rmin(0.0); - //Vector_t rmax = 2 * pi / kw ; - Vector_t rmin(-2.0 * pi); - Vector_t rmax = 2 * pi; + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw ; Vector_t length = rmax - rmin; double dxPIC = length[0] / nrPIC[0]; double dyPIC = length[1] / nrPIC[1]; @@ -482,7 +480,7 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,totalP); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -622,7 +620,19 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - + + ippl::ParameterList fftParams; + + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-6); + + fftParams.add("use_cufinufft_defaults", false); + + Pcoarse->q.initializeNUFFT(FLPIF, 1, fftParams); + Pcoarse->E.initializeNUFFT(FLPIF, 2, fftParams); + //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 429758b36..b7b5b92fe 100644 
--- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -496,16 +496,35 @@ namespace ippl { IpplTimings::startTimer(scatterPIFNUFFTTimer); auto q = *this; + + //Field tempField; + + //FieldLayout& layout = f.getLayout(); + //M& mesh = f.get_mesh(); + //tempField.initialize(mesh, layout); + // + //fftType_mp->transform(pp, q, tempField); fftType_mp->transform(pp, q, f); - //std::cout << "NUFFT transform done" << std::endl; using view_type = typename Field::view_type; view_type fview = f.getView(); + //view_type viewLocal = tempField.getView(); typename Field::view_type Skview = Sk.getView(); const int nghost = f.getNghost(); + IpplTimings::stopTimer(scatterPIFNUFFTTimer); + + //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); + //IpplTimings::startTimer(scatterAllReduceTimer); + //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); + //MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, + // MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); + //IpplTimings::stopTimer(scatterAllReduceTimer); + + //IpplTimings::startTimer(scatterPIFNUFFTTimer); + using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("Multiply with shape functions", mdrange_type({nghost, nghost, nghost}, @@ -520,14 +539,6 @@ namespace ippl { }); IpplTimings::stopTimer(scatterPIFNUFFTTimer); - - //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); - //IpplTimings::startTimer(scatterAllReduceTimer); - //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); - //MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, - // MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); - //IpplTimings::stopTimer(scatterAllReduceTimer); - } From 9474ff8d3e85676f802061b86f7923dcd7afec4a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 24 Feb 2023 08:24:33 +0100 Subject: [PATCH 070/117] Twostream instability and Penning trap ran with higher no. 
of modes and particles --- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 2 +- alpine/PinT/BumponTailInstabilityPinT.cpp | 19 ++++++++++++------- alpine/PinT/ChargedParticlesPinT.hpp | 15 +++++++++++++++ alpine/PinT/LandauDampingPinT.cpp | 19 +++++-------------- alpine/PinT/PenningTrapPinT.cpp | 11 +++++++++-- src/Particle/ParticleAttrib.hpp | 2 +- 6 files changed, 43 insertions(+), 25 deletions(-) diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 74d921320..d0e9d3b92 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -276,7 +276,7 @@ int main(int argc, char *argv[]){ fftParams.add("gpu_method", 1); fftParams.add("gpu_sort", 1); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-2); + fftParams.add("tolerance", 1e-4); fftParams.add("use_cufinufft_defaults", false); diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 389f619d9..9abdc69bf 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -507,9 +507,17 @@ int main(int argc, char *argv[]){ FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); PLayout_t PL(FLPIC, meshPIC); + + double factorVelBulk = 1.0 - epsilon; + double factorVelBeam = 1.0 - factorVelBulk; + size_type nlocBulk = (size_type)(factorVelBulk * totalP); + size_type nlocBeam = (size_type)(factorVelBeam * totalP); + size_type nloc = nlocBulk + nlocBeam; + + //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,nloc); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -534,11 +542,6 @@ int main(int argc, char *argv[]){ maxU[d] = CDF(rmax[d], delta, kw[d], d); } - double factorVelBulk = 1.0 - epsilon; - double factorVelBeam = 1.0 - factorVelBulk; - size_type nlocBulk = (size_type)(factorVelBulk * totalP); - size_type nlocBeam = (size_type)(factorVelBeam * totalP); - size_type nloc = nlocBulk + nlocBeam; Pcoarse->create(nloc); Pbegin->create(nloc); @@ -673,7 +676,9 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - + + Pcoarse->initNUFFT(FLPIF); + for (unsigned int it=0; it { solver_mp->setLhs(EfieldPIC_m); } + + void initNUFFT(FieldLayout_t& FLPIF) { + ippl::ParameterList fftParams; + + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-6); + + fftParams.add("use_cufinufft_defaults", false); + + q.initializeNUFFT(FLPIF, 1, fftParams); + E.initializeNUFFT(FLPIF, 2, fftParams); + } + void dumpLandauPIC() { const int nghostE = EfieldPIC_m.getNghost(); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 985c2c3c3..f08a275b4 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -478,9 +478,12 @@ int main(int argc, char *argv[]){ FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); PLayout_t PL(FLPIC, meshPIC); + + size_type nloc = totalP; + //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,totalP); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,nloc); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -507,7 +510,6 
@@ int main(int argc, char *argv[]){ //maxU[d] = rmax[d]; } - size_type nloc = totalP; Pcoarse->create(nloc); Pbegin->create(nloc); @@ -621,18 +623,7 @@ int main(int argc, char *argv[]){ Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - ippl::ParameterList fftParams; - - fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 1); - fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-6); - - fftParams.add("use_cufinufft_defaults", false); - - Pcoarse->q.initializeNUFFT(FLPIF, 1, fftParams); - Pcoarse->E.initializeNUFFT(FLPIF, 2, fftParams); - + Pcoarse->initNUFFT(FLPIF); //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 016351a19..f18fc3aa2 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -471,9 +471,12 @@ int main(int argc, char *argv[]){ FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); PLayout_t PL(FLPIC, meshPIC); + size_type nloc = totalP; + + double Q = -1562.5; double Bext = 5.0; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,nloc); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -498,7 +501,6 @@ int main(int argc, char *argv[]){ maxU[d] = CDF(rmax[d], mu[d], sd[d]); } - size_type nloc = totalP; Pcoarse->create(nloc); Pbegin->create(nloc); @@ -630,6 +632,11 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); + + + Pcoarse->initNUFFT(FLPIF); + + for (unsigned int it=0; ittransform(pp, q, tempField); fftType_mp->transform(pp, q, f); From 5f508e40e99de9bdb5ac79ac306cc0d4c08905e3 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 3 Mar 2023 15:34:01 +0100 Subject: [PATCH 071/117] Block parareal first version done --- alpine/PinT/ChargedParticlesPinT.hpp | 37 +++ alpine/PinT/PenningTrapPinT.cpp | 335 ++++++++++++++++----------- 2 files changed, 240 insertions(+), 132 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 7596d987d..963192044 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -175,6 +175,43 @@ class ChargedParticlesPinT : public ippl::ParticleBase { E.initializeNUFFT(FLPIF, 2, fftParams); } + void initializeParareal(ParticleAttrib& Rbegin, + ParticleAttrib& Pbegin, + bool& isConverged, + bool& isPreviousDomainConverged, + const unsigned int& ntCoarse, + const double& dtCoarse, + const double& tStartMySlice, + const double& Bext) { + + //Copy initial conditions as they are needed later + Kokkos::deep_copy(R0.getView(), R.getView()); + Kokkos::deep_copy(P0.getView(), P.getView()); + + //Get initial guess for ranks other than 0 by propagating the coarse solver + if (Ippl::Comm->rank() > 0) { + BorisPIC(R, P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); + } + + Ippl::Comm->barrier(); + + Kokkos::deep_copy(Rbegin.getView(), R.getView()); + Kokkos::deep_copy(Pbegin.getView(), P.getView()); + + + //Run the coarse integrator to get the values at the end of the time slice + Pcoarse->BorisPIC(R, P, ntCoarse, dtCoarse, tStartMySlice, Bext); + + isConverged = false; + if(Ippl::Comm->rank() == 0) { + isPreviousDomainConverged = true; + } + else { + 
isPreviousDomainConverged = false; + } + } + + void dumpLandauPIC() { const int nghostE = EfieldPIC_m.getNghost(); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index f18fc3aa2..d76885630 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -406,15 +406,16 @@ int main(int argc, char *argv[]){ const size_type totalP = std::atoll(argv[7]); const double tEnd = std::atof(argv[8]); - const double dtSlice = tEnd / Ippl::Comm->size(); + const unsigned int maxCycles = std::atoi(argv[12]); + double tEndCycle = tEnd / maxCycles; + const double dtSlice = tEndCycle / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = std::ceil(dtSlice / dtFine); const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); - const unsigned int maxIter = std::atoi(argv[12]); + //const unsigned int maxIter = std::atoi(argv[12]); - const double tStartMySlice = Ippl::Comm->rank() * dtSlice; //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; @@ -491,7 +492,6 @@ int main(int argc, char *argv[]){ //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); - Pcoarse->time_m = tStartMySlice; IpplTimings::startTimer(particleCreation); @@ -509,9 +509,41 @@ int main(int argc, char *argv[]){ using buffer_type = ippl::Communicate::buffer_type; int tag; #ifdef KOKKOS_ENABLE_CUDA + //If we don't do the following even with the same seed the initial + //condition is not the same on different GPUs + //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + //if(Ippl::Comm->rank() == 0) { + // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + // Kokkos::parallel_for(nloc, + // generate_random, Dim>( + // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, + // minU, maxU)); + + + // Kokkos::fence(); + // size_type bufSize = Pbegin->packedSize(nloc); + // std::vector requests(0); + // int sends = 0; + // for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); + // requests.resize(requests.size() + 1); + // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); + // buf->resetWritePos(); + // ++sends; + // } + // MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + //} + //else { + // size_type bufSize = Pbegin->packedSize(nloc); + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + // buf->resetReadPos(); + //} + //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::parallel_for(nloc, @@ -521,24 +553,24 @@ int main(int argc, char *argv[]){ Kokkos::fence(); - size_type bufSize = Pbegin->packedSize(nloc); - std::vector requests(0); - int sends = 0; - for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); - requests.resize(requests.size() + 1); - Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); - buf->resetWritePos(); - ++sends; - } - MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); } else { size_type 
bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); } + + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pbegin, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + Ippl::Comm->barrier(); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); @@ -565,7 +597,8 @@ int main(int argc, char *argv[]){ << "No. of coarse time steps: " << ntCoarse << endl << "Tolerance: " << tol - << " Max. iterations: " << maxIter + //<< " Max. iterations: " << maxIter + << " Max. cycles: " << maxCycles << endl << "Np= " << nloc << " Fourier modes = " << nmPIF @@ -578,55 +611,59 @@ int main(int argc, char *argv[]){ msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); - //Get initial guess for ranks other than 0 by propagating the coarse solver - IpplTimings::startTimer(coarsePropagator); - if (Ippl::Comm->rank() > 0) { - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); - } - - Ippl::Comm->barrier(); - - IpplTimings::stopTimer(coarsePropagator); + ////Get initial guess for ranks other than 0 by propagating the coarse solver + //IpplTimings::startTimer(coarsePropagator); + //if (Ippl::Comm->rank() > 0) { + // Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); + //} + // + //Ippl::Comm->barrier(); + // + //IpplTimings::stopTimer(coarsePropagator); - msg << "First Boris PIC done " << endl; + //msg << "First Boris PIC done " << endl; - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); + // + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); - //Run the coarse integrator to get the values at the end of the time slice - IpplTimings::startTimer(coarsePropagator); - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); - IpplTimings::stopTimer(coarsePropagator); - msg << "Second Boris PIC done " << endl; + ////Run the coarse integrator to get the values at the end of the time slice + //IpplTimings::startTimer(coarsePropagator); + //Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); + //IpplTimings::stopTimer(coarsePropagator); + //msg << "Second Boris PIC done " << endl; - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + 
////Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + + ////The following might not be needed + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + + //msg << "Starting parareal iterations ..." << endl; + //bool isConverged = false; + //bool isPreviousDomainConverged; + //if(Ippl::Comm->rank() == 0) { + // isPreviousDomainConverged = true; + //} + //else { + // isPreviousDomainConverged = false; + //} + + bool isConverged, isPreviousDomainConverged; - //The following might not be needed - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - msg << "Starting parareal iterations ..." << endl; - bool isConverged = false; - bool isPreviousDomainConverged; - if(Ippl::Comm->rank() == 0) { - isPreviousDomainConverged = true; - } - else { - isPreviousDomainConverged = false; - } - Pcoarse->shapetype_m = argv[13]; Pcoarse->shapedegree_m = std::atoi(argv[14]); IpplTimings::startTimer(initializeShapeFunctionPIF); @@ -636,105 +673,139 @@ int main(int argc, char *argv[]){ Pcoarse->initNUFFT(FLPIF); - - for (unsigned int it=0; itBorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1, Bext); - IpplTimings::stopTimer(finePropagator); - + + unsigned int it = 0; + for (unsigned int nc=0; nc < maxCycles; nc++) { + double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); + Pcoarse->time_m = tStartMySlice; + Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, + isPreviousDomainConverged, ntCoarse, + dtCoarse, tStartMySlice, Bext); + while ((!isPreviousDomainConverged) || (!isConverged)) { + //for (unsigned int it=0; it < maxIter; it++) { + + //Run fine integrator in parallel + IpplTimings::startTimer(finePropagator); + Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1, Bext); + IpplTimings::stopTimer(finePropagator); + - //Difference = Fine - Coarse - Pend->R = Pbegin->R - Pcoarse->R; - Pend->P = Pbegin->P - Pcoarse->P; + //Difference = Fine - Coarse + Pend->R = Pbegin->R - Pcoarse->R; + Pend->P = Pbegin->P - Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); - //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); + //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - - IpplTimings::startTimer(timeCommunication); - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { - size_type bufSize = Pbegin->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); - buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, - Ippl::getComm(), MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); - 
Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); IpplTimings::stopTimer(deepCopy); - } - IpplTimings::stopTimer(timeCommunication); + + IpplTimings::startTimer(timeCommunication); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, + Ippl::getComm(), MPI_STATUS_IGNORE); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + } + IpplTimings::stopTimer(timeCommunication); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); - IpplTimings::startTimer(coarsePropagator); - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); - IpplTimings::stopTimer(coarsePropagator); + IpplTimings::startTimer(coarsePropagator); + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); + IpplTimings::stopTimer(coarsePropagator); - Pend->R = Pend->R + Pcoarse->R; - Pend->P = Pend->P + Pcoarse->P; + Pend->R = Pend->R + Pcoarse->R; + Pend->P = Pend->P + Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); - PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); - IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - - IpplTimings::stopTimer(computeErrors); + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); + IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + IpplTimings::stopTimer(computeErrors); - if((Rerror <= tol) && (Perror <= tol)) { - isConverged = true; - } + if((Rerror <= tol) && (Perror <= tol)) { + isConverged = true; + } + IpplTimings::startTimer(timeCommunication); + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = 
Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + } + IpplTimings::stopTimer(timeCommunication); + + + msg << "Finished iteration: " << it+1 + << " in cycle: " << nc+1 + << " Rerror: " << Rerror + << " Perror: " << Perror + << endl; + + IpplTimings::startTimer(dumpData); + //Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, it+1); + //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); + IpplTimings::stopTimer(dumpData); + + it += 1; + //if(isConverged && isPreviousDomainConverged) { + // break; + //} + } + + Ippl::Comm->barrier(); IpplTimings::startTimer(timeCommunication); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + buf->resetReadPos(); + } + if(Ippl::Comm->rank() > 0) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); } IpplTimings::stopTimer(timeCommunication); - - - msg << "Finished iteration: " << it+1 - << " Rerror: " << Rerror - << " Perror: " << Perror - << endl; - - IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, it+1); - //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - IpplTimings::stopTimer(dumpData); - - if(isConverged && isPreviousDomainConverged) { - break; - } + Ippl::Comm->barrier(); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); } - - Ippl::Comm->barrier(); msg << TestName << " Parareal: End." << endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); From 1331f37f619b81075da7b75edd876f0408f45ec0 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 4 Mar 2023 08:03:52 +0100 Subject: [PATCH 072/117] Output writing changed for block parareal. 
Need to compile and test --- alpine/PinT/ChargedParticlesPinT.hpp | 20 +++++++++++++------- alpine/PinT/PenningTrapPinT.cpp | 17 ++++++++--------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 963192044..07f823292 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -450,7 +450,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { - void dumpEnergy(size_type /*totalP*/, const unsigned int& iter, ParticleAttrib& Ptemp) { + void dumpEnergy(size_type /*totalP*/, const unsigned int& nc, + const unsigned int& iter, ParticleAttrib& Ptemp) { double potentialEnergy, kineticEnergy; @@ -543,8 +544,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { kineticEnergy = globaltemp; std::stringstream fname; - fname << "data/Energy_"; + fname << "data/Energy_rank_"; fname << Ippl::Comm->rank(); + fname << "_nc_"; + fname << nc; fname << "_iter_"; fname << iter; fname << ".csv"; @@ -592,11 +595,13 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } } - void writelocalError(double Rerror, double Perror, unsigned int iter) { + void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter) { std::stringstream fname; - fname << "data/localError_"; + fname << "data/localError_rank_"; fname << Ippl::Comm->rank(); + fname << "_nc_"; + fname << nc; fname << ".csv"; Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); @@ -977,7 +982,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, const double& dt, const bool& /*isConverged*/, - const double& tStartMySlice, const unsigned int& iter, const double& Bext) { + const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, const double& Bext) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); @@ -997,7 +1003,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), iter, Ptemp); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -1074,7 +1080,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), iter, Ptemp); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index d76885630..1c03e69b9 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -6,7 +6,7 @@ // European Conference on Parallel Processing. Springer, Cham, 2017. // // Usage: -// srun ./PenningTrapPinT +// srun ./PenningTrapPinT // --info 5 // nmx = No. of Fourier modes in the x-direction // nmy = No. of Fourier modes in the y-direction @@ -15,6 +15,7 @@ // ny = No. of grid points in the y-direction // nz = No. of grid points in the z-direction // Np = Total no. of macro-particles in the simulation +// nCycles = No. 
of Parareal blocks/cycles // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) // Example: @@ -406,8 +407,8 @@ int main(int argc, char *argv[]){ const size_type totalP = std::atoll(argv[7]); const double tEnd = std::atof(argv[8]); - const unsigned int maxCycles = std::atoi(argv[12]); - double tEndCycle = tEnd / maxCycles; + const unsigned int nCycles = std::atoi(argv[12]); + double tEndCycle = tEnd / nCycles; const double dtSlice = tEndCycle / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); @@ -598,7 +599,7 @@ int main(int argc, char *argv[]){ << endl << "Tolerance: " << tol //<< " Max. iterations: " << maxIter - << " Max. cycles: " << maxCycles + << " Max. cycles: " << nCycles << endl << "Np= " << nloc << " Fourier modes = " << nmPIF @@ -662,8 +663,6 @@ int main(int argc, char *argv[]){ bool isConverged, isPreviousDomainConverged; - - Pcoarse->shapetype_m = argv[13]; Pcoarse->shapedegree_m = std::atoi(argv[14]); IpplTimings::startTimer(initializeShapeFunctionPIF); @@ -675,7 +674,7 @@ int main(int argc, char *argv[]){ unsigned int it = 0; - for (unsigned int nc=0; nc < maxCycles; nc++) { + for (unsigned int nc=0; nc < nCycles; nc++) { double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); Pcoarse->time_m = tStartMySlice; Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, @@ -686,7 +685,7 @@ int main(int argc, char *argv[]){ //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1, Bext); + Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, nc+1, it+1, Bext); IpplTimings::stopTimer(finePropagator); @@ -771,7 +770,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); From 2b46726cf3e1ed6d953a843a6f95299fc4e447fb Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 6 Mar 2023 22:24:17 +0100 Subject: [PATCH 073/117] multiCycle Parareal implemented for PenningTrap and TSI. 
Need to postprocess and verify --- alpine/PinT/BumponTailInstabilityPinT.cpp | 364 +++++++++++++--------- alpine/PinT/ChargedParticlesPinT.hpp | 61 +++- alpine/PinT/PenningTrapPinT.cpp | 67 ++-- 3 files changed, 304 insertions(+), 188 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 9abdc69bf..188760492 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -420,16 +420,18 @@ int main(int argc, char *argv[]){ const size_type totalP = std::atoll(argv[7]); const double tEnd = std::atof(argv[8]); - const double dtSlice = tEnd / Ippl::Comm->size(); + const unsigned int nCycles = std::atoi(argv[12]); + double tEndCycle = tEnd / nCycles; + const double dtSlice = tEndCycle / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = std::ceil(dtSlice / dtFine); const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); - const unsigned int maxIter = std::atoi(argv[12]); + //const unsigned int maxIter = std::atoi(argv[12]); - const double tStartMySlice = Ippl::Comm->rank() * dtSlice; + //const double tStartMySlice = Ippl::Comm->rank() * dtSlice; //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; @@ -532,7 +534,6 @@ int main(int argc, char *argv[]){ //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); - Pcoarse->time_m = tStartMySlice; IpplTimings::startTimer(particleCreation); @@ -552,7 +553,42 @@ int main(int argc, char *argv[]){ #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs + //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + //if(Ippl::Comm->rank() == 0) { + // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + // Kokkos::parallel_for(nloc, + // generate_random, Dim>( + // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, delta, kw, + // sigma, muBulk, muBeam, nlocBulk, minU, maxU)); + + + // Kokkos::fence(); + // size_type bufSize = Pbegin->packedSize(nloc); + // std::vector requests(0); + // int sends = 0; + // for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); + // requests.resize(requests.size() + 1); + // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); + // buf->resetWritePos(); + // ++sends; + // } + // MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + //} + //else { + // size_type bufSize = Pbegin->packedSize(nloc); + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + // buf->resetReadPos(); + //} + //Ippl::Comm->barrier(); + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + //IpplTimings::stopTimer(deepCopy); + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::parallel_for(nloc, @@ -562,25 +598,25 @@ int main(int argc, char *argv[]){ Kokkos::fence(); - size_type bufSize = Pbegin->packedSize(nloc); - std::vector requests(0); - int sends = 0; - for(int rank = 1; rank < Ippl::Comm->size(); 
++rank) { - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); - requests.resize(requests.size() + 1); - Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); - buf->resetWritePos(); - ++sends; - } - MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); } else { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); } - Ippl::Comm->barrier(); + + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pbegin, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + //Ippl::Comm->barrier(); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); @@ -609,7 +645,7 @@ int main(int argc, char *argv[]){ << "No. of coarse time steps: " << ntCoarse << endl << "Tolerance: " << tol - << " Max. iterations: " << maxIter + << " No. of cycles: " << nCycles << endl << "Np= " << nloc << " Fourier modes = " << nmPIF @@ -622,55 +658,57 @@ int main(int argc, char *argv[]){ msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - - //Get initial guess for ranks other than 0 by propagating the coarse solver - IpplTimings::startTimer(coarsePropagator); - if (Ippl::Comm->rank() > 0) { - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); - } - - Ippl::Comm->barrier(); - - IpplTimings::stopTimer(coarsePropagator); - - msg << "First Leap frog PIC done " << endl; - - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - - - //Run the coarse integrator to get the values at the end of the time slice - IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - IpplTimings::stopTimer(coarsePropagator); - msg << "Second Leap frog PIC done " << endl; - - //Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - - //The following might not be needed - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + ////Get initial guess for ranks other than 0 by propagating the coarse solver + //IpplTimings::startTimer(coarsePropagator); + //if (Ippl::Comm->rank() > 0) { + // Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); + //} + // + //Ippl::Comm->barrier(); + // + 
//IpplTimings::stopTimer(coarsePropagator); + + //msg << "First Leap frog PIC done " << endl; + + // + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + + ////Run the coarse integrator to get the values at the end of the time slice + //IpplTimings::startTimer(coarsePropagator); + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + //IpplTimings::stopTimer(coarsePropagator); + //msg << "Second Leap frog PIC done " << endl; + + ////Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); + + ////The following might not be needed + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + + //msg << "Starting parareal iterations ..." << endl; + //bool isConverged = false; + //bool isPreviousDomainConverged; + //if(Ippl::Comm->rank() == 0) { + // isPreviousDomainConverged = true; + //} + //else { + // isPreviousDomainConverged = false; + //} + + bool isConverged, isPreviousDomainConverged; - msg << "Starting parareal iterations ..." << endl; - bool isConverged = false; - bool isPreviousDomainConverged; - if(Ippl::Comm->rank() == 0) { - isPreviousDomainConverged = true; - } - else { - isPreviousDomainConverged = false; - } - Pcoarse->shapetype_m = argv[13]; Pcoarse->shapedegree_m = std::atoi(argv[14]); IpplTimings::startTimer(initializeShapeFunctionPIF); @@ -679,108 +717,140 @@ int main(int argc, char *argv[]){ Pcoarse->initNUFFT(FLPIF); - for (unsigned int it=0; itLeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); - IpplTimings::stopTimer(finePropagator); + for (unsigned int nc=0; nc < nCycles; nc++) { + double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); + Pcoarse->time_m = tStartMySlice; + Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, + isPreviousDomainConverged, ntCoarse, + dtCoarse, tStartMySlice); + unsigned int it = 0; + while (!isConverged) { + //Run fine integrator in parallel + IpplTimings::startTimer(finePropagator); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, nc+1, it+1); + IpplTimings::stopTimer(finePropagator); - //Difference = Fine - Coarse - Pend->R = Pbegin->R - Pcoarse->R; - Pend->P = Pbegin->P - Pcoarse->P; + //Difference = Fine - Coarse + Pend->R = Pbegin->R - Pcoarse->R; + Pend->P = Pbegin->P - Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); - //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); + //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - - IpplTimings::startTimer(timeCommunication); - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { - size_type bufSize = Pbegin->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, 
tag, *Pbegin, *buf, bufSize, nloc); - buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, - Ippl::getComm(), MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); IpplTimings::stopTimer(deepCopy); - } - IpplTimings::stopTimer(timeCommunication); + + IpplTimings::startTimer(timeCommunication); + tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, + Ippl::getComm(), MPI_STATUS_IGNORE); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + } + IpplTimings::stopTimer(timeCommunication); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); - IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - IpplTimings::stopTimer(coarsePropagator); + IpplTimings::startTimer(coarsePropagator); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + IpplTimings::stopTimer(coarsePropagator); - Pend->R = Pend->R + Pcoarse->R; - Pend->P = Pend->P + Pcoarse->P; + Pend->R = Pend->R + Pcoarse->R; + Pend->P = Pend->P + Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); + //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); - PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); - double localRerror, localPerror; - - IpplTimings::startTimer(computeErrors); - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); + double localRerror, localPerror; + + IpplTimings::startTimer(computeErrors); + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - IpplTimings::stopTimer(computeErrors); - //} + IpplTimings::stopTimer(computeErrors); + //} - if((Rerror <= tol) && 
(Perror <= tol)) { - isConverged = true; - } + if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) { + isConverged = true; + } - IpplTimings::startTimer(timeCommunication); - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pend->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); - MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); - buf->resetWritePos(); - MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + IpplTimings::startTimer(timeCommunication); + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + } + IpplTimings::stopTimer(timeCommunication); + + + msg << "Finished iteration: " << it+1 + << " in cycle: " << nc+1 + << " Rerror: " << Rerror + << " Perror: " << Perror + << endl; + + IpplTimings::startTimer(dumpData); + //Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1); + //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { + //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); + //} + IpplTimings::stopTimer(dumpData); + + it += 1; } - IpplTimings::stopTimer(timeCommunication); - - msg << "Finished iteration: " << it+1 - << " Rerror: " << Rerror - << " Perror: " << Perror - << endl; - - IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, it+1); - //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { - //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - //} - IpplTimings::stopTimer(dumpData); - - if(isConverged && isPreviousDomainConverged) { - break; + Ippl::Comm->barrier(); + if((nCycles > 1) && (nc < (nCycles - 1))) { + IpplTimings::startTimer(timeCommunication); + tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + buf->resetReadPos(); + } + if(Ippl::Comm->rank() > 0) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + IpplTimings::stopTimer(timeCommunication); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); } } - - Ippl::Comm->barrier(); msg << TestName << " Parareal: End." 
<< endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 07f823292..bb27e0201 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -185,22 +185,22 @@ class ChargedParticlesPinT : public ippl::ParticleBase { const double& Bext) { //Copy initial conditions as they are needed later - Kokkos::deep_copy(R0.getView(), R.getView()); + Kokkos::deep_copy(R0.getView(), this->R.getView()); Kokkos::deep_copy(P0.getView(), P.getView()); //Get initial guess for ranks other than 0 by propagating the coarse solver if (Ippl::Comm->rank() > 0) { - BorisPIC(R, P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); + BorisPIC(this->R, P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); } - Ippl::Comm->barrier(); + //Ippl::Comm->barrier(); - Kokkos::deep_copy(Rbegin.getView(), R.getView()); + Kokkos::deep_copy(Rbegin.getView(), this->R.getView()); Kokkos::deep_copy(Pbegin.getView(), P.getView()); //Run the coarse integrator to get the values at the end of the time slice - Pcoarse->BorisPIC(R, P, ntCoarse, dtCoarse, tStartMySlice, Bext); + BorisPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice, Bext); isConverged = false; if(Ippl::Comm->rank() == 0) { @@ -211,6 +211,40 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } } + void initializeParareal(ParticleAttrib& Rbegin, + ParticleAttrib& Pbegin, + bool& isConverged, + bool& isPreviousDomainConverged, + const unsigned int& ntCoarse, + const double& dtCoarse, + const double& tStartMySlice) { + + //Copy initial conditions as they are needed later + Kokkos::deep_copy(R0.getView(), this->R.getView()); + Kokkos::deep_copy(P0.getView(), P.getView()); + + //Get initial guess for ranks other than 0 by propagating the coarse solver + if (Ippl::Comm->rank() > 0) { + LeapFrogPIC(this->R, P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); + } + + //Ippl::Comm->barrier(); + + Kokkos::deep_copy(Rbegin.getView(), this->R.getView()); + Kokkos::deep_copy(Pbegin.getView(), P.getView()); + + + //Run the coarse integrator to get the values at the end of the time slice + LeapFrogPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice); + + isConverged = false; + if(Ippl::Comm->rank() == 0) { + isPreviousDomainConverged = true; + } + else { + isPreviousDomainConverged = false; + } + } void dumpLandauPIC() { @@ -362,7 +396,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { << ExAmp << endl; } - void dumpBumponTail(const unsigned int& iter) { + void dumpBumponTail(const unsigned int& nc, const unsigned int& iter) { double fieldEnergy = 0.0; @@ -430,8 +464,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { std::stringstream fname; - fname << "data/FieldBumponTail_"; + fname << "data/FieldBumponTail_rank_"; fname << Ippl::Comm->rank(); + fname << "_nc_"; + fname << nc; fname << "_iter_"; fname << iter; fname << ".csv"; @@ -914,7 +950,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, const double& dt, const bool& /*isConverged*/, - const double& tStartMySlice, const unsigned int& iter) { + const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); @@ -935,8 +972,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { 
IpplTimings::startTimer(dumpData); //dumpLandau(iter); - dumpBumponTail(iter); - dumpEnergy(this->getLocalNum(), iter, Ptemp); + dumpBumponTail(nc, iter); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { IpplTimings::startTimer(dumpData); //dumpLandau(iter); - dumpBumponTail(iter); - dumpEnergy(this->getLocalNum(), iter, Ptemp); + dumpBumponTail(nc, iter); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 1c03e69b9..1f3f411c6 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -572,7 +572,7 @@ int main(int argc, char *argv[]){ MPI_Wait(&request, MPI_STATUS_IGNORE); } - Ippl::Comm->barrier(); + //Ippl::Comm->barrier(); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); @@ -599,7 +599,7 @@ int main(int argc, char *argv[]){ << endl << "Tolerance: " << tol //<< " Max. iterations: " << maxIter - << " Max. cycles: " << nCycles + << " No. of cycles: " << nCycles << endl << "Np= " << nloc << " Fourier modes = " << nmPIF @@ -673,14 +673,15 @@ int main(int argc, char *argv[]){ Pcoarse->initNUFFT(FLPIF); - unsigned int it = 0; for (unsigned int nc=0; nc < nCycles; nc++) { double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); Pcoarse->time_m = tStartMySlice; Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, isPreviousDomainConverged, ntCoarse, dtCoarse, tStartMySlice, Bext); - while ((!isPreviousDomainConverged) || (!isConverged)) { + unsigned int it = 0; + while (!isConverged) { + //while ((!isPreviousDomainConverged) || (!isConverged)) { //for (unsigned int it=0; it < maxIter; it++) { //Run fine integrator in parallel @@ -703,8 +704,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(timeCommunication); - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); @@ -744,7 +745,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(computeErrors); - if((Rerror <= tol) && (Perror <= tol)) { + if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) { isConverged = true; } @@ -780,30 +781,38 @@ int main(int argc, char *argv[]){ //} } + //std::cout << "Before barrier in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; Ippl::Comm->barrier(); - IpplTimings::startTimer(timeCommunication); - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pend->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); - buf->resetReadPos(); - } - if(Ippl::Comm->rank() > 0) { - size_type bufSize = Pend->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); - MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); - buf->resetWritePos(); - MPI_Wait(&request, MPI_STATUS_IGNORE); + //msg 
<< "Communication started in cycle: " << nc+1 << endl; + //std::cout << "Communication started in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; + if((nCycles > 1) && (nc < (nCycles - 1))) { + IpplTimings::startTimer(timeCommunication); + tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + buf->resetReadPos(); + } + if(Ippl::Comm->rank() > 0) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + IpplTimings::stopTimer(timeCommunication); + //std::cout << "Communication finished in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; + //Ippl::Comm->barrier(); + + //msg << "Communication finished in cycle: " << nc+1 << endl; + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); } - IpplTimings::stopTimer(timeCommunication); - Ippl::Comm->barrier(); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - IpplTimings::stopTimer(deepCopy); } msg << TestName << " Parareal: End." << endl; IpplTimings::stopTimer(mainTimer); From d890e035291ad6712e2fb7cb6149c862d8c6faff Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 12 Apr 2023 14:30:26 +0200 Subject: [PATCH 074/117] Kokkos::Experimenta->numbers for v 4.0.0 --- src/Particle/ParticleAttrib.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index f453f7294..6de534ae1 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -282,8 +282,8 @@ namespace ippl { } const value_type& val = dview_m(idx); - innerReduce += Sk * (Kokkos::Experimental::cos(arg) - - imag * Kokkos::Experimental::sin(arg)) * val; + innerReduce += Sk * (Kokkos::numbers::cos(arg) + - imag * Kokkos::numbers::sin(arg)) * val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { @@ -450,10 +450,10 @@ namespace ippl { //Inverse Fourier transform when the lhs is real. 
Use when //we choose k \in [0 K) instead of from [-K/2+1 K/2] - //Ex[d] = 2.0 * (Ek.real() * Kokkos::Experimental::cos(arg) - // - Ek.imag() * Kokkos::Experimental::sin(arg)); - Ek *= Sk * (Kokkos::Experimental::cos(arg) - + imag * Kokkos::Experimental::sin(arg)); + //Ex[d] = 2.0 * (Ek.real() * Kokkos::numbers::cos(arg) + // - Ek.imag() * Kokkos::numbers::sin(arg)); + Ek *= Sk * (Kokkos::numbers::cos(arg) + + imag * Kokkos::numbers::sin(arg)); Ex[d] = Ek.real(); } From a06118ea90c91e0d2d4cc4585b4272579b768cd7 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 12 Apr 2023 14:55:45 +0200 Subject: [PATCH 075/117] Kokkos::numbers removed for v 4.0.0 --- alpine/PinT/ChargedParticlesPinT.hpp | 2 +- src/Particle/ParticleAttrib.hpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index bb27e0201..f324d2e26 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -761,7 +761,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double kh = kVec[d] * dx[d]; bool isNotZero = (kh != 0.0); double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); - double arg = isNotZero * (Kokkos::Experimental::sin(kh) * factor) + + double arg = isNotZero * (Kokkos::sin(kh) * factor) + (!isNotZero) * 1.0; //Fourier transform of CIC Sk *= std::pow(arg, order); diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 6de534ae1..39ba19e3b 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -282,8 +282,8 @@ namespace ippl { } const value_type& val = dview_m(idx); - innerReduce += Sk * (Kokkos::numbers::cos(arg) - - imag * Kokkos::numbers::sin(arg)) * val; + innerReduce += Sk * (Kokkos::cos(arg) + - imag * Kokkos::sin(arg)) * val; }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { @@ -450,10 +450,10 @@ namespace ippl { //Inverse Fourier transform when the lhs is real. 
Use when //we choose k \in [0 K) instead of from [-K/2+1 K/2] - //Ex[d] = 2.0 * (Ek.real() * Kokkos::numbers::cos(arg) - // - Ek.imag() * Kokkos::numbers::sin(arg)); - Ek *= Sk * (Kokkos::numbers::cos(arg) - + imag * Kokkos::numbers::sin(arg)); + //Ex[d] = 2.0 * (Ek.real() * Kokkos::cos(arg) + // - Ek.imag() * Kokkos::sin(arg)); + Ek *= Sk * (Kokkos::cos(arg) + + imag * Kokkos::sin(arg)); Ex[d] = Ek.real(); } From b27bc5dc7244c06bfe226369d544687f197ecd05 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 12 Apr 2023 15:34:10 +0200 Subject: [PATCH 076/117] LandauDamping modified for multiBlock Parareal --- .../BumponTailInstabilityPIF.cpp | 2 +- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 2 +- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 2 +- alpine/PinT/BumponTailInstabilityPinT.cpp | 5 +- alpine/PinT/LandauDampingPinT.cpp | 401 ++++++++++-------- alpine/PinT/PenningTrapPinT.cpp | 2 +- test/FFT/TestNUFFT1.cpp | 8 +- test/FFT/TestNUFFT2.cpp | 4 +- 8 files changed, 243 insertions(+), 183 deletions(-) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index 2ac5b18f4..5ddbd6c46 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -242,7 +242,7 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -rmax[0] * rmax[1] * rmax[2]; - P = std::make_unique(PL,hr,rmin,rmax,decomp,Q); + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); P->nr_m = nr; diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index a3a797823..e2688105f 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -525,7 +525,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double kh = kVec[d] * dx[d]; bool isNotZero = (kh != 0.0); double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); - double arg = isNotZero * (Kokkos::Experimental::sin(kh) * factor) + + double arg = isNotZero * (Kokkos::sin(kh) * factor) + (!isNotZero) * 1.0; //Fourier transform of CIC Sk *= std::pow(arg, order); diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index 8c5613b1b..1c81783b9 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -209,7 +209,7 @@ int main(int argc, char *argv[]){ double Q = -1562.5; double Bext = 5.0; - P = std::make_unique(PL,hr,rmin,rmax,decomp,Q); + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); P->nr_m = nr; diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 188760492..cf1a2c8e3 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -7,7 +7,7 @@ // // Usage: // srun ./BumponTailInstabilityPinT -// --info 5 +// --info 5 // nmx = No. of Fourier modes in the x-direction // nmy = No. of Fourier modes in the y-direction // nmz = No. of Fourier modes in the z-direction @@ -15,10 +15,11 @@ // ny = No. of grid points in the y-direction // nz = No. of grid points in the z-direction // Np = Total no. of macro-particles in the simulation +// nCycles = No. 
of Parareal blocks/cycles // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) // Example: -// srun ./BumponTailInstabilityPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 100 B-spline 1 --info 5 +// srun ./BumponTailInstabilityPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 4 B-spline 1 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index f08a275b4..da2491d49 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -7,7 +7,7 @@ // // Usage: // srun ./LandauDampingPinT -// --info 5 +// --info 5 // nmx = No. of Fourier modes in the x-direction // nmy = No. of Fourier modes in the y-direction // nmz = No. of Fourier modes in the z-direction @@ -15,10 +15,11 @@ // ny = No. of grid points in the y-direction // nz = No. of grid points in the z-direction // Np = Total no. of macro-particles in the simulation +// nCycles = No. of Parareal blocks/cycles // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) // Example: -// srun ./LandauDampingPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 100 B-spline 1 --info 5 +// srun ./LandauDampingPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 4 B-spline 1 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -404,33 +405,18 @@ int main(int argc, char *argv[]){ const size_type totalP = std::atoll(argv[7]); const double tEnd = std::atof(argv[8]); - const double dtSlice = tEnd / Ippl::Comm->size(); + const unsigned int nCycles = std::atoi(argv[12]); + double tEndCycle = tEnd / nCycles; + const double dtSlice = tEndCycle / Ippl::Comm->size(); const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = std::ceil(dtSlice / dtFine); const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); - const unsigned int maxIter = std::atoi(argv[12]); - const double tStartMySlice = Ippl::Comm->rank() * dtSlice; + //const double tStartMySlice = Ippl::Comm->rank() * dtSlice; //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; - msg << "Parareal " - << TestName - << endl - << "Slice dT: " << dtSlice - << endl - << "No. of fine time steps: " << ntFine - << endl - << "No. of coarse time steps: " << ntCoarse - << endl - << "Tolerance: " << tol - << " Max. 
iterations: " << maxIter - << endl - << "Np= " << totalP - << " Fourier modes = " << nmPIF - << " Grid points = " << nrPIC - << endl; using bunch_type = ChargedParticlesPinT; using states_begin_type = StatesBeginSlice; @@ -498,7 +484,6 @@ int main(int argc, char *argv[]){ //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); - Pcoarse->time_m = tStartMySlice; IpplTimings::startTimer(particleCreation); @@ -506,8 +491,6 @@ int main(int argc, char *argv[]){ for (unsigned d = 0; d next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + //if(Ippl::Comm->rank() == 0) { + // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + // Kokkos::parallel_for(nloc, + // generate_random, Dim>( + // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + + // Kokkos::fence(); + // size_type bufSize = Pbegin->packedSize(nloc); + // std::vector requests(0); + // int sends = 0; + // for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); + // requests.resize(requests.size() + 1); + // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); + // buf->resetWritePos(); + // ++sends; + // } + // MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + //} + //else { + // size_type bufSize = Pbegin->packedSize(nloc); + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + // buf->resetReadPos(); + //} + //Ippl::Comm->barrier(); + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(Ippl::Comm->rank() == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); Kokkos::parallel_for(nloc, generate_random, Dim>( Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + Kokkos::fence(); - size_type bufSize = Pbegin->packedSize(nloc); - std::vector requests(0); - int sends = 0; - for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); - requests.resize(requests.size() + 1); - Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); - buf->resetWritePos(); - ++sends; - } - MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); } else { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); buf->resetReadPos(); } - Ippl::Comm->barrier(); + + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pbegin, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + //Ippl::Comm->barrier(); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); @@ -561,6 +579,23 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); #endif + msg << "Parareal " + << TestName + 
<< endl + << "Slice dT: " << dtSlice + << endl + << "No. of fine time steps: " << ntFine + << endl + << "No. of coarse time steps: " << ntCoarse + << endl + << "Tolerance: " << tol + << " No. of cycles: " << nCycles + << endl + << "Np= " << totalP + << " Fourier modes = " << nmPIF + << " Grid points = " << nrPIC + << endl; + Pcoarse->q = Pcoarse->Q_m/nloc; IpplTimings::stopTimer(particleCreation); @@ -568,54 +603,56 @@ int main(int argc, char *argv[]){ msg << "particles created and initial conditions assigned " << endl; //Copy initial conditions as they are needed later - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + + ////Get initial guess for ranks other than 0 by propagating the coarse solver + //IpplTimings::startTimer(coarsePropagator); + //if (Ippl::Comm->rank() > 0) { + // Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); + //} + // + //Ippl::Comm->barrier(); + // + //IpplTimings::stopTimer(coarsePropagator); + //msg << "First Leap frog PIC done " << endl; - //Get initial guess for ranks other than 0 by propagating the coarse solver - IpplTimings::startTimer(coarsePropagator); - if (Ippl::Comm->rank() > 0) { - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); - } - - Ippl::Comm->barrier(); - - IpplTimings::stopTimer(coarsePropagator); - - msg << "First Leap frog PIC done " << endl; + // + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); + ////Run the coarse integrator to get the values at the end of the time slice + //IpplTimings::startTimer(coarsePropagator); + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + //IpplTimings::stopTimer(coarsePropagator); + //msg << "Second Leap frog PIC done " << endl; - //Run the coarse integrator to get the values at the end of the time slice - IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - IpplTimings::stopTimer(coarsePropagator); - msg << "Second Leap frog PIC done " << endl; + ////The following might not be needed + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); + //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); + //IpplTimings::stopTimer(deepCopy); - //The following might not be needed - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); + //msg << "Starting parareal iterations ..." 
<< endl; + //bool isConverged = false; + //bool isPreviousDomainConverged; + //if(Ippl::Comm->rank() == 0) { + // isPreviousDomainConverged = true; + //} + //else { + // isPreviousDomainConverged = false; + //} - msg << "Starting parareal iterations ..." << endl; - bool isConverged = false; - bool isPreviousDomainConverged; - if(Ippl::Comm->rank() == 0) { - isPreviousDomainConverged = true; - } - else { - isPreviousDomainConverged = false; - } + bool isConverged, isPreviousDomainConverged; Pcoarse->shapetype_m = argv[13]; Pcoarse->shapedegree_m = std::atoi(argv[14]); @@ -625,113 +662,135 @@ int main(int argc, char *argv[]){ Pcoarse->initNUFFT(FLPIF); - //Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R0.getView()); - //Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P0.getView()); - //Pcoarse->LeapFrogPIF(Pcoarse->RprevIter, Pcoarse->PprevIter, (Ippl::Comm->rank()+1)*ntFine, - // dtFine, isConverged, tStartMySlice, 0); - //Ippl::Comm->barrier(); + for (unsigned int nc=0; nc < nCycles; nc++) { + double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); + Pcoarse->time_m = tStartMySlice; + Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, + isPreviousDomainConverged, ntCoarse, + dtCoarse, tStartMySlice); + unsigned int it = 0; + while (!isConverged) { + //Run fine integrator in parallel + IpplTimings::startTimer(finePropagator); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, nc+1, it+1); + IpplTimings::stopTimer(finePropagator); - //unsigned int maxIterRank; - for (unsigned int it=0; itLeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, it+1); - IpplTimings::stopTimer(finePropagator); - + //Difference = Fine - Coarse + Pend->R = Pbegin->R - Pcoarse->R; + Pend->P = Pbegin->P - Pcoarse->P; - //Difference = Fine - Coarse - Pend->R = Pbegin->R - Pcoarse->R; - Pend->P = Pbegin->P - Pcoarse->P; + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); + Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); + IpplTimings::stopTimer(deepCopy); + + IpplTimings::startTimer(timeCommunication); + tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + buf->resetReadPos(); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, + Ippl::getComm(), MPI_STATUS_IGNORE); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + } + IpplTimings::stopTimer(timeCommunication); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); - Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); - IpplTimings::stopTimer(deepCopy); - - IpplTimings::startTimer(timeCommunication); - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - int tagbool = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { - size_type bufSize = Pbegin->packedSize(nloc); - buffer_type buf 
= Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); - buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, - Ippl::getComm(), MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); + Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - } - IpplTimings::stopTimer(timeCommunication); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R0.getView()); - Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P0.getView()); - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - IpplTimings::stopTimer(deepCopy); - - IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - IpplTimings::stopTimer(coarsePropagator); - - Pend->R = Pend->R + Pcoarse->R; - Pend->P = Pend->P + Pcoarse->P; - - PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); - IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + + IpplTimings::startTimer(coarsePropagator); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + IpplTimings::stopTimer(coarsePropagator); + + Pend->R = Pend->R + Pcoarse->R; + Pend->P = Pend->P + Pcoarse->P; + + PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); + IpplTimings::startTimer(computeErrors); + double localRerror, localPerror; + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //double Rerror = computeRL2Error(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - //double Perror = computePL2Error(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //double EfieldError = 0; - //if(it > 0) { - // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - //} - IpplTimings::stopTimer(computeErrors); - - if((Rerror <= tol) && (Perror <= tol)) { - isConverged = true; - } - - - IpplTimings::startTimer(timeCommunication); - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pend->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); - MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); - buf->resetWritePos(); - MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); - } - IpplTimings::stopTimer(timeCommunication); + //double Rerror = computeRL2Error(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); + //double Perror = computePL2Error(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //double EfieldError = 0; + 
//if(it > 0) { + // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); + //} + IpplTimings::stopTimer(computeErrors); + + + if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) { + isConverged = true; + } + + IpplTimings::startTimer(timeCommunication); + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + } + IpplTimings::stopTimer(timeCommunication); + + + msg << "Finished iteration: " << it+1 + << " in cycle: " << nc+1 + << " Rerror: " << Rerror + << " Perror: " << Perror + << endl; + IpplTimings::startTimer(dumpData); + //Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1); + IpplTimings::stopTimer(dumpData); + + it += 1; + } - msg << "Finished iteration: " << it+1 - << " Rerror: " << Rerror - << " Perror: " << Perror - << endl; - - IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, it+1); - IpplTimings::stopTimer(dumpData); - - if(isConverged && isPreviousDomainConverged) { - //maxIterRank = it+1; - break; + Ippl::Comm->barrier(); + if((nCycles > 1) && (nc < (nCycles - 1))) { + IpplTimings::startTimer(timeCommunication); + tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + buf->resetReadPos(); + } + if(Ippl::Comm->rank() > 0) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + IpplTimings::stopTimer(timeCommunication); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); } - } - //std::cout << "Rank " << Ippl::Comm->rank() << " is out of the loop in iteration: " << maxIterRank << std::endl; - Ippl::Comm->barrier(); msg << TestName << " Parareal: End." << endl; IpplTimings::stopTimer(mainTimer); IpplTimings::print(); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 1f3f411c6..730f63d68 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -19,7 +19,7 @@ // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) // Example: -// srun ./PenningTrapPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 100 B-spline 1 --info 5 +// srun ./PenningTrapPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 4 B-spline 1 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. 
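The update the PinT drivers above assemble in stages — Pend = Pbegin (fine result) minus Pcoarse (old coarse result), followed by Pend += Pcoarse after rerunning the coarse solver from the corrected begin state — is the standard Parareal correction U_{n+1}^{k+1} = F(U_n^k) - G(U_n^k) + G(U_n^{k+1}); the multi-block change in this patch wraps that iteration in an outer loop over cycles and feeds each block's converged end state in as the next block's initial state. A minimal serial sketch of the correction on the scalar ODE du/dt = -u follows; the propagators, slice count, and tolerance are illustrative stand-ins, not the PIC/PIF solvers used in these drivers.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-ins for the fine and coarse propagators (forward Euler on du/dt = -u).
    double fineProp(double u, double dt, int steps) {
        for (int s = 0; s < steps; ++s) u += -u * dt;
        return u;
    }
    double coarseProp(double u, double dt) { return u - u * dt; }

    int main() {
        const int nSlices = 8;            // plays the role of the time ranks in one cycle
        const double dtSlice = 0.25;
        const double tol = 1e-12;

        std::vector<double> U(nSlices + 1, 1.0);   // states at slice boundaries, U[0] = initial condition
        for (int n = 0; n < nSlices; ++n)          // initial guess: coarse propagation only
            U[n + 1] = coarseProp(U[n], dtSlice);

        for (int it = 0; it < nSlices; ++it) {
            std::vector<double> Uold = U;          // previous-iteration states (the Pbegin/RprevIter role)
            double change = 0.0;
            for (int n = 0; n < nSlices; ++n) {
                double F    = fineProp(Uold[n], dtSlice / 100.0, 100); // fine solve (runs in parallel in the real code)
                double Gold = coarseProp(Uold[n], dtSlice);            // coarse solve from the old begin state
                double Gnew = coarseProp(U[n], dtSlice);               // coarse solve from the corrected begin state
                U[n + 1] = F - Gold + Gnew;                            // Parareal correction
                change = std::max(change, std::fabs(U[n + 1] - Uold[n + 1]));
            }
            std::printf("iteration %d  max change %.3e\n", it + 1, change);
            if (change < tol) break;
        }
        return 0;
    }

In exact arithmetic the sweep reproduces the serial fine solution after at most nSlices iterations; in practice the loop exits once the relative change drops below the tolerance, mirroring the Rerror/Perror checks in the drivers above.
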
diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index a244e7816..a020c4c79 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -200,8 +200,8 @@ int main(int argc, char *argv[]) { } const double& val = Qview(idx); - innerReduce += (Kokkos::Experimental::cos(arg) - - imag * Kokkos::Experimental::sin(arg)) * val; + innerReduce += (Kokkos::cos(arg) + - imag * Kokkos::sin(arg)) * val; }, Kokkos::Sum>(reducedValue)); if(teamMember.team_rank() == 0) { @@ -254,8 +254,8 @@ int main(int argc, char *argv[]) { arg += kVec[d]*Rview(idx)[d]; } - valL += (Kokkos::Experimental::cos(arg) - - imag * Kokkos::Experimental::sin(arg)) * Qview(idx); + valL += (Kokkos::cos(arg) + - imag * Kokkos::sin(arg)) * Qview(idx); }, Kokkos::Sum>(reducedValue)); double abs_error_real = std::fabs(reducedValue.real() - field_result(iInd, jInd, kInd).real()); diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp index d48abe9fd..8ffaf6827 100644 --- a/test/FFT/TestNUFFT2.cpp +++ b/test/FFT/TestNUFFT2.cpp @@ -204,8 +204,8 @@ int main(int argc, char *argv[]) { arg += (iVec[d] - (pt[d]/2)) * Rview(idx)[d]; } - valL += (Kokkos::Experimental::cos(arg) - + imag * Kokkos::Experimental::sin(arg)) * fview(i + nghost, j + nghost, k + nghost); + valL += (Kokkos::cos(arg) + + imag * Kokkos::sin(arg)) * fview(i + nghost, j + nghost, k + nghost); }, Kokkos::Sum>(reducedValue)); double abs_error_real = std::fabs(reducedValue.real() - Q_result(idx)); From 62dba0f14e654edb0a6d4b9194c382f9b334157c Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 17 Apr 2023 09:23:39 +0200 Subject: [PATCH 077/117] Added missing critical Kokkos::fence() after NUFFT as otherwise we have issues with Kokkos 4.0.00 --- alpine/PinT/ChargedParticlesPinT.hpp | 3 +-- src/FFT/FFT.hpp | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index f324d2e26..dedc3050d 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -555,7 +555,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { valL += myVal; }, Kokkos::Sum(temp)); - double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); potentialEnergy = 0.5 * temp * volume; @@ -1113,7 +1112,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); P2view(j)[2] += alpha * E2view(j)[2]; }); - + time_m += dt; IpplTimings::startTimer(dumpData); diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index b28196de7..b1b4bea19 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -838,6 +838,9 @@ namespace ippl { ParticleAttrib& Q, typename FFT::ComplexField_t& f) { + + //Inform m("FFT "); + auto fview = f.getView(); auto Rview = R.getView(); auto Qview = Q.getView(); @@ -915,6 +918,7 @@ namespace ippl { NULL, NULL, NULL, plan_m); ier_m = nufft_m.execute(tempQ.data(), tempField.data(), plan_m); + Kokkos::fence(); if(type_m == 1) { From 456f146a271e1571741f8c9ebe5030f2e09c082d Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 17 Apr 2023 09:25:03 +0200 Subject: [PATCH 078/117] Cleanup commented Inform --- src/FFT/FFT.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index b1b4bea19..0698e1ceb 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -839,7 +839,6 @@ namespace ippl { typename FFT::ComplexField_t& f) { - //Inform m("FFT "); auto fview = f.getView(); auto Rview = R.getView(); From 
f3e550fdac2ecebaec002d8978a4f64f7f9c4a4c Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 17 Apr 2023 12:19:33 +0200 Subject: [PATCH 079/117] Add ifdefs for NUFFT --- src/FFT/FFT.h | 10 +++++++++- src/FFT/FFT.hpp | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 2b41a9495..890dfb381 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -30,7 +30,9 @@ #include #include -#include +#ifdef ENABLE_NUFFT + #include +#endif #include #include #include @@ -69,11 +71,13 @@ namespace ippl { Tag classes for Cosine transforms */ class CosTransform {}; +#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA /** Tag classes for Non-uniform type of Fourier transforms */ class NUFFTransform {}; +#endif #endif enum FFTComm { @@ -123,6 +127,7 @@ namespace ippl { #endif #endif +#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA template struct CufinufftType {}; @@ -152,6 +157,7 @@ namespace ippl { using complexType = cuDoubleComplex; using plan_t = cufinufft_plan; }; +#endif #endif } @@ -338,6 +344,7 @@ namespace ippl { }; +#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA /** Non-uniform FFT class @@ -388,6 +395,7 @@ namespace ippl { } #endif +#endif #include "FFT/FFT.hpp" #endif // IPPL_FFT_FFT_H diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 0698e1ceb..a933089ef 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -750,6 +750,7 @@ namespace ippl { } +#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA //========================================================================= // FFT NUFFTransform Constructors @@ -954,6 +955,7 @@ namespace ippl { } #endif +#endif } // vi: set et ts=4 sw=4 sts=4: From 846e3da19dab37a85e0a4ab53adfcc48b255c841 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 18 Apr 2023 15:18:40 +0200 Subject: [PATCH 080/117] Revert "Add ifdefs for NUFFT" This reverts commit f3e550fdac2ecebaec002d8978a4f64f7f9c4a4c. 
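The "critical" Kokkos::fence() that PATCH 077 adds after nufft_m.execute matters because the transform's device work is not guaranteed to have completed when the call returns; anything downstream that is not itself ordered with that work (host logic, MPI, another library's kernels) can otherwise observe incomplete results. A self-contained sketch of the pattern follows; externalAsyncTransform is a hypothetical stand-in for such an asynchronous call, not the cuFINUFFT API.

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    // Stand-in for an external asynchronous device call (e.g. a NUFFT execute):
    // it only enqueues work on the device and returns immediately.
    void externalAsyncTransform(Kokkos::View<double*> out) {
        Kokkos::parallel_for("transform", out.extent(0), KOKKOS_LAMBDA(const int i) {
            out(i) = static_cast<double>(i) * 0.5;
        });
    }

    int main(int argc, char* argv[]) {
        Kokkos::initialize(argc, argv);
        {
            Kokkos::View<double*> rho("rho", 1 << 20);

            externalAsyncTransform(rho);

            // Block until the enqueued device work has finished before anything
            // downstream consumes the data; this is the synchronization point the
            // patch inserts after the NUFFT execution.
            Kokkos::fence();

            auto h = Kokkos::create_mirror_view(rho);
            Kokkos::deep_copy(h, rho);
            std::printf("rho(0) = %f\n", h(0));
        }
        Kokkos::finalize();
        return 0;
    }
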
--- src/FFT/FFT.h | 10 +--------- src/FFT/FFT.hpp | 2 -- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 890dfb381..2b41a9495 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -30,9 +30,7 @@ #include #include -#ifdef ENABLE_NUFFT - #include -#endif +#include #include #include #include @@ -71,13 +69,11 @@ namespace ippl { Tag classes for Cosine transforms */ class CosTransform {}; -#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA /** Tag classes for Non-uniform type of Fourier transforms */ class NUFFTransform {}; -#endif #endif enum FFTComm { @@ -127,7 +123,6 @@ namespace ippl { #endif #endif -#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA template struct CufinufftType {}; @@ -157,7 +152,6 @@ namespace ippl { using complexType = cuDoubleComplex; using plan_t = cufinufft_plan; }; -#endif #endif } @@ -344,7 +338,6 @@ namespace ippl { }; -#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA /** Non-uniform FFT class @@ -395,7 +388,6 @@ namespace ippl { } #endif -#endif #include "FFT/FFT.hpp" #endif // IPPL_FFT_FFT_H diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index a933089ef..0698e1ceb 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -750,7 +750,6 @@ namespace ippl { } -#ifdef ENABLE_NUFFT #ifdef KOKKOS_ENABLE_CUDA //========================================================================= // FFT NUFFTransform Constructors @@ -955,7 +954,6 @@ namespace ippl { } #endif -#endif } // vi: set et ts=4 sw=4 sts=4: From d3f288ae6d393cf29cc9d4ece7c9c7cc70ed4c6f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 25 Apr 2023 07:41:11 +0200 Subject: [PATCH 081/117] PenningTrap PIF with NUFFT made --- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index 1c81783b9..2f40467b4 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -261,6 +261,20 @@ int main(int argc, char *argv[]){ P->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); + ippl::ParameterList fftParams; + + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-4); + + fftParams.add("use_cufinufft_defaults", false); + + + P->fft = std::make_shared(FL, 1, fftParams); + + P->q.initializeNUFFT(FL, 1, fftParams); + P->E.initializeNUFFT(FL, 2, fftParams); P->scatter(); From b7d6c2991dea4027bafb73c5d950f1ef2aebcfed Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 27 Apr 2023 10:34:06 +0200 Subject: [PATCH 082/117] tolerance changed in PenningTrap PIF and NUFFT init moved to a separate function --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 19 ++++++++++++++----- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 17 +---------------- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 15 +-------------- alpine/PinT/PenningTrapPinT.cpp | 5 ++++- 4 files changed, 20 insertions(+), 36 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index e2688105f..51c308f8f 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -42,7 +42,6 @@ typedef Field Field_t; typedef Field, Dim> CxField_t; typedef Field VField_t; -typedef ippl::FFT FFT_type; const double pi = std::acos(-1.0); @@ -76,7 +75,6 @@ class ChargedParticlesPIF : public 
ippl::ParticleBase { int shapedegree_m; - std::shared_ptr fft; public: ParticleAttrib q; // charge @@ -126,6 +124,20 @@ class ChargedParticlesPIF : public ippl::ParticleBase { setBCAllPeriodic(); } + void initNUFFT(FieldLayout_t& FL) { + ippl::ParameterList fftParams; + + fftParams.add("gpu_method", 1); + fftParams.add("gpu_sort", 1); + fftParams.add("gpu_kerevalmeth", 1); + fftParams.add("tolerance", 1e-6); + + fftParams.add("use_cufinufft_defaults", false); + + q.initializeNUFFT(FL, 1, fftParams); + E.initializeNUFFT(FL, 2, fftParams); + } + void gather() { gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); @@ -142,15 +154,12 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Inform m("scatter "); rho_m = {0.0, 0.0}; scatterPIFNUFFT(q, rho_m, Sk_m, this->R); - //fft->transform(this->R, q, rho_m); //rhoDFT_m = {0.0, 0.0}; //scatterPIFNUDFT(q, rho_m, Sk_m, this->R); //dumpFieldData(); rho_m = rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); - //rhoDFT_m = rhoDFT_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); - } diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index d0e9d3b92..0ed48fc0e 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -270,22 +270,7 @@ int main(int argc, char *argv[]){ P->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - - ippl::ParameterList fftParams; - - fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 1); - fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-4); - - fftParams.add("use_cufinufft_defaults", false); - - - P->fft = std::make_shared(FL, 1, fftParams); - - P->q.initializeNUFFT(FL, 1, fftParams); - P->E.initializeNUFFT(FL, 2, fftParams); - + P->initNUFFT(FL); P->scatter(); diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index 2f40467b4..dfe082298 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -261,20 +261,7 @@ int main(int argc, char *argv[]){ P->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - ippl::ParameterList fftParams; - - fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 1); - fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-4); - - fftParams.add("use_cufinufft_defaults", false); - - - P->fft = std::make_shared(FL, 1, fftParams); - - P->q.initializeNUFFT(FL, 1, fftParams); - P->E.initializeNUFFT(FL, 2, fftParams); + P->initNUFFT(FL); P->scatter(); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 730f63d68..5285d992a 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -402,6 +402,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); + static IpplTimings::TimerRef initializeCycles = IpplTimings::getTimer("initializeCycles"); IpplTimings::startTimer(mainTimer); @@ -585,7 +586,7 @@ int main(int argc, char *argv[]){ minU, maxU)); Kokkos::fence(); - Ippl::Comm->barrier(); + //Ippl::Comm->barrier(); #endif @@ -676,9 +677,11 @@ int main(int argc, char *argv[]){ for (unsigned int nc=0; nc < nCycles; 
nc++) { double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); Pcoarse->time_m = tStartMySlice; + IpplTimings::startTimer(initializeCycles); Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, isPreviousDomainConverged, ntCoarse, dtCoarse, tStartMySlice, Bext); + IpplTimings::stopTimer(initializeCycles); unsigned int it = 0; while (!isConverged) { //while ((!isPreviousDomainConverged) || (!isConverged)) { From ed6d72185faf5575a18c9da406a736af54b6d4a0 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 30 May 2023 08:17:21 +0200 Subject: [PATCH 083/117] Initial space-time distributed parallel code made. Need to compile and test. --- alpine/PinT/ChargedParticlesPinT.hpp | 36 ++++----- alpine/PinT/PenningTrapPinT.cpp | 110 +++++++++++++++++++-------- src/Communicate/Communicate.cpp | 8 +- src/Communicate/Communicate.h | 13 ++-- src/Particle/ParticleAttrib.hpp | 62 +++++++++------ 5 files changed, 150 insertions(+), 79 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index dedc3050d..a86b385d5 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -630,27 +630,29 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } } - void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter) { + void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter, int rankTime) { - std::stringstream fname; - fname << "data/localError_rank_"; - fname << Ippl::Comm->rank(); - fname << "_nc_"; - fname << nc; - fname << ".csv"; - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); + if(Ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/localError_rank_"; + fname << rankTime; + fname << "_nc_"; + fname << nc; + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, rankTime); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if(iter == 1) { + csvout << "Iter, Rerror, Perror" << endl; + } - if(iter == 1) { - csvout << "Iter, Rerror, Perror" << endl; + csvout << iter << " " + << Rerror << " " + << Perror << endl; } - csvout << iter << " " - << Rerror << " " - << Perror << endl; - } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 5285d992a..09a09d888 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -147,8 +147,7 @@ double CDF(const double& x, const double& mu, const double& sigma) { } double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - Vector_t& length) { + Vector_t& length) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -174,16 +173,19 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - lError = std::sqrt(localError)/std::sqrt(localNorm); + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //lError = std::sqrt(localError)/std::sqrt(localNorm); - double relError = lError;//absError / std::sqrt(globaltemp); + double relError = std::sqrt(globalError) / std::sqrt(globalNorm); return relError; } 
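The space-time split this patch introduces (visible in the main() changes below) carves MPI_COMM_WORLD into a spatial communicator, over which field reductions and the error norms above are summed, and a temporal communicator, over which the Parareal neighbour exchanges travel. A minimal, self-contained sketch of that splitting follows; here spaceProcs is read from argv[1] with a default of 2, whereas the patch takes it from argv[13].

    #include <mpi.h>
    #include <cstdio>
    #include <cstdlib>

    int main(int argc, char* argv[]) {
        MPI_Init(&argc, &argv);

        int rankWorld, sizeWorld;
        MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld);
        MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld);

        // Ranks per time slice (spatial parallelism).
        const int spaceProcs = (argc > 1) ? std::atoi(argv[1]) : 2;

        // Same spaceColor -> same time slice (one spatial group);
        // same timeColor  -> same spatial subdomain across successive slices.
        const int spaceColor = rankWorld / spaceProcs;
        const int timeColor  = rankWorld % spaceProcs;

        MPI_Comm spaceComm, timeComm;
        MPI_Comm_split(MPI_COMM_WORLD, spaceColor, rankWorld, &spaceComm);
        MPI_Comm_split(MPI_COMM_WORLD, timeColor,  rankWorld, &timeComm);

        int rankSpace, sizeSpace, rankTime, sizeTime;
        MPI_Comm_rank(spaceComm, &rankSpace);
        MPI_Comm_size(spaceComm, &sizeSpace);
        MPI_Comm_rank(timeComm,  &rankTime);
        MPI_Comm_size(timeComm,  &sizeTime);

        // The two roles: error norms are reduced over spaceComm, while slice-to-slice
        // states are exchanged with rankTime-1 / rankTime+1 over timeComm.
        double localErrSq = 1.0, globalErrSq = 0.0;
        MPI_Allreduce(&localErrSq, &globalErrSq, 1, MPI_DOUBLE, MPI_SUM, spaceComm);

        std::printf("world %d/%d -> space %d/%d, time %d/%d, err %.1f\n",
                    rankWorld, sizeWorld, rankSpace, sizeSpace, rankTime, sizeTime, globalErrSq);

        MPI_Comm_free(&spaceComm);
        MPI_Comm_free(&timeComm);
        MPI_Finalize();
        return 0;
    }

With this layout, ranks sharing a spaceColor advance the same time slice together, while ranks sharing a timeColor hold the same particle and field partition across successive slices, which is what the rankTime-1 / rankTime+1 sends and the spaceComm barriers in the driver changes below rely on.
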
-double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -200,9 +202,13 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - lError = std::sqrt(localError)/std::sqrt(localNorm); + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //lError = std::sqrt(localError)/std::sqrt(localNorm); - double relError = lError;//absError / std::sqrt(globaltemp); + double relError = std::sqrt(globalError) / std::sqrt(globalNorm); return relError; @@ -376,7 +382,31 @@ double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { const char* TestName = "PenningTrapPinT"; int main(int argc, char *argv[]){ - Ippl ippl(argc, argv); + + int rankWorld, sizeWorld; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); + MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld); + + int spaceColor, timeColor; + MPI_Comm spaceComm, timeComm; + + int spaceProcs = std::atoi(argv[13]); + int timeProcs = std::atoi(argv[14]); + spaceColor = rankWorld / spaceProcs; + timeColor = rankWorld % spaceProcs; + + MPI_Comm_split(MPI_COMM_WORLD, spaceColor, rankWorld, &spaceComm); + MPI_Comm_split(MPI_COMM_WORLD, timeColor, rankWorld, &timeComm); + + int rankSpace, sizeSpace, rankTime, sizeTime; + MPI_Comm_rank(spaceComm, &rankSpace); + MPI_Comm_size(spaceComm, &sizeSpace); + + MPI_Comm_rank(timeComm, &rankTime); + MPI_Comm_size(timeComm, &sizeTime); + + Ippl ippl(argc, argv, spaceComm); Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); @@ -410,7 +440,7 @@ int main(int argc, char *argv[]){ const double tEnd = std::atof(argv[8]); const unsigned int nCycles = std::atoi(argv[12]); double tEndCycle = tEnd / nCycles; - const double dtSlice = tEndCycle / Ippl::Comm->size(); + const double dtSlice = tEndCycle / sizeTime; const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = std::ceil(dtSlice / dtFine); @@ -474,8 +504,18 @@ int main(int argc, char *argv[]){ FieldLayout_t FLPIF(domainPIF, decomp, isAllPeriodic); PLayout_t PL(FLPIC, meshPIC); - size_type nloc = totalP; + size_type nloc = (size_type)(totalP / sizeSpace); + + size_type Total_particles = 0; + + MPI_Allreduce(&nloc, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); + int rest = (int) (totalP - Total_particles); + + if ( Ippl::Comm->rank() < rest ) { + ++nloc; + } double Q = -1562.5; double Bext = 5.0; @@ -546,8 +586,9 @@ int main(int argc, char *argv[]){ //condition is not the same on different GPUs tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() == 0) { - Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + //if(Ippl::Comm->rank() == 0) { + if(rankTime == 0) { + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); Kokkos::parallel_for(nloc, generate_random, Dim>( Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, @@ -559,16 +600,16 @@ int main(int argc, char *argv[]){ else { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = 
Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + if(rankTime < sizeTime-1) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pbegin, *buf, request, nloc); + Ippl::Comm->isend(rankTime+1, tag, *Pbegin, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } @@ -675,7 +716,7 @@ int main(int argc, char *argv[]){ for (unsigned int nc=0; nc < nCycles; nc++) { - double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); + double tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); Pcoarse->time_m = tStartMySlice; IpplTimings::startTimer(initializeCycles); Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, @@ -710,13 +751,13 @@ int main(int argc, char *argv[]){ tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { + if((rankTime > 0) && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, - Ippl::getComm(), MPI_STATUS_IGNORE); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-1, tagbool, + timeComm, MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); @@ -742,9 +783,9 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + //double localRerror, localPerror; + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter); IpplTimings::stopTimer(computeErrors); @@ -754,14 +795,14 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(timeCommunication); - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + if(rankTime < (sizeTime-1)) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+1, tagbool, timeComm); } IpplTimings::stopTimer(timeCommunication); @@ -774,10 +815,12 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - 
Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1); + Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1, rankTime); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); + MPI_Barrier(spaceComm); + it += 1; //if(isConverged && isPreviousDomainConverged) { // break; @@ -785,24 +828,25 @@ int main(int argc, char *argv[]){ } //std::cout << "Before barrier in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; - Ippl::Comm->barrier(); + //Ippl::Comm->barrier(); + MPI_Barrier(MPI_COMM_WORLD); //msg << "Communication started in cycle: " << nc+1 << endl; //std::cout << "Communication started in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; if((nCycles > 1) && (nc < (nCycles - 1))) { IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + if(rankTime < (sizeTime-1)) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime+1, tag, *Pend, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - if(Ippl::Comm->rank() > 0) { + if(rankTime > 0) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(rankTime-1, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } diff --git a/src/Communicate/Communicate.cpp b/src/Communicate/Communicate.cpp index 78bf9bb82..63314a1ef 100644 --- a/src/Communicate/Communicate.cpp +++ b/src/Communicate/Communicate.cpp @@ -25,7 +25,13 @@ namespace ippl { Communicate::Communicate(int& argc, char**& argv, const MPI_Comm& comm) : comm_m(comm) { - MPI_Init(&argc, &argv); + int isInitialized; + MPI_Initialized(&isInitialized); + + if (!isInitialized) { + MPI_Init(&argc, &argv); + } + MPI_Comm_rank(comm_m, &rank_m); MPI_Comm_size(comm_m, &size_m); } diff --git a/src/Communicate/Communicate.h b/src/Communicate/Communicate.h index 7024982e2..841868a05 100644 --- a/src/Communicate/Communicate.h +++ b/src/Communicate/Communicate.h @@ -123,14 +123,14 @@ namespace ippl { */ template void recv(int src, int tag, Buffer& buffer, archive_type& ar, - size_type msize, size_type nrecvs); + size_type msize, size_type nrecvs, const MPI_Comm& comm = comm_m); /*! * \warning Only works with default spaces! */ template void isend(int dest, int tag, Buffer& buffer, archive_type&, - MPI_Request&, size_type nsends); + MPI_Request&, size_type nsends, const MPI_Comm& comm = comm_m); /*! * \warning Only works with default spaces! @@ -158,7 +158,7 @@ namespace ippl { template void Communicate::recv(int src, int tag, Buffer& buffer, archive_type& ar, - size_type msize, size_type nrecvs) + size_type msize, size_type nrecvs, const MPI_Comm& comm) { // Temporary fix. 
MPI communication seems to have problems when the // count argument exceeds the range of int, so large messages should @@ -169,14 +169,15 @@ namespace ippl { } MPI_Status status; MPI_Recv(ar.getBuffer(), msize, - MPI_BYTE, src, tag, comm_m, &status); + MPI_BYTE, src, tag, comm, &status); buffer.deserialize(ar, nrecvs); } template void Communicate::isend(int dest, int tag, Buffer& buffer, - archive_type& ar, MPI_Request& request, size_type nsends) + archive_type& ar, MPI_Request& request, size_type nsends, + const MPI_Comm& comm) { if (ar.getSize() > INT_MAX) { std::cerr << "Message size exceeds range of int" << std::endl; @@ -184,7 +185,7 @@ namespace ippl { } buffer.serialize(ar, nsends); MPI_Isend(ar.getBuffer(), ar.getSize(), - MPI_BYTE, dest, tag, comm_m, &request); + MPI_BYTE, dest, tag, comm, &request); } } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 39ba19e3b..673516ffc 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -160,6 +160,15 @@ namespace ippl { const NDIndex& lDom = layout.getLocalNDIndex(); const int nghost = f.getNghost(); + + Field tempField; + + tempField.initialize(mesh, layout); + + tempField = 0.0; + + view_type viewLocal = tempField.getView(); + Kokkos::parallel_for( "ParticleAttrib::scatter", *(this->localNum_mp), @@ -183,14 +192,14 @@ namespace ippl { // scatter const value_type& val = dview_m(idx); - Kokkos::atomic_add(&view(i-1, j-1, k-1), wlo[0] * wlo[1] * wlo[2] * val); - Kokkos::atomic_add(&view(i-1, j-1, k ), wlo[0] * wlo[1] * whi[2] * val); - Kokkos::atomic_add(&view(i-1, j, k-1), wlo[0] * whi[1] * wlo[2] * val); - Kokkos::atomic_add(&view(i-1, j, k ), wlo[0] * whi[1] * whi[2] * val); - Kokkos::atomic_add(&view(i, j-1, k-1), whi[0] * wlo[1] * wlo[2] * val); - Kokkos::atomic_add(&view(i, j-1, k ), whi[0] * wlo[1] * whi[2] * val); - Kokkos::atomic_add(&view(i, j, k-1), whi[0] * whi[1] * wlo[2] * val); - Kokkos::atomic_add(&view(i, j, k ), whi[0] * whi[1] * whi[2] * val); + Kokkos::atomic_add(&viewLocal(i-1, j-1, k-1), wlo[0] * wlo[1] * wlo[2] * val); + Kokkos::atomic_add(&viewLocal(i-1, j-1, k ), wlo[0] * wlo[1] * whi[2] * val); + Kokkos::atomic_add(&viewLocal(i-1, j, k-1), wlo[0] * whi[1] * wlo[2] * val); + Kokkos::atomic_add(&viewLocal(i-1, j, k ), wlo[0] * whi[1] * whi[2] * val); + Kokkos::atomic_add(&viewLocal(i, j-1, k-1), whi[0] * wlo[1] * wlo[2] * val); + Kokkos::atomic_add(&viewLocal(i, j-1, k ), whi[0] * wlo[1] * whi[2] * val); + Kokkos::atomic_add(&viewLocal(i, j, k-1), whi[0] * whi[1] * wlo[2] * val); + Kokkos::atomic_add(&viewLocal(i, j, k ), whi[0] * whi[1] * whi[2] * val); } ); IpplTimings::stopTimer(scatterPICTimer); @@ -198,7 +207,14 @@ namespace ippl { //static IpplTimings::TimerRef accumulateHaloTimer = IpplTimings::getTimer("AccumulateHalo"); //IpplTimings::startTimer(accumulateHaloTimer); f.accumulateHalo(); - //IpplTimings::stopTimer(accumulateHaloTimer); + //IpplTimings::stopTimer(accumulateHaloTimer); + + static IpplTimings::TimerRef scatterAllReducePICTimer = IpplTimings::getTimer("scatterAllReducePIC"); + IpplTimings::startTimer(scatterAllReducePICTimer); + int viewSize = view.extent(0) * view.extent(1) * view.extent(2); + MPI_Allreduce(viewLocal.data(), view.data(), viewSize, + MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + IpplTimings::stopTimer(scatterAllReducePICTimer); } @@ -497,31 +513,33 @@ namespace ippl { auto q = *this; - //Field tempField; + Field tempField; + + FieldLayout& layout = f.getLayout(); + M& mesh = f.get_mesh(); - //FieldLayout& layout = 
f.getLayout(); - //M& mesh = f.get_mesh(); + tempField.initialize(mesh, layout); - //tempField.initialize(mesh, layout); + tempField = 0.0; - //fftType_mp->transform(pp, q, tempField); - fftType_mp->transform(pp, q, f); + fftType_mp->transform(pp, q, tempField); + //fftType_mp->transform(pp, q, f); using view_type = typename Field::view_type; view_type fview = f.getView(); - //view_type viewLocal = tempField.getView(); + view_type viewLocal = tempField.getView(); typename Field::view_type Skview = Sk.getView(); const int nghost = f.getNghost(); IpplTimings::stopTimer(scatterPIFNUFFTTimer); - //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); - //IpplTimings::startTimer(scatterAllReduceTimer); - //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); - //MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, - // MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); - //IpplTimings::stopTimer(scatterAllReduceTimer); + static IpplTimings::TimerRef scatterAllReducePIFTimer = IpplTimings::getTimer("scatterAllReducePIF"); + IpplTimings::startTimer(scatterAllReducePIFTimer); + int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); + MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, + MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); + IpplTimings::stopTimer(scatterAllReducePIFTimer); //IpplTimings::startTimer(scatterPIFNUFFTTimer); From b708c671b4982de1c70f3bbc0da213a5008a0b0b Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 30 May 2023 08:21:04 +0200 Subject: [PATCH 084/117] MPI_Comm_free added for space and time communicators --- alpine/PinT/PenningTrapPinT.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 09a09d888..9b07e1106 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -866,5 +866,8 @@ int main(int argc, char *argv[]){ IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); + MPI_Comm_free(spaceComm); + MPI_Comm_free(timeComm); + return 0; } From dfe3c75c842c1ecabe3e0339cc04b0651b61e262 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 30 May 2023 08:54:05 +0200 Subject: [PATCH 085/117] Compilation errors removed. 
Need to run and test --- alpine/PinT/PenningTrapPinT.cpp | 6 +++--- src/Communicate/Communicate.h | 4 ++-- src/Particle/ParticleAttrib.hpp | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 9b07e1106..c503d9445 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -815,7 +815,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1, rankTime); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); @@ -866,8 +866,8 @@ int main(int argc, char *argv[]){ IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); - MPI_Comm_free(spaceComm); - MPI_Comm_free(timeComm); + MPI_Comm_free(&spaceComm); + MPI_Comm_free(&timeComm); return 0; } diff --git a/src/Communicate/Communicate.h b/src/Communicate/Communicate.h index 841868a05..423bd08b1 100644 --- a/src/Communicate/Communicate.h +++ b/src/Communicate/Communicate.h @@ -123,14 +123,14 @@ namespace ippl { */ template void recv(int src, int tag, Buffer& buffer, archive_type& ar, - size_type msize, size_type nrecvs, const MPI_Comm& comm = comm_m); + size_type msize, size_type nrecvs, const MPI_Comm& comm = MPI_COMM_WORLD); /*! * \warning Only works with default spaces! */ template void isend(int dest, int tag, Buffer& buffer, archive_type&, - MPI_Request&, size_type nsends, const MPI_Comm& comm = comm_m); + MPI_Request&, size_type nsends, const MPI_Comm& comm = MPI_COMM_WORLD); /*! * \warning Only works with default spaces! diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 673516ffc..458fba563 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -147,7 +147,7 @@ namespace ippl { typename Field::view_type view = f.getView(); - const M& mesh = f.get_mesh(); + M& mesh = f.get_mesh(); using vector_type = typename M::vector_type; using value_type = typename ParticleAttrib::value_type; @@ -156,18 +156,18 @@ namespace ippl { const vector_type& origin = mesh.getOrigin(); const vector_type invdx = 1.0 / dx; - const FieldLayout& layout = f.getLayout(); + FieldLayout& layout = f.getLayout(); const NDIndex& lDom = layout.getLocalNDIndex(); const int nghost = f.getNghost(); - Field tempField; + Field tempField; tempField.initialize(mesh, layout); tempField = 0.0; - view_type viewLocal = tempField.getView(); + typename Field::view_type viewLocal = tempField.getView(); Kokkos::parallel_for( "ParticleAttrib::scatter", From 384256a8cb2405df59efb2e2fc576910225fabba Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 30 May 2023 16:43:37 +0200 Subject: [PATCH 086/117] space-time parallelism seems to work. Need to run more tests and confirm. 
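This patch wires the space/time decomposition through the solver: every rank belongs to a spaceComm (all ranks sharing one parareal time slice, used for the charge scatter, field solve and error reductions) and a timeComm (all ranks owning the same spatial subdomain in successive slices, used to pass particle states along the parareal pipeline). The standalone sketch below only illustrates how the two MPI_Comm_split calls partition the world communicator; it is not part of the patch, it reads spaceProcs/timeProcs from argv[1]/argv[2] rather than argv[15]/argv[16], and the size check is an extra assumption added for the example.

    // space_time_split_sketch.cpp -- illustrative only; mirrors the MPI_Comm_split
    // logic of PenningTrapPinT.cpp in a self-contained program.
    #include <mpi.h>
    #include <cstdio>
    #include <cstdlib>

    int main(int argc, char* argv[]) {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        const int spaceProcs = std::atoi(argv[1]); // ranks per time slice
        const int timeProcs  = std::atoi(argv[2]); // number of time slices
        if (spaceProcs * timeProcs != size) {      // sanity check (added in this sketch only)
            if (rank == 0) std::fprintf(stderr, "need spaceProcs*timeProcs == nprocs\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }

        // Consecutive blocks of spaceProcs ranks form one time slice ...
        const int spaceColor = rank / spaceProcs;
        // ... and ranks with the same remainder own the same subdomain in every slice.
        const int timeColor = rank % spaceProcs;

        MPI_Comm spaceComm, timeComm;
        MPI_Comm_split(MPI_COMM_WORLD, spaceColor, rank, &spaceComm);
        MPI_Comm_split(MPI_COMM_WORLD, timeColor, rank, &timeComm);

        int rankSpace, rankTime;
        MPI_Comm_rank(spaceComm, &rankSpace); // position inside the slice
        MPI_Comm_rank(timeComm, &rankTime);   // index of the slice along the pipeline

        std::printf("world %d -> (space %d, time %d)\n", rank, rankSpace, rankTime);

        MPI_Comm_free(&spaceComm); // note the address-of operator, cf. PATCH 085
        MPI_Comm_free(&timeComm);
        MPI_Finalize();
        return 0;
    }

Run, for example, with mpirun -np 8 ./sketch 2 4 to obtain a 2 (space) x 4 (time) process grid; the mapping printed there is the same one the reductions over spaceComm and the point-to-point transfers over timeComm rely on in this patch.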
--- alpine/PinT/ChargedParticlesPinT.hpp | 86 +++++++++++++++------------- alpine/PinT/PenningTrapPinT.cpp | 67 +++++++++++++--------- src/Particle/ParticleAttrib.h | 6 +- src/Particle/ParticleAttrib.hpp | 22 ++++--- 4 files changed, 103 insertions(+), 78 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index a86b385d5..d552442a1 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -182,15 +182,17 @@ class ChargedParticlesPinT : public ippl::ParticleBase { const unsigned int& ntCoarse, const double& dtCoarse, const double& tStartMySlice, - const double& Bext) { + const double& Bext, + const int& rankTime, + MPI_Comm& spaceComm) { //Copy initial conditions as they are needed later Kokkos::deep_copy(R0.getView(), this->R.getView()); Kokkos::deep_copy(P0.getView(), P.getView()); //Get initial guess for ranks other than 0 by propagating the coarse solver - if (Ippl::Comm->rank() > 0) { - BorisPIC(this->R, P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); + if (rankTime > 0) { + BorisPIC(this->R, P, rankTime*ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); } //Ippl::Comm->barrier(); @@ -200,10 +202,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //Run the coarse integrator to get the values at the end of the time slice - BorisPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice, Bext); + BorisPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); isConverged = false; - if(Ippl::Comm->rank() == 0) { + if(rankTime == 0) { isPreviousDomainConverged = true; } else { @@ -487,7 +489,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void dumpEnergy(size_type /*totalP*/, const unsigned int& nc, - const unsigned int& iter, ParticleAttrib& Ptemp) { + const unsigned int& iter, ParticleAttrib& Ptemp, + int rankTime, int rankSpace, const MPI_Comm& spaceComm = MPI_COMM_WORLD) { double potentialEnergy, kineticEnergy; @@ -572,32 +575,34 @@ class ChargedParticlesPinT : public ippl::ParticleBase { }, Kokkos::Sum(temp)); temp *= 0.5; - //globaltemp = 0.0; - double globaltemp = temp; - //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); + double globaltemp = 0.0; + //double globaltemp = temp; + MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, spaceComm); kineticEnergy = globaltemp; - std::stringstream fname; - fname << "data/Energy_rank_"; - fname << Ippl::Comm->rank(); - fname << "_nc_"; - fname << nc; - fname << "_iter_"; - fname << iter; - fname << ".csv"; + if(rankSpace == 0) { + std::stringstream fname; + fname << "data/Energy_rank_"; + fname << rankTime; + fname << "_nc_"; + fname << nc; + fname << "_iter_"; + fname << iter; + fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); - //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; - csvout << time_m << " " - << potentialEnergy << " " - << kineticEnergy << " " - << potentialEnergy + kineticEnergy << endl; + csvout << time_m << " " + << potentialEnergy << " " + << kineticEnergy << " " + << potentialEnergy + kineticEnergy << endl; + } } @@ -630,9 +635,10 @@ class ChargedParticlesPinT : public 
ippl::ParticleBase { } } - void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter, int rankTime) { + void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter, int rankTime, int rankSpace) { - if(Ippl::Comm->rank() == 0) { + //if(Ippl::Comm->rank() == 0) { + if(rankSpace == 0) { std::stringstream fname; fname << "data/localError_rank_"; fname << rankTime; @@ -640,7 +646,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { fname << nc; fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, rankTime); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); @@ -840,16 +846,15 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - void BorisPIC(ParticleAttrib& Rtemp, - ParticleAttrib& Ptemp, const unsigned int nt, - const double dt, const double& tStartMySlice, const double& Bext) { + void BorisPIC(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, + const double dt, const double& tStartMySlice, const double& Bext, MPI_Comm& spaceComm) { static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIC_m = 0.0; - scatter(q, rhoPIC_m, Rtemp); + scatter(q, rhoPIC_m, Rtemp, spaceComm); rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); @@ -906,7 +911,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIC_m = 0.0; - scatter(q, rhoPIC_m, Rtemp); + scatter(q, rhoPIC_m, Rtemp, spaceComm); rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); @@ -1019,16 +1024,17 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const bool& /*isConverged*/, - const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, const double& Bext) { + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, const double& Bext, + int rankTime, int rankSpace, + MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -1041,7 +1047,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -1087,7 +1093,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -1118,7 +1124,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; 
IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index c503d9445..73802f9a0 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -147,7 +147,7 @@ double CDF(const double& x, const double& mu, const double& sigma) { } double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - Vector_t& length) { + Vector_t& length, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -174,9 +174,11 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp Kokkos::fence(); double globalError = 0.0; - MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); double globalNorm = 0.0; - MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); //lError = std::sqrt(localError)/std::sqrt(localNorm); double relError = std::sqrt(globalError) / std::sqrt(globalNorm); @@ -185,7 +187,7 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp } -double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter) { +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -203,9 +205,11 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp Kokkos::fence(); double globalError = 0.0; - MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); double globalNorm = 0.0; - MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + //MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); //lError = std::sqrt(localError)/std::sqrt(localNorm); double relError = std::sqrt(globalError) / std::sqrt(globalNorm); @@ -383,21 +387,27 @@ const char* TestName = "PenningTrapPinT"; int main(int argc, char *argv[]){ - int rankWorld, sizeWorld; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); - MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld); + Ippl ippl(argc, argv); + + //int rankWorld, sizeWorld; + //MPI_Init(&argc, &argv); + //MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); + //MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld); int spaceColor, timeColor; MPI_Comm spaceComm, timeComm; - int spaceProcs = std::atoi(argv[13]); - int timeProcs = std::atoi(argv[14]); - spaceColor = rankWorld / spaceProcs; - timeColor = rankWorld % spaceProcs; + int spaceProcs = std::atoi(argv[15]); + int timeProcs = std::atoi(argv[16]); + //spaceColor = rankWorld / spaceProcs; + //timeColor = rankWorld % spaceProcs; + spaceColor = Ippl::Comm->rank() / spaceProcs; + timeColor = Ippl::Comm->rank() % spaceProcs; - MPI_Comm_split(MPI_COMM_WORLD, spaceColor, rankWorld, &spaceComm); - MPI_Comm_split(MPI_COMM_WORLD, timeColor, rankWorld, 
&timeComm); + //MPI_Comm_split(MPI_COMM_WORLD, spaceColor, rankWorld, &spaceComm); + //MPI_Comm_split(MPI_COMM_WORLD, timeColor, rankWorld, &timeComm); + MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm); + MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm); int rankSpace, sizeSpace, rankTime, sizeTime; MPI_Comm_rank(spaceComm, &rankSpace); @@ -406,8 +416,9 @@ int main(int argc, char *argv[]){ MPI_Comm_rank(timeComm, &rankTime); MPI_Comm_size(timeComm, &sizeTime); - Ippl ippl(argc, argv, spaceComm); + //Ippl ippl(argc, argv, spaceComm); + //Inform msg(TestName, sizeSpace-1); Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); @@ -508,8 +519,10 @@ int main(int argc, char *argv[]){ size_type Total_particles = 0; + //MPI_Allreduce(&nloc, &Total_particles, 1, + // MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); MPI_Allreduce(&nloc, &Total_particles, 1, - MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); + MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); int rest = (int) (totalP - Total_particles); @@ -519,7 +532,7 @@ int main(int argc, char *argv[]){ double Q = -1562.5; double Bext = 5.0; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,nloc); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,totalP); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -643,12 +656,12 @@ int main(int argc, char *argv[]){ //<< " Max. iterations: " << maxIter << " No. of cycles: " << nCycles << endl - << "Np= " << nloc + << "Np= " << totalP << " Fourier modes = " << nmPIF << " Grid points = " << nrPIC << endl; - Pcoarse->q = Pcoarse->Q_m/nloc; + Pcoarse->q = Pcoarse->Q_m/totalP; IpplTimings::stopTimer(particleCreation); msg << "particles created and initial conditions assigned " << endl; @@ -721,7 +734,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeCycles); Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, isPreviousDomainConverged, ntCoarse, - dtCoarse, tStartMySlice, Bext); + dtCoarse, tStartMySlice, Bext, rankTime, spaceComm); IpplTimings::stopTimer(initializeCycles); unsigned int it = 0; while (!isConverged) { @@ -730,7 +743,7 @@ int main(int argc, char *argv[]){ //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, nc+1, it+1, Bext); + Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, Bext, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -773,7 +786,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -784,8 +797,8 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); //double localRerror, localPerror; - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter); + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); IpplTimings::stopTimer(computeErrors); @@ -815,7 +828,7 @@ int main(int argc, char *argv[]){ 
IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index a50bb9007..aeb0df5f0 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -157,7 +157,8 @@ namespace ippl { template void scatter(Field& f, - const ParticleAttrib, Properties... >& pp) const; + const ParticleAttrib, Properties... >& pp, + const MPI_Comm& spaceComm) const; template void @@ -181,7 +182,8 @@ namespace ippl { template void scatterPIFNUFFT(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + const ParticleAttrib, Properties... >& pp, + const MPI_Comm& spaceComm) const; template void diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 458fba563..6a4047742 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -139,7 +139,8 @@ namespace ippl { template template void ParticleAttrib::scatter(Field& f, - const ParticleAttrib< Vector, Properties... >& pp) + const ParticleAttrib< Vector, Properties... >& pp, + const MPI_Comm& spaceComm) const { static IpplTimings::TimerRef scatterPICTimer = IpplTimings::getTimer("ScatterPIC"); @@ -206,14 +207,14 @@ namespace ippl { //static IpplTimings::TimerRef accumulateHaloTimer = IpplTimings::getTimer("AccumulateHalo"); //IpplTimings::startTimer(accumulateHaloTimer); - f.accumulateHalo(); + tempField.accumulateHalo(); //IpplTimings::stopTimer(accumulateHaloTimer); static IpplTimings::TimerRef scatterAllReducePICTimer = IpplTimings::getTimer("scatterAllReducePIC"); IpplTimings::startTimer(scatterAllReducePICTimer); int viewSize = view.extent(0) * view.extent(1) * view.extent(2); MPI_Allreduce(viewLocal.data(), view.data(), viewSize, - MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_DOUBLE, MPI_SUM, spaceComm); IpplTimings::stopTimer(scatterAllReducePICTimer); } @@ -504,7 +505,8 @@ namespace ippl { template template void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, - const ParticleAttrib< Vector, Properties... >& pp) + const ParticleAttrib< Vector, Properties... >& pp, + const MPI_Comm& spaceComm) const { @@ -538,7 +540,7 @@ namespace ippl { IpplTimings::startTimer(scatterAllReducePIFTimer); int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, - MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); + MPI_C_DOUBLE_COMPLEX, MPI_SUM, spaceComm); IpplTimings::stopTimer(scatterAllReducePIFTimer); //IpplTimings::startTimer(scatterPIFNUFFTTimer); @@ -648,10 +650,11 @@ namespace ippl { template inline void scatterPIFNUFFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp, + const MPI_Comm& spaceComm = MPI_COMM_WORLD) { #ifdef KOKKOS_ENABLE_CUDA - attrib.scatterPIFNUFFT(f, Sk, pp); + attrib.scatterPIFNUFFT(f, Sk, pp, spaceComm); #else //throw IpplException("scatterPIFNUFFT", "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to // be compiled with CUDA. 
Otherwise use scatterPIFNUDFT."); @@ -682,9 +685,10 @@ namespace ippl { template inline void scatter(const ParticleAttrib& attrib, Field& f, - const ParticleAttrib, Properties...>& pp) + const ParticleAttrib, Properties...>& pp, + const MPI_Comm& spaceComm = MPI_COMM_WORLD) { - attrib.scatter(f, pp); + attrib.scatter(f, pp, spaceComm); } template From 18498227d6a7c78b0710b9ff007ad984eb92920c Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 1 Jun 2023 11:45:22 +0200 Subject: [PATCH 087/117] In the middle of debugging --- alpine/PinT/ChargedParticlesPinT.hpp | 37 +++-- alpine/PinT/PenningTrapPinT.cpp | 211 ++++++++++++++++++++++----- 2 files changed, 198 insertions(+), 50 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index d552442a1..7e86b6324 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -177,6 +177,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void initializeParareal(ParticleAttrib& Rbegin, ParticleAttrib& Pbegin, + ParticleAttrib& Rcoarse, + ParticleAttrib& Pcoarse, + ParticleAttrib& Rtemp, + ParticleAttrib& Ptemp, bool& isConverged, bool& isPreviousDomainConverged, const unsigned int& ntCoarse, @@ -187,22 +191,33 @@ class ChargedParticlesPinT : public ippl::ParticleBase { MPI_Comm& spaceComm) { //Copy initial conditions as they are needed later - Kokkos::deep_copy(R0.getView(), this->R.getView()); - Kokkos::deep_copy(P0.getView(), P.getView()); + //Kokkos::deep_copy(R0.getView(), this->R.getView()); + //Kokkos::deep_copy(P0.getView(), P.getView()); + Kokkos::deep_copy(Rtemp.getView(), Rcoarse.getView()); + Kokkos::deep_copy(Ptemp.getView(), Pcoarse.getView()); //Get initial guess for ranks other than 0 by propagating the coarse solver if (rankTime > 0) { - BorisPIC(this->R, P, rankTime*ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + //BorisPIC(this->R, P, rankTime*ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + BorisPIC(Rcoarse, Pcoarse, rankTime*ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); } - + + //Copy initial conditions as they are needed later + //Kokkos::deep_copy(R0.getView(), this->R.getView()); + //Kokkos::deep_copy(P0.getView(), P.getView()); + + //Ippl::Comm->barrier(); - Kokkos::deep_copy(Rbegin.getView(), this->R.getView()); - Kokkos::deep_copy(Pbegin.getView(), P.getView()); + //Kokkos::deep_copy(Rbegin.getView(), this->R.getView()); + //Kokkos::deep_copy(Pbegin.getView(), P.getView()); + Kokkos::deep_copy(Rbegin.getView(), Rcoarse.getView()); + Kokkos::deep_copy(Pbegin.getView(), Pcoarse.getView()); //Run the coarse integrator to get the values at the end of the time slice - BorisPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + //BorisPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + BorisPIC(Rcoarse, Pcoarse, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); isConverged = false; if(rankTime == 0) { @@ -860,6 +875,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); //Field solve + EfieldPIC_m = 0.0; solver_mp->solve(); // gather E field @@ -919,6 +935,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //Field solve IpplTimings::startTimer(fieldSolvePIC); + EfieldPIC_m = 0.0; solver_mp->solve(); IpplTimings::stopTimer(fieldSolvePIC); @@ -1031,7 +1048,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static 
IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); - //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); @@ -1045,7 +1062,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - if((time_m == 0.0)) { + if((time_m == 1000.0)) { IpplTimings::startTimer(dumpData); dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); @@ -1124,7 +1141,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 73802f9a0..8bef5a309 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -561,8 +561,19 @@ int main(int argc, char *argv[]){ Pbegin->create(nloc); Pend->create(nloc); + Pcoarse->q = Pcoarse->Q_m/totalP; using buffer_type = ippl::Communicate::buffer_type; int tag; + + Pcoarse->shapetype_m = argv[13]; + Pcoarse->shapedegree_m = std::atoi(argv[14]); + IpplTimings::startTimer(initializeShapeFunctionPIF); + Pcoarse->initializeShapeFunctionPIF(); + IpplTimings::stopTimer(initializeShapeFunctionPIF); + + + Pcoarse->initNUFFT(FLPIF); + #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs @@ -617,7 +628,49 @@ int main(int argc, char *argv[]){ buf->resetReadPos(); } + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + Kokkos::fence(); + + if(rankTime == 0) { + unsigned int stepsToRun = 2*ntCoarse; + Pcoarse->BorisPIC(Pend->R, Pend->P, stepsToRun, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + Pcoarse->BorisPIC(Pbegin->R, Pbegin->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + Pcoarse->BorisPIC(Pbegin->R, Pbegin->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + //Pcoarse->BorisPIF(Pend->R, Pend->P, stepsToRun, dtFine, rankTime * dtSlice, 0, 0, Bext, rankTime, rankSpace, spaceComm); + //Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, rankTime * dtSlice, 0, 0, Bext, rankTime, rankSpace, spaceComm); + //Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, rankTime * dtSlice, 0, 0, Bext, rankTime, rankSpace, spaceComm); + Pcoarse->dumpParticleData(0, Pend->R, Pend->P, "cont"); + Pcoarse->dumpParticleData(0, Pbegin->R, Pbegin->P, "sep"); + } + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + Kokkos::fence(); + if(rankTime < sizeTime-1) { + size_type bufSize = Pend->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + MPI_Request request; + Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); + buf->resetWritePos(); + MPI_Wait(&request, MPI_STATUS_IGNORE); 
+ } + Ippl::Comm->barrier(); + + if(rankTime > 0) { + size_type bufSize = Pbegin->packedSize(nloc); + buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); + buf->resetReadPos(); + } + if(rankTime < sizeTime-1) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -627,11 +680,41 @@ int main(int argc, char *argv[]){ MPI_Wait(&request, MPI_STATUS_IGNORE); } - //Ippl::Comm->barrier(); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - IpplTimings::stopTimer(deepCopy); + + if(rankTime == 1) { + unsigned int stepsToRun = (rankTime+1) * ntCoarse; + // std::cout << "Rank: " << Ippl::Comm->rank() << "needs to run " << stepsToRun << " steps" << std::endl; + Pcoarse->BorisPIC(Pbegin->R, Pbegin->P, stepsToRun, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + } + + //Pcoarse->dumpParticleData(0, Pcoarse->R, Pcoarse->P, "new"); + //Pcoarse->dumpParticleData(0, Pbegin->R, Pbegin->P, "old"); + double Rerror2 = computeRL2Error(Pbegin->R, Pcoarse->R, length, spaceComm); + double Perror2 = computePL2Error(Pbegin->P, Pcoarse->P, spaceComm); + std::cout << "Rank: " << Ippl::Comm->rank() << " Rerror: " << Rerror2 << " Perror: " << Perror2 << std::endl; + Pbegin->R = Pbegin->R - Pcoarse->R; + Pbegin->P = Pbegin->P - Pcoarse->P; + //Pcoarse->dumpParticleData(0, Pbegin->R, Pbegin->P, "diff"); + + + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + + //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + //IpplTimings::stopTimer(deepCopy); + + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); + //IpplTimings::stopTimer(deepCopy); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, @@ -661,7 +744,6 @@ int main(int argc, char *argv[]){ << " Grid points = " << nrPIC << endl; - Pcoarse->q = Pcoarse->Q_m/totalP; IpplTimings::stopTimer(particleCreation); msg << "particles created and initial conditions assigned " << endl; @@ -716,26 +798,44 @@ int main(int argc, char *argv[]){ // isPreviousDomainConverged = false; //} - bool isConverged, isPreviousDomainConverged; - Pcoarse->shapetype_m = argv[13]; - Pcoarse->shapedegree_m = std::atoi(argv[14]); - IpplTimings::startTimer(initializeShapeFunctionPIF); - Pcoarse->initializeShapeFunctionPIF(); - IpplTimings::stopTimer(initializeShapeFunctionPIF); - - Pcoarse->initNUFFT(FLPIF); - - + int sign = 1; for (unsigned int nc=0; nc < nCycles; nc++) { - double tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); - Pcoarse->time_m = tStartMySlice; - IpplTimings::startTimer(initializeCycles); - Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, - isPreviousDomainConverged, ntCoarse, - dtCoarse, tStartMySlice, Bext, rankTime, spaceComm); - 
IpplTimings::stopTimer(initializeCycles); + + double tStartMySlice; + bool sendCriteria, recvCriteria; + bool isConverged = false; + bool isPreviousDomainConverged = false; + + //IpplTimings::startTimer(initializeCycles); + //Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, Pcoarse->R, Pcoarse->P, Pcoarse->R0, + // Pcoarse->P0, isConverged, + // isPreviousDomainConverged, ntCoarse, + // dtCoarse, tStartMySlice, Bext, rankTime, spaceComm); + //IpplTimings::stopTimer(initializeCycles); + //even cycles + if(nc % 2 == 0) { + sendCriteria = (rankTime < (sizeTime-1)); + recvCriteria = (rankTime > 0); + if(rankTime == 0) { + isPreviousDomainConverged = true; + } + tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); + msg.setPrintNode(Ippl::Comm->size()-1); + } + //odd cycles + else { + recvCriteria = (rankTime < (sizeTime-1)); + sendCriteria = (rankTime > 0); + if(rankTime == (sizeTime - 1)) { + isPreviousDomainConverged = true; + } + tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); + msg.setPrintNode(0); + } + //Pcoarse->time_m = tStartMySlice; + unsigned int it = 0; while (!isConverged) { //while ((!isPreviousDomainConverged) || (!isConverged)) { @@ -743,7 +843,8 @@ int main(int argc, char *argv[]){ //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, Bext, rankTime, rankSpace, spaceComm); + Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, + Bext, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -764,12 +865,12 @@ int main(int argc, char *argv[]){ tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if((rankTime > 0) && (!isPreviousDomainConverged)) { + if(recvCriteria && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); + Ippl::Comm->recv(rankTime-sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-1, tagbool, + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-sign, tagbool, timeComm, MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); @@ -808,14 +909,14 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(timeCommunication); - if(rankTime < (sizeTime-1)) { + if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); + Ippl::Comm->isend(rankTime+sign, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+1, tagbool, timeComm); + MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+sign, tagbool, timeComm); } IpplTimings::stopTimer(timeCommunication); @@ -828,7 +929,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); 
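        //(Illustrative comment, not taken from the original sources.) Each pass of
        //this while loop is one parareal iteration: the fine propagator (BorisPIF)
        //advances the incoming slice state in Pbegin, the coarse propagator
        //(BorisPIC) is re-run on the updated initial state, and the corrected end
        //state accumulated in Pend is forwarded to the neighbouring time rank over
        //timeComm together with the isConverged flag. The sendCriteria/recvCriteria
        //flags and the 'sign' factor introduced in this patch reverse the sweep
        //direction on odd cycles, so slice states flow from rankTime 0 up to
        //sizeTime-1 on even cycles and back down on odd ones.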
IpplTimings::stopTimer(dumpData); @@ -848,18 +949,47 @@ int main(int argc, char *argv[]){ if((nCycles > 1) && (nc < (nCycles - 1))) { IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + //send, receive criteria and tStartMySlice are reversed at the end of the cycle + if(nc % 2 == 0) { + recvCriteria = (rankTime < (sizeTime-1)); + sendCriteria = (rankTime > 0); + tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); + } + //odd cycles + else { + sendCriteria = (rankTime < (sizeTime-1)); + recvCriteria = (rankTime > 0); + tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); + } - if(rankTime < (sizeTime-1)) { - size_type bufSize = Pend->packedSize(nloc); + if(recvCriteria) { + size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(rankTime+1, tag, *Pend, *buf, bufSize, nloc, timeComm); + Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - if(rankTime > 0) { + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + + Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(rankTime-1, tag, *Pend, *buf, request, nloc, timeComm); + Ippl::Comm->isend(rankTime-sign, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } @@ -868,10 +998,11 @@ int main(int argc, char *argv[]){ //Ippl::Comm->barrier(); //msg << "Communication finished in cycle: " << nc+1 << endl; - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - IpplTimings::stopTimer(deepCopy); + //IpplTimings::startTimer(deepCopy); + //Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + //IpplTimings::stopTimer(deepCopy); + sign *= -1; } } msg << TestName << " Parareal: End." 
<< endl; From cb970966f0ad67ead3fe7892dddcdfce2ec90982 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 1 Jun 2023 17:37:48 +0200 Subject: [PATCH 088/117] bug in Penning trap corrected --- alpine/ElectrostaticPIC/PenningTrap.cpp | 31 +++++----- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 29 ++++----- alpine/PinT/ChargedParticlesPinT.hpp | 66 ++++++++++---------- alpine/PinT/PenningTrapPinT.cpp | 70 +--------------------- 4 files changed, 68 insertions(+), 128 deletions(-) diff --git a/alpine/ElectrostaticPIC/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp index e8c641b56..f2b408cb2 100644 --- a/alpine/ElectrostaticPIC/PenningTrap.cpp +++ b/alpine/ElectrostaticPIC/PenningTrap.cpp @@ -352,6 +352,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpDataTimer); P->dumpData(); + P->dumpEnergy(totalP); P->gatherStatistics(totalP); //P->dumpLocalDomains(FL, 0); IpplTimings::stopTimer(dumpDataTimer); @@ -381,13 +382,13 @@ int main(int argc, char *argv[]){ double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - Eview(j)[0] += Eext_x; - Eview(j)[1] += Eext_y; - Eview(j)[2] += Eext_z; + Eext_x += Eview(j)[0]; + Eext_y += Eview(j)[1]; + Eext_z += Eview(j)[2]; - Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); - Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); - Pview(j)[2] += alpha * Eview(j)[2]; + Pview(j)[0] += alpha * (Eext_x + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eext_y - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eext_z; }); IpplTimings::stopTimer(PTimer); @@ -434,20 +435,22 @@ int main(int argc, char *argv[]){ double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - E2view(j)[0] += Eext_x; - E2view(j)[1] += Eext_y; - E2view(j)[2] += Eext_z; - P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] - + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); - P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] - - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); - P2view(j)[2] += alpha * E2view(j)[2]; + Eext_x += E2view(j)[0]; + Eext_y += E2view(j)[1]; + Eext_z += E2view(j)[2]; + + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (Eext_x + + P2view(j)[1] * Bext + alpha * Bext * Eext_y) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (Eext_y + - P2view(j)[0] * Bext - alpha * Bext * Eext_x) ); + P2view(j)[2] += alpha * Eext_z; }); IpplTimings::stopTimer(PTimer); P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); P->dumpData(); + P->dumpEnergy(totalP); P->gatherStatistics(totalP); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index dfe082298..1ae3ab415 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -296,13 +296,13 @@ int main(int argc, char *argv[]){ double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - Eview(j)[0] += Eext_x; - Eview(j)[1] += Eext_y; - Eview(j)[2] += Eext_z; + Eext_x += Eview(j)[0]; + Eext_y += Eview(j)[1]; + Eext_z += Eview(j)[2]; - Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); - Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); - 
Pview(j)[2] += alpha * Eview(j)[2]; + Pview(j)[0] += alpha * (Eext_x + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eext_y - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eext_z; }); IpplTimings::stopTimer(PTimer); @@ -333,14 +333,15 @@ int main(int argc, char *argv[]){ double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - E2view(j)[0] += Eext_x; - E2view(j)[1] += Eext_y; - E2view(j)[2] += Eext_z; - P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] - + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); - P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] - - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); - P2view(j)[2] += alpha * E2view(j)[2]; + Eext_x += E2view(j)[0]; + Eext_y += E2view(j)[1]; + Eext_z += E2view(j)[2]; + + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (Eext_x + + P2view(j)[1] * Bext + alpha * Bext * Eext_y) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (Eext_y + - P2view(j)[0] * Bext - alpha * Bext * Eext_x) ); + P2view(j)[2] += alpha * Eext_z; }); IpplTimings::stopTimer(PTimer); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 7e86b6324..117016ef4 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -909,13 +909,13 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - Eview(j)[0] += Eext_x; - Eview(j)[1] += Eext_y; - Eview(j)[2] += Eext_z; + Eext_x += Eview(j)[0]; + Eext_y += Eview(j)[1]; + Eext_z += Eview(j)[2]; - Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); - Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); - Pview(j)[2] += alpha * Eview(j)[2]; + Pview(j)[0] += alpha * (Eext_x + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eext_y - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eext_z; }); //drift @@ -951,15 +951,16 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Eext_x = -(R2view(j)[0] - 0.5*rmax[0]) * (V0/(2*std::pow(rmax[2],2))); double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - - E2view(j)[0] += Eext_x; - E2view(j)[1] += Eext_y; - E2view(j)[2] += Eext_z; - P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] - + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); - P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] - - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); - P2view(j)[2] += alpha * E2view(j)[2]; + + Eext_x += E2view(j)[0]; + Eext_y += E2view(j)[1]; + Eext_z += E2view(j)[2]; + + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (Eext_x + + P2view(j)[1] * Bext + alpha * Bext * Eext_y) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (Eext_y + - P2view(j)[0] * Bext - alpha * Bext * Eext_x) ); + P2view(j)[2] += alpha * Eext_z; }); time_m += dt; @@ -1048,7 +1049,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); - PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); + //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); @@ -1062,7 +1063,7 @@ class ChargedParticlesPinT : public 
ippl::ParticleBase { time_m = tStartMySlice; - if((time_m == 1000.0)) { + if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); @@ -1092,13 +1093,13 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Eext_y = -(Rview(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (Rview(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - Eview(j)[0] += Eext_x; - Eview(j)[1] += Eext_y; - Eview(j)[2] += Eext_z; + Eext_x += Eview(j)[0]; + Eext_y += Eview(j)[1]; + Eext_z += Eview(j)[2]; - Pview(j)[0] += alpha * (Eview(j)[0] + Pview(j)[1] * Bext); - Pview(j)[1] += alpha * (Eview(j)[1] - Pview(j)[0] * Bext); - Pview(j)[2] += alpha * Eview(j)[2]; + Pview(j)[0] += alpha * (Eext_x + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eext_y - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eext_z; }); //drift @@ -1128,20 +1129,21 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Eext_y = -(R2view(j)[1] - 0.5*rmax[1]) * (V0/(2*std::pow(rmax[2],2))); double Eext_z = (R2view(j)[2] - 0.5*rmax[2]) * (V0/(std::pow(rmax[2],2))); - E2view(j)[0] += Eext_x; - E2view(j)[1] += Eext_y; - E2view(j)[2] += Eext_z; - P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (E2view(j)[0] - + P2view(j)[1] * Bext + alpha * Bext * E2view(j)[1]) ); - P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (E2view(j)[1] - - P2view(j)[0] * Bext - alpha * Bext * E2view(j)[0]) ); - P2view(j)[2] += alpha * E2view(j)[2]; + Eext_x += E2view(j)[0]; + Eext_y += E2view(j)[1]; + Eext_z += E2view(j)[2]; + + P2view(j)[0] = DrInv * ( P2view(j)[0] + alpha * (Eext_x + + P2view(j)[1] * Bext + alpha * Bext * Eext_y) ); + P2view(j)[1] = DrInv * ( P2view(j)[1] + alpha * (Eext_y + - P2view(j)[0] * Bext - alpha * Bext * Eext_x) ); + P2view(j)[2] += alpha * Eext_z; }); time_m += dt; IpplTimings::startTimer(dumpData); - //dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 8bef5a309..e0981e81c 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -610,7 +610,6 @@ int main(int argc, char *argv[]){ //condition is not the same on different GPUs tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - //if(Ippl::Comm->rank() == 0) { if(rankTime == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); Kokkos::parallel_for(nloc, @@ -634,25 +633,13 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Kokkos::fence(); - if(rankTime == 0) { - unsigned int stepsToRun = 2*ntCoarse; - Pcoarse->BorisPIC(Pend->R, Pend->P, stepsToRun, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - Pcoarse->BorisPIC(Pbegin->R, Pbegin->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - Pcoarse->BorisPIC(Pbegin->R, Pbegin->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - //Pcoarse->BorisPIF(Pend->R, Pend->P, stepsToRun, dtFine, rankTime * dtSlice, 0, 0, Bext, rankTime, rankSpace, spaceComm); - //Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, rankTime * dtSlice, 0, 0, Bext, rankTime, rankSpace, spaceComm); - //Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, 
rankTime * dtSlice, 0, 0, Bext, rankTime, rankSpace, spaceComm); - Pcoarse->dumpParticleData(0, Pend->R, Pend->P, "cont"); - Pcoarse->dumpParticleData(0, Pbegin->R, Pbegin->P, "sep"); - } + Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); - Kokkos::fence(); if(rankTime < sizeTime-1) { size_type bufSize = Pend->packedSize(nloc); @@ -662,59 +649,6 @@ int main(int argc, char *argv[]){ buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } - Ippl::Comm->barrier(); - - if(rankTime > 0) { - size_type bufSize = Pbegin->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); - buf->resetReadPos(); - } - - if(rankTime < sizeTime-1) { - size_type bufSize = Pbegin->packedSize(nloc); - buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); - MPI_Request request; - Ippl::Comm->isend(rankTime+1, tag, *Pbegin, *buf, request, nloc, timeComm); - buf->resetWritePos(); - MPI_Wait(&request, MPI_STATUS_IGNORE); - } - - - if(rankTime == 1) { - unsigned int stepsToRun = (rankTime+1) * ntCoarse; - // std::cout << "Rank: " << Ippl::Comm->rank() << "needs to run " << stepsToRun << " steps" << std::endl; - Pcoarse->BorisPIC(Pbegin->R, Pbegin->P, stepsToRun, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - } - - //Pcoarse->dumpParticleData(0, Pcoarse->R, Pcoarse->P, "new"); - //Pcoarse->dumpParticleData(0, Pbegin->R, Pbegin->P, "old"); - double Rerror2 = computeRL2Error(Pbegin->R, Pcoarse->R, length, spaceComm); - double Perror2 = computePL2Error(Pbegin->P, Pcoarse->P, spaceComm); - std::cout << "Rank: " << Ippl::Comm->rank() << " Rerror: " << Rerror2 << " Perror: " << Perror2 << std::endl; - Pbegin->R = Pbegin->R - Pcoarse->R; - Pbegin->P = Pbegin->P - Pcoarse->P; - //Pcoarse->dumpParticleData(0, Pbegin->R, Pbegin->P, "diff"); - - - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - //IpplTimings::stopTimer(deepCopy); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, @@ -929,7 +863,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); From 01e1ab949e0e864eb65e3f3f0933485fed621d07 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 12 Jun 2023 17:01:56 
+0200 Subject: [PATCH 089/117] TSI and Landau damping modified and a bug in multiblock corrected --- alpine/PinT/BumponTailInstabilityPinT.cpp | 241 ++++++++++++++++------ alpine/PinT/ChargedParticlesPinT.hpp | 56 ++--- alpine/PinT/LandauDampingPinT.cpp | 241 +++++++++++++++------- alpine/PinT/PenningTrapPinT.cpp | 28 +-- 4 files changed, 385 insertions(+), 181 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index cf1a2c8e3..e965bf997 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -160,8 +160,7 @@ double CDF(const double& x, const double& delta, const double& k, } double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - Vector_t& length) { + Vector_t& length, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -187,16 +186,18 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - lError = std::sqrt(localError)/std::sqrt(localNorm); + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); - double relError = lError;//absError / std::sqrt(globaltemp); + double relError = std::sqrt(globalError) / std::sqrt(globalNorm); return relError; } -double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -213,9 +214,12 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - lError = std::sqrt(localError)/std::sqrt(localNorm); + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); - double relError = lError;//absError / std::sqrt(globaltemp); + double relError = std::sqrt(globalError) / std::sqrt(globalNorm); return relError; @@ -391,7 +395,26 @@ const char* TestName = "TwoStreamInstability"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); - + + int spaceColor, timeColor; + MPI_Comm spaceComm, timeComm; + + int spaceProcs = std::atoi(argv[15]); + int timeProcs = std::atoi(argv[16]); + spaceColor = Ippl::Comm->rank() / spaceProcs; + timeColor = Ippl::Comm->rank() % spaceProcs; + + MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm); + MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm); + + int rankSpace, sizeSpace, rankTime, sizeTime; + MPI_Comm_rank(spaceComm, &rankSpace); + MPI_Comm_size(spaceComm, &sizeSpace); + + MPI_Comm_rank(timeComm, &rankTime); + MPI_Comm_size(timeComm, &sizeTime); + + Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); @@ -423,7 +446,7 @@ int main(int argc, char *argv[]){ const double tEnd = std::atof(argv[8]); const unsigned int nCycles = std::atoi(argv[12]); double tEndCycle = tEnd / nCycles; - const double dtSlice = tEndCycle / Ippl::Comm->size(); + const double dtSlice = tEndCycle / sizeTime; const double dtFine = std::atof(argv[9]); 
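The hunk above is where the bump-on-tail driver (and, later in this same patch, the Landau damping driver) gains the two-level space-time decomposition: the world communicator is split once by spaceColor for the spatial work of a single time slice and once by timeColor for the exchange of particle states between consecutive slices. A minimal standalone sketch of that split, assuming a job launched with spaceProcs*timeProcs ranks (the free function and its name are illustrative, not part of the patch):

#include <mpi.h>

// Ranks sharing spaceColor form one time slice and perform the field solve,
// scatters and reductions together through spaceComm; ranks sharing timeColor
// own the same particles on successive slices and exchange them via timeComm.
void splitSpaceTime(MPI_Comm world, int spaceProcs,
                    MPI_Comm* spaceComm, MPI_Comm* timeComm) {
    int rank = 0;
    MPI_Comm_rank(world, &rank);
    const int spaceColor = rank / spaceProcs;  // which time slice
    const int timeColor  = rank % spaceProcs;  // which spatial sub-group
    MPI_Comm_split(world, spaceColor, rank, spaceComm);
    MPI_Comm_split(world, timeColor, rank, timeComm);
}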
const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = std::ceil(dtSlice / dtFine); @@ -513,14 +536,30 @@ int main(int argc, char *argv[]){ double factorVelBulk = 1.0 - epsilon; double factorVelBeam = 1.0 - factorVelBulk; - size_type nlocBulk = (size_type)(factorVelBulk * totalP); - size_type nlocBeam = (size_type)(factorVelBeam * totalP); + double factorConf = 1.0 / sizeSpace; + size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP); + size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP); size_type nloc = nlocBulk + nlocBeam; - + + size_type Total_particles = 0; + + //MPI_Allreduce(&nloc, &Total_particles, 1, + // MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); + + //int rest = (int) (totalP - Total_particles); + + //if ( (rankTime == 0) && (rankSpace < rest) ) { + // ++nloc; + //} + + MPI_Allreduce(&nloc, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); + + //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,nloc); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -548,9 +587,20 @@ int main(int argc, char *argv[]){ Pcoarse->create(nloc); Pbegin->create(nloc); Pend->create(nloc); + + Pcoarse->q = Pcoarse->Q_m/Total_particles; using buffer_type = ippl::Communicate::buffer_type; int tag; + + + Pcoarse->shapetype_m = argv[13]; + Pcoarse->shapedegree_m = std::atoi(argv[14]); + IpplTimings::startTimer(initializeShapeFunctionPIF); + Pcoarse->initializeShapeFunctionPIF(); + IpplTimings::stopTimer(initializeShapeFunctionPIF); + + Pcoarse->initNUFFT(FLPIF); #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs @@ -590,8 +640,8 @@ int main(int argc, char *argv[]){ tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() == 0) { - Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + if(rankTime == 0) { + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); Kokkos::parallel_for(nloc, generate_random, Dim>( Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, delta, kw, @@ -603,25 +653,33 @@ int main(int argc, char *argv[]){ else { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pbegin->packedSize(nloc); + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + if(rankTime < sizeTime-1) { + size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - 
Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pbegin, *buf, request, nloc); + Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } - //Ippl::Comm->barrier(); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - IpplTimings::stopTimer(deepCopy); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, @@ -648,12 +706,11 @@ int main(int argc, char *argv[]){ << "Tolerance: " << tol << " No. of cycles: " << nCycles << endl - << "Np= " << nloc + << "Np= " << Total_particles << " Fourier modes = " << nmPIF << " Grid points = " << nrPIC << endl; - Pcoarse->q = Pcoarse->Q_m/nloc; IpplTimings::stopTimer(particleCreation); msg << "particles created and initial conditions assigned " << endl; @@ -708,28 +765,42 @@ int main(int argc, char *argv[]){ // isPreviousDomainConverged = false; //} - bool isConverged, isPreviousDomainConverged; - - Pcoarse->shapetype_m = argv[13]; - Pcoarse->shapedegree_m = std::atoi(argv[14]); - IpplTimings::startTimer(initializeShapeFunctionPIF); - Pcoarse->initializeShapeFunctionPIF(); - IpplTimings::stopTimer(initializeShapeFunctionPIF); - - Pcoarse->initNUFFT(FLPIF); - + int sign = 1; for (unsigned int nc=0; nc < nCycles; nc++) { - double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); - Pcoarse->time_m = tStartMySlice; - Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, - isPreviousDomainConverged, ntCoarse, - dtCoarse, tStartMySlice); + + double tStartMySlice; + bool sendCriteria, recvCriteria; + bool isConverged = false; + bool isPreviousDomainConverged = false; + + //even cycles + if(nc % 2 == 0) { + sendCriteria = (rankTime < (sizeTime-1)); + recvCriteria = (rankTime > 0); + if(rankTime == 0) { + isPreviousDomainConverged = true; + } + tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); + msg.setPrintNode(Ippl::Comm->size()-1); + } + //odd cycles + else { + recvCriteria = (rankTime < (sizeTime-1)); + sendCriteria = (rankTime > 0); + if(rankTime == (sizeTime - 1)) { + isPreviousDomainConverged = true; + } + tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); + msg.setPrintNode(0); + } + unsigned int it = 0; + while (!isConverged) { //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, nc+1, it+1); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -749,13 +820,13 @@ int main(int argc, char *argv[]){ tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { + if(recvCriteria && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime-sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, - Ippl::getComm(), MPI_STATUS_IGNORE); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-sign, tagbool, + timeComm, 
MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); @@ -771,7 +842,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -781,11 +852,10 @@ int main(int argc, char *argv[]){ PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); - double localRerror, localPerror; IpplTimings::startTimer(computeErrors); - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); IpplTimings::stopTimer(computeErrors); //} @@ -796,14 +866,14 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(timeCommunication); - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(rankTime+sign, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+sign, tagbool, timeComm); } IpplTimings::stopTimer(timeCommunication); @@ -816,40 +886,73 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); //} IpplTimings::stopTimer(dumpData); + MPI_Barrier(spaceComm); + it += 1; } - Ippl::Comm->barrier(); + MPI_Barrier(MPI_COMM_WORLD); if((nCycles > 1) && (nc < (nCycles - 1))) { IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + //send, receive criteria and tStartMySlice are reversed at the end of the cycle + if(nc % 2 == 0) { + recvCriteria = (rankTime < (sizeTime-1)); + sendCriteria = (rankTime > 0); + tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); + } + //odd cycles + else { + sendCriteria = (rankTime < (sizeTime-1)); + recvCriteria = (rankTime > 0); + tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); + } - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pend->packedSize(nloc); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + if(recvCriteria) { + size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + 
Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - if(Ippl::Comm->rank() > 0) { + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(rankTime-sign, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } IpplTimings::stopTimer(timeCommunication); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - IpplTimings::stopTimer(deepCopy); + sign *= -1; } } msg << TestName << " Parareal: End." << endl; @@ -857,5 +960,9 @@ int main(int argc, char *argv[]){ IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); + MPI_Comm_free(&spaceComm); + MPI_Comm_free(&timeComm); + + return 0; } diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 117016ef4..c951e17fd 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -413,7 +413,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { << ExAmp << endl; } - void dumpBumponTail(const unsigned int& nc, const unsigned int& iter) { + void dumpBumponTail(const unsigned int& nc, const unsigned int& iter, int rankTime, int rankSpace) { double fieldEnergy = 0.0; @@ -480,24 +480,26 @@ class ChargedParticlesPinT : public ippl::ParticleBase { fieldEnergy *= volume; - std::stringstream fname; - fname << "data/FieldBumponTail_rank_"; - fname << Ippl::Comm->rank(); - fname << "_nc_"; - fname << nc; - fname << "_iter_"; - fname << iter; - fname << ".csv"; + if(rankSpace == 0) { + std::stringstream fname; + fname << "data/FieldBumponTail_rank_"; + fname << rankTime; + fname << "_nc_"; + fname << nc; + fname << "_iter_"; + fname << iter; + fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); - csvout << time_m << " " - << fieldEnergy << " " - << EzAmp << endl; + csvout << time_m << " " + << fieldEnergy << " " + << EzAmp << endl; + } } @@ -802,14 +804,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIC(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int nt, - const double dt, const double& tStartMySlice) { + const double dt, const double& tStartMySlice, MPI_Comm& spaceComm) { static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, 
PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIC_m = 0.0; - scatter(q, rhoPIC_m, Rtemp); + scatter(q, rhoPIC_m, Rtemp, spaceComm); rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); @@ -838,7 +840,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIC_m = 0.0; - scatter(q, rhoPIC_m, Rtemp); + scatter(q, rhoPIC_m, Rtemp, spaceComm); rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); @@ -973,16 +975,16 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const bool& /*isConverged*/, - const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter) { + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, int rankTime, int rankSpace, + MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -996,8 +998,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0)) { IpplTimings::startTimer(dumpData); //dumpLandau(iter); - dumpBumponTail(nc, iter); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); + dumpBumponTail(nc, iter, rankTime, rankSpace); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp); + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -1032,8 +1034,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { IpplTimings::startTimer(dumpData); //dumpLandau(iter); - dumpBumponTail(nc, iter); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp); + dumpBumponTail(nc, iter, rankTime, rankSpace); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index da2491d49..ead8c38c7 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -145,8 +145,7 @@ double CDF(const double& x, const double& alpha, const double& k) { } double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - Vector_t& length) { + Vector_t& length, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -172,16 +171,18 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - lError = std::sqrt(localError)/std::sqrt(localNorm); + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); - double relError = lError;//absError / 
std::sqrt(globaltemp); + double relError = std::sqrt(globalError) / std::sqrt(globalNorm); return relError; } -double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { +double computePL2Error(ParticleAttrib& Q, ParticleAttrib& QprevIter, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -198,9 +199,12 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp }, Kokkos::Sum(localError), Kokkos::Sum(localNorm)); Kokkos::fence(); - lError = std::sqrt(localError)/std::sqrt(localNorm); + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); - double relError = lError;//absError / std::sqrt(globaltemp); + double relError = std::sqrt(globalError) / std::sqrt(globalNorm); return relError; @@ -376,6 +380,24 @@ const char* TestName = "LandauDampingPinT"; int main(int argc, char *argv[]){ Ippl ippl(argc, argv); + int spaceColor, timeColor; + MPI_Comm spaceComm, timeComm; + + int spaceProcs = std::atoi(argv[15]); + int timeProcs = std::atoi(argv[16]); + spaceColor = Ippl::Comm->rank() / spaceProcs; + timeColor = Ippl::Comm->rank() % spaceProcs; + + MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm); + MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm); + + int rankSpace, sizeSpace, rankTime, sizeTime; + MPI_Comm_rank(spaceComm, &rankSpace); + MPI_Comm_size(spaceComm, &sizeSpace); + + MPI_Comm_rank(timeComm, &rankTime); + MPI_Comm_size(timeComm, &sizeTime); + Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); @@ -407,7 +429,7 @@ int main(int argc, char *argv[]){ const double tEnd = std::atof(argv[8]); const unsigned int nCycles = std::atoi(argv[12]); double tEndCycle = tEnd / nCycles; - const double dtSlice = tEndCycle / Ippl::Comm->size(); + const double dtSlice = tEndCycle / sizeTime; const double dtFine = std::atof(argv[9]); const double dtCoarse = std::atof(argv[10]); const unsigned int ntFine = std::ceil(dtSlice / dtFine); @@ -465,11 +487,26 @@ int main(int argc, char *argv[]){ PLayout_t PL(FLPIC, meshPIC); - size_type nloc = totalP; + double factor = 1.0 / sizeSpace; + size_type nloc = (size_type)(factor * totalP); + + size_type Total_particles = 0; + + //MPI_Allreduce(&nloc, &Total_particles, 1, + // MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); + + //int rest = (int) (totalP - Total_particles); + + //if ( (rankTime == 0) && (rankSpace < rest) ) { + // ++nloc; + //} + + MPI_Allreduce(&nloc, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,nloc); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -498,8 +535,19 @@ int main(int argc, char *argv[]){ Pbegin->create(nloc); Pend->create(nloc); + Pcoarse->q = Pcoarse->Q_m/Total_particles; + using buffer_type = ippl::Communicate::buffer_type; int tag; + + Pcoarse->shapetype_m = argv[13]; + Pcoarse->shapedegree_m = std::atoi(argv[14]); + IpplTimings::startTimer(initializeShapeFunctionPIF); + Pcoarse->initializeShapeFunctionPIF(); + IpplTimings::stopTimer(initializeShapeFunctionPIF); + + Pcoarse->initNUFFT(FLPIF); + #ifdef KOKKOS_ENABLE_CUDA //If we don't do the 
following even with the same seed the initial //condition is not the same on different GPUs @@ -538,8 +586,8 @@ int main(int argc, char *argv[]){ tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if(Ippl::Comm->rank() == 0) { - Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + if(rankTime == 0) { + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); Kokkos::parallel_for(nloc, generate_random, Dim>( Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); @@ -550,25 +598,31 @@ int main(int argc, char *argv[]){ else { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); - - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pbegin->packedSize(nloc); + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + if(rankTime < sizeTime-1) { + size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pbegin, *buf, request, nloc); + Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } - - //Ippl::Comm->barrier(); - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - IpplTimings::stopTimer(deepCopy); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, @@ -591,13 +645,12 @@ int main(int argc, char *argv[]){ << "Tolerance: " << tol << " No. 
of cycles: " << nCycles << endl - << "Np= " << totalP + << "Np= " << Total_particles << " Fourier modes = " << nmPIF << " Grid points = " << nrPIC << endl; - Pcoarse->q = Pcoarse->Q_m/nloc; IpplTimings::stopTimer(particleCreation); msg << "particles created and initial conditions assigned " << endl; @@ -652,27 +705,40 @@ int main(int argc, char *argv[]){ // isPreviousDomainConverged = false; //} - bool isConverged, isPreviousDomainConverged; - - Pcoarse->shapetype_m = argv[13]; - Pcoarse->shapedegree_m = std::atoi(argv[14]); - IpplTimings::startTimer(initializeShapeFunctionPIF); - Pcoarse->initializeShapeFunctionPIF(); - IpplTimings::stopTimer(initializeShapeFunctionPIF); - - Pcoarse->initNUFFT(FLPIF); + int sign = 1; for (unsigned int nc=0; nc < nCycles; nc++) { - double tStartMySlice = (nc * tEndCycle) + (Ippl::Comm->rank() * dtSlice); - Pcoarse->time_m = tStartMySlice; - Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, isConverged, - isPreviousDomainConverged, ntCoarse, - dtCoarse, tStartMySlice); + double tStartMySlice; + bool sendCriteria, recvCriteria; + bool isConverged = false; + bool isPreviousDomainConverged = false; + + //even cycles + if(nc % 2 == 0) { + sendCriteria = (rankTime < (sizeTime-1)); + recvCriteria = (rankTime > 0); + if(rankTime == 0) { + isPreviousDomainConverged = true; + } + tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); + msg.setPrintNode(Ippl::Comm->size()-1); + } + //odd cycles + else { + recvCriteria = (rankTime < (sizeTime-1)); + sendCriteria = (rankTime > 0); + if(rankTime == (sizeTime - 1)) { + isPreviousDomainConverged = true; + } + tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); + msg.setPrintNode(0); + } + unsigned int it = 0; while (!isConverged) { //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, isConverged, tStartMySlice, nc+1, it+1); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -689,13 +755,13 @@ int main(int argc, char *argv[]){ tag = 1100;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); int tagbool = 1300;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - if((Ippl::Comm->rank() > 0) && (!isPreviousDomainConverged)) { + if(recvCriteria && (!isPreviousDomainConverged)) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()-1, tag, *Pbegin, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime-sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); - MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()-1, tagbool, - Ippl::getComm(), MPI_STATUS_IGNORE); + MPI_Recv(&isPreviousDomainConverged, 1, MPI_C_BOOL, rankTime-sign, tagbool, + timeComm, MPI_STATUS_IGNORE); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); @@ -711,7 +777,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -719,16 +785,8 @@ int main(int argc, char *argv[]){ 
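For orientation: as far as the variable names allow one to tell, the loop assembled by these hunks is the standard parareal update

    U^{k+1}_{n+1} = G(U^{k+1}_n) + F(U^k_n) - G(U^k_n),

with F the fine PIF propagator (LeapFrogPIF), G the coarse PIC propagator (LeapFrogPIC), n the time slice owned by rankTime and k the iteration index. Pend carries the corrected end-of-slice state that is forwarded to the neighbouring time rank, and the even/odd cycle branches above only reverse the sweep direction (the sign variable) from one cycle to the next.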
PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); - double localRerror, localPerror; - double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - - //double Rerror = computeRL2Error(Pend->R, Pcoarse->RprevIter, it+1, Ippl::Comm->rank(), localRerror, length); - //double Perror = computePL2Error(Pend->P, Pcoarse->PprevIter, it+1, Ippl::Comm->rank(), localPerror); - //double EfieldError = 0; - //if(it > 0) { - // EfieldError = computeFieldError(Pcoarse->rhoPIF_m, Pcoarse->rhoPIFprevIter_m); - //} + double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); + double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); IpplTimings::stopTimer(computeErrors); @@ -737,17 +795,17 @@ int main(int argc, char *argv[]){ } IpplTimings::startTimer(timeCommunication); - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { + if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()+1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(rankTime+sign, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Send(&isConverged, 1, MPI_C_BOOL, Ippl::Comm->rank()+1, tagbool, Ippl::getComm()); + MPI_Send(&isConverged, 1, MPI_C_BOOL, rankTime+sign, tagbool, timeComm); } IpplTimings::stopTimer(timeCommunication); - + msg << "Finished iteration: " << it+1 << " in cycle: " << nc+1 @@ -757,37 +815,70 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(localRerror, localPerror, nc+1, it+1); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); + MPI_Barrier(spaceComm); + it += 1; } - Ippl::Comm->barrier(); + MPI_Barrier(MPI_COMM_WORLD); if((nCycles > 1) && (nc < (nCycles - 1))) { IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - if(Ippl::Comm->rank() < Ippl::Comm->size()-1) { - size_type bufSize = Pend->packedSize(nloc); + + //send, receive criteria and tStartMySlice are reversed at the end of the cycle + if(nc % 2 == 0) { + recvCriteria = (rankTime < (sizeTime-1)); + sendCriteria = (rankTime > 0); + tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); + } + //odd cycles + else { + sendCriteria = (rankTime < (sizeTime-1)); + recvCriteria = (rankTime > 0); + tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); + } + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + if(recvCriteria) { + size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - Ippl::Comm->recv(Ippl::Comm->rank()+1, tag, *Pend, *buf, bufSize, nloc); + Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - if(Ippl::Comm->rank() > 0) { + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + Kokkos::deep_copy(Pcoarse->R0.getView(), 
Pbegin->R.getView()); + Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + IpplTimings::stopTimer(deepCopy); + + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + + + if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); MPI_Request request; - Ippl::Comm->isend(Ippl::Comm->rank()-1, tag, *Pend, *buf, request, nloc); + Ippl::Comm->isend(rankTime-sign, tag, *Pend, *buf, request, nloc, timeComm); buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } IpplTimings::stopTimer(timeCommunication); - - IpplTimings::startTimer(deepCopy); - Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - IpplTimings::stopTimer(deepCopy); + sign *= -1; } } @@ -796,5 +887,9 @@ int main(int argc, char *argv[]){ IpplTimings::print(); IpplTimings::print(std::string("timing.dat")); + MPI_Comm_free(&spaceComm); + MPI_Comm_free(&timeComm); + + return 0; } diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index e0981e81c..a19e200eb 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -524,15 +524,15 @@ int main(int argc, char *argv[]){ MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); - int rest = (int) (totalP - Total_particles); + //int rest = (int) (totalP - Total_particles); - if ( Ippl::Comm->rank() < rest ) { - ++nloc; - } + //if ( (rankTime == 0) && (rankSpace < rest) ) { + // ++nloc; + //} double Q = -1562.5; double Bext = 5.0; - Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,totalP); + Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles); Pbegin = std::make_unique(PL); Pend = std::make_unique(PL); @@ -561,7 +561,7 @@ int main(int argc, char *argv[]){ Pbegin->create(nloc); Pend->create(nloc); - Pcoarse->q = Pcoarse->Q_m/totalP; + Pcoarse->q = Pcoarse->Q_m/Total_particles; using buffer_type = ippl::Communicate::buffer_type; int tag; @@ -673,7 +673,7 @@ int main(int argc, char *argv[]){ //<< " Max. iterations: " << maxIter << " No. 
of cycles: " << nCycles << endl - << "Np= " << totalP + << "Np= " << Total_particles << " Fourier modes = " << nmPIF << " Grid points = " << nrPIC << endl; @@ -742,12 +742,6 @@ int main(int argc, char *argv[]){ bool isConverged = false; bool isPreviousDomainConverged = false; - //IpplTimings::startTimer(initializeCycles); - //Pcoarse->initializeParareal(Pbegin->R, Pbegin->P, Pcoarse->R, Pcoarse->P, Pcoarse->R0, - // Pcoarse->P0, isConverged, - // isPreviousDomainConverged, ntCoarse, - // dtCoarse, tStartMySlice, Bext, rankTime, spaceComm); - //IpplTimings::stopTimer(initializeCycles); //even cycles if(nc % 2 == 0) { sendCriteria = (rankTime < (sizeTime-1)); @@ -896,7 +890,13 @@ int main(int argc, char *argv[]){ recvCriteria = (rankTime > 0); tStartMySlice = (nc * tEndCycle) + (rankTime * dtSlice); } - + + + IpplTimings::startTimer(deepCopy); + Kokkos::deep_copy(Pbegin->R.getView(), Pend->R.getView()); + Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); + IpplTimings::stopTimer(deepCopy); + if(recvCriteria) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); From 86522129a3d0f1f1154fea70b8a647e9544117dc Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 19 Jun 2023 10:09:10 +0200 Subject: [PATCH 090/117] Penning Trap ICs modified in all mini-apps --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 2 +- alpine/ElectrostaticPIC/PenningTrap.cpp | 12 ++++++++---- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 12 ++++++++---- alpine/PinT/PenningTrapPinT.cpp | 10 +++++++--- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 61730648d..67b8f738f 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -341,7 +341,7 @@ class ChargedParticles : public ippl::ParticleBase { rhoNorm_m = norm(rho_m); IpplTimings::stopTimer(sumTimer); - //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); //rho = rho_e - rho_i rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); diff --git a/alpine/ElectrostaticPIC/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp index f2b408cb2..bdb0da505 100644 --- a/alpine/ElectrostaticPIC/PenningTrap.cpp +++ b/alpine/ElectrostaticPIC/PenningTrap.cpp @@ -206,7 +206,8 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t rmin = {0.0, 0.0, 0.0}; - Vector_t rmax = {20.0, 20.0, 20.0}; + //Vector_t rmax = {20.0, 20.0, 20.0}; + Vector_t rmax = {25.0, 25.0, 25.0}; double dx = rmax[0] / nr[0]; double dy = rmax[1] / nr[1]; double dz = rmax[2] / nr[2]; @@ -236,9 +237,12 @@ int main(int argc, char *argv[]){ for (unsigned d = 0; dE_m.initialize(mesh, FL); P->rho_m.initialize(mesh, FL); diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index 1ae3ab415..e667fed18 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -183,7 +183,8 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t rmin(0.0); - Vector_t rmax(20.0); + //Vector_t rmax(20.0); + Vector_t rmax(25.0); double dx = rmax[0] / nr[0]; double dy = rmax[1] / nr[1]; double dz = rmax[2] / nr[2]; @@ -195,9 +196,12 @@ int 
main(int argc, char *argv[]){ for (unsigned d = 0; d& Q, ParticleAttrib& Qp Vector_t diff = Qview(i) - QprevIterView(i); for (unsigned d = 0; d < 3; ++d) { - bool isLeft = (diff[d] <= -22.0); - bool isRight = (diff[d] >= 22.0); - bool isInside = ((diff[d] > -22.0) && (diff[d] < 22.0)); + bool isLeft = (diff[d] <= -17.0); + bool isRight = (diff[d] >= 17.0); + bool isInside = ((diff[d] > -17.0) && (diff[d] < 17.0)); diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) +(isRight * (diff[d] - length[d])); } @@ -485,6 +485,7 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t rmin(0.0); Vector_t rmax(25.0); + //Vector_t rmax(20.0); Vector_t length = rmax - rmin; double dxPIC = length[0] / nrPIC[0]; double dyPIC = length[1] / nrPIC[1]; @@ -496,6 +497,9 @@ int main(int argc, char *argv[]){ for (unsigned d = 0; d Date: Mon, 19 Jun 2023 10:12:25 +0200 Subject: [PATCH 091/117] error checking criterion changed --- alpine/PinT/PenningTrapPinT.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 8a1988e15..3f7d2b587 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -159,9 +159,9 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp Vector_t diff = Qview(i) - QprevIterView(i); for (unsigned d = 0; d < 3; ++d) { - bool isLeft = (diff[d] <= -17.0); - bool isRight = (diff[d] >= 17.0); - bool isInside = ((diff[d] > -17.0) && (diff[d] < 17.0)); + bool isLeft = (diff[d] <= -22.0); + bool isRight = (diff[d] >= 22.0); + bool isInside = ((diff[d] > -22.0) && (diff[d] < 22.0)); diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) +(isRight * (diff[d] - length[d])); } From cb03845465ac1ad90cf7048f267f50265fa0f654 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 20 Jun 2023 14:06:22 +0200 Subject: [PATCH 092/117] Total particles changed in PIF codes --- .../BumponTailInstabilityPIF.cpp | 43 ++++++++++--------- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 26 +++++------ alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 26 +++++------ src/Particle/ParticleAttrib.hpp | 2 +- 4 files changed, 52 insertions(+), 45 deletions(-) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index 5ddbd6c46..ca7efa343 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -173,11 +173,6 @@ int main(int argc, char *argv[]){ const unsigned int nt = std::atoi(argv[5]); const double dt = std::atof(argv[6]); - msg << TestName - << endl - << "nt " << nt << " Np= " - << totalP << " Fourier modes = " << nr - << endl; using bunch_type = ChargedParticlesPIF; @@ -240,9 +235,27 @@ int main(int argc, char *argv[]){ FieldLayout_t FL(domain, decomp, isAllPeriodic); PLayout_t PL(FL, mesh); + double factorConf = 1.0/Ippl::Comm->size(); + double factorVelBulk = 1.0 - epsilon; + double factorVelBeam = 1.0 - factorVelBulk; + size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP); + size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP); + size_type nloc = nlocBulk + nlocBeam; + size_type Total_particles = 0; + + MPI_Allreduce(&nloc, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); + + msg << TestName + << endl + << "nt " << nt << " Np= " + << Total_particles << " Fourier modes = " << nr + << endl; + + //Q = -\int\int f dx dv double 
Q = -rmax[0] * rmax[1] * rmax[2]; - P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,Total_particles); P->nr_m = nr; @@ -266,21 +279,11 @@ int main(int argc, char *argv[]){ maxU[d] = CDF(rmax[d], delta, kw[d], d); } - double factorConf = 1.0/Ippl::Comm->size(); - double factorVelBulk = 1.0 - epsilon; - double factorVelBeam = 1.0 - factorVelBulk; - size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP); - size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP); - size_type nloc = nlocBulk + nlocBeam; - size_type Total_particles = 0; - - MPI_Allreduce(&nloc, &Total_particles, 1, - MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); - int rest = (int) (totalP - Total_particles); + //int rest = (int) (totalP - Total_particles); - if ( Ippl::Comm->rank() < rest ) - ++nloc; + //if ( Ippl::Comm->rank() < rest ) + // ++nloc; P->create(nloc); Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); @@ -293,7 +296,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); IpplTimings::stopTimer(particleCreation); - P->q = P->Q_m/totalP; + P->q = P->Q_m/Total_particles; msg << "particles created and initial conditions assigned " << endl; IpplTimings::startTimer(initializeShapeFunctionPIF); diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 0ed48fc0e..2373d3f15 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -170,10 +170,18 @@ int main(int argc, char *argv[]){ const unsigned int nt = std::atoi(argv[5]); const double dt = std::atof(argv[6]); + double factor = 1.0/Ippl::Comm->size(); + size_type nloc = (size_type)(factor * totalP); + size_type Total_particles = 0; + + MPI_Allreduce(&nloc, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); + + msg << "Landau damping" << endl << "nt " << nt << " Np= " - << totalP << " Fourier modes = " << nr + << Total_particles << " Fourier modes = " << nr << endl; using bunch_type = ChargedParticlesPIF; @@ -214,7 +222,7 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; //double Q = -64.0 * pi * pi * pi; - P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,Total_particles); P->nr_m = nr; @@ -241,17 +249,11 @@ int main(int argc, char *argv[]){ //maxU[d] = rmax[d];//CDF(Regions(myRank)[d].max(), alpha, kw[d]); } - double factor = 1.0/Ippl::Comm->size(); - size_type nloc = (size_type)(factor * totalP); - size_type Total_particles = 0; - - MPI_Allreduce(&nloc, &Total_particles, 1, - MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); - int rest = (int) (totalP - Total_particles); + //int rest = (int) (totalP - Total_particles); - if ( Ippl::Comm->rank() < rest ) - ++nloc; + //if ( Ippl::Comm->rank() < rest ) + // ++nloc; P->create(nloc); Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); @@ -263,7 +265,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); IpplTimings::stopTimer(particleCreation); - P->q = P->Q_m/totalP; + P->q = P->Q_m/Total_particles; msg << "particles created and initial conditions assigned " << endl; IpplTimings::startTimer(initializeShapeFunctionPIF); diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index e667fed18..f70ddaa08 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ 
b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -161,10 +161,18 @@ int main(int argc, char *argv[]){ const unsigned int nt = std::atoi(argv[5]); const double dt = std::atof(argv[6]); + double factor = 1.0/Ippl::Comm->size(); + size_type nloc = (size_type)(factor * totalP); + size_type Total_particles = 0; + + MPI_Allreduce(&nloc, &Total_particles, 1, + MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); + + msg << TestName << endl << "nt " << nt << " Np= " - << totalP << " Fourier modes = " << nr + << Total_particles << " Fourier modes = " << nr << endl; using bunch_type = ChargedParticlesPIF; @@ -213,7 +221,7 @@ int main(int argc, char *argv[]){ double Q = -1562.5; double Bext = 5.0; - P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,totalP); + P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,Total_particles); P->nr_m = nr; @@ -236,17 +244,11 @@ int main(int argc, char *argv[]){ maxU[d] = CDF(rmax[d], mu[d], sd[d]); } - double factor = 1.0/Ippl::Comm->size(); - size_type nloc = (size_type)(factor * totalP); - size_type Total_particles = 0; - - MPI_Allreduce(&nloc, &Total_particles, 1, - MPI_UNSIGNED_LONG, MPI_SUM, Ippl::getComm()); - int rest = (int) (totalP - Total_particles); + //int rest = (int) (totalP - Total_particles); - if ( Ippl::Comm->rank() < rest ) - ++nloc; + //if ( Ippl::Comm->rank() < rest ) + // ++nloc; P->create(nloc); Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); @@ -258,7 +260,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); IpplTimings::stopTimer(particleCreation); - P->q = P->Q_m/totalP; + P->q = P->Q_m/Total_particles; msg << "particles created and initial conditions assigned " << endl; IpplTimings::startTimer(initializeShapeFunctionPIF); diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 6a4047742..7e45c5071 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -543,7 +543,7 @@ namespace ippl { MPI_C_DOUBLE_COMPLEX, MPI_SUM, spaceComm); IpplTimings::stopTimer(scatterAllReducePIFTimer); - //IpplTimings::startTimer(scatterPIFNUFFTTimer); + IpplTimings::startTimer(scatterPIFNUFFTTimer); using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("Multiply with shape functions", From 824fc55b875c31e073b58861021b9e04361e2f8a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 24 Jun 2023 08:42:00 +0200 Subject: [PATCH 093/117] Bug in Two-stram instability fixed --- alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp | 1 + alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index ca7efa343..dcd059bdf 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -303,6 +303,7 @@ int main(int argc, char *argv[]){ P->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); + P->initNUFFT(FL); P->scatter(); diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 51c308f8f..78042f404 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -133,6 +133,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { fftParams.add("tolerance", 1e-6); fftParams.add("use_cufinufft_defaults", false); + //fftParams.add("use_cufinufft_defaults", true); q.initializeNUFFT(FL, 1, fftParams); 
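The "Total particles changed in PIF codes" hunks above make the per-particle charge consistent with the number of particles that are actually created: every rank truncates its share of totalP, the exact global count is recovered with an all-reduce (it can end up slightly below the requested totalP now that the remainder redistribution is commented out), and q is set from that count. A small sketch of the bookkeeping, with illustrative names and plain MPI types in place of ippl's size_type:

#include <mpi.h>

// Each rank creates a truncated share of the requested particle number; the
// global sum is taken afterwards so that q = Q / total is identical on all
// ranks even when totalP is not divisible by the communicator size.
unsigned long long localShare(unsigned long long totalP, int commSize) {
    return static_cast<unsigned long long>((1.0 / commSize) * totalP);
}

unsigned long long globalCount(unsigned long long nloc, MPI_Comm comm) {
    unsigned long long total = 0;
    MPI_Allreduce(&nloc, &total, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, comm);
    return total;
}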
E.initializeNUFFT(FL, 2, fftParams); From ffcdaf782b3a76f019de48104b0a39c8aa039cc6 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 7 Jul 2023 08:24:17 +0200 Subject: [PATCH 094/117] FFT temporaries moved as member variables and sort ption removed in NUFFT as it is error prone --- .../BumponTailInstability.cpp | 4 +- alpine/ElectrostaticPIC/ChargedParticles.hpp | 2 +- alpine/ElectrostaticPIC/PenningTrap.cpp | 2 +- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 2 +- alpine/PinT/ChargedParticlesPinT.hpp | 2 +- src/FFT/FFT.h | 26 ++- src/FFT/FFT.hpp | 114 ++++++++---- src/Particle/ParticleAttrib.hpp | 2 +- test/FFT/TestNUFFT1.cpp | 164 +++++++++--------- 9 files changed, 193 insertions(+), 125 deletions(-) diff --git a/alpine/ElectrostaticPIC/BumponTailInstability.cpp b/alpine/ElectrostaticPIC/BumponTailInstability.cpp index c15cf60aa..07e595cbc 100644 --- a/alpine/ElectrostaticPIC/BumponTailInstability.cpp +++ b/alpine/ElectrostaticPIC/BumponTailInstability.cpp @@ -252,7 +252,7 @@ int main(int argc, char *argv[]){ Vector_t hr = {dx, dy, dz}; Vector_t origin = {rmin[0], rmin[1], rmin[2]}; - const double dt = 0.5*dx;//0.05 + const double dt = std::atof(argv[9]);;//0.5*dx; const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); @@ -383,6 +383,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(dumpDataTimer); P->dumpBumponTail(); + P->dumpEnergy(totalP); P->gatherStatistics(totalP); //P->dumpLocalDomains(FL, 0); IpplTimings::stopTimer(dumpDataTimer); @@ -442,6 +443,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); P->dumpBumponTail(); + P->dumpEnergy(totalP); P->gatherStatistics(totalP); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 67b8f738f..61730648d 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -341,7 +341,7 @@ class ChargedParticles : public ippl::ParticleBase { rhoNorm_m = norm(rho_m); IpplTimings::stopTimer(sumTimer); - dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); //rho = rho_e - rho_i rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); diff --git a/alpine/ElectrostaticPIC/PenningTrap.cpp b/alpine/ElectrostaticPIC/PenningTrap.cpp index bdb0da505..4cb27474c 100644 --- a/alpine/ElectrostaticPIC/PenningTrap.cpp +++ b/alpine/ElectrostaticPIC/PenningTrap.cpp @@ -216,7 +216,7 @@ int main(int argc, char *argv[]){ Vector_t origin = {rmin[0], rmin[1], rmin[2]}; //unsigned int nrMax = 2048;// Max grid size in our studies //double dxFinest = rmax[0] / nrMax; - const double dt = 0.05;//0.5 * dxFinest;//size of timestep + const double dt = std::atof(argv[9]);;//0.5*dx; const bool isAllPeriodic=true; Mesh_t mesh(domain, hr, origin); diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 78042f404..b271372bc 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -128,7 +128,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { ippl::ParameterList fftParams; fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 1); + fftParams.add("gpu_sort", 0); 
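The src/FFT hunks that follow implement the "temporaries moved as member variables" half of this patch: the Kokkos::View scratch buffers that used to be allocated inside every transform become members (tempField_m, tempFieldf_m, tempFieldg_m) that are reallocated only when the requested extents outgrow them. A minimal sketch of that grow-only reuse pattern (free function and names are illustrative, not the library's API):

#include <Kokkos_Core.hpp>
#include <cstddef>

// Keep a persistent scratch view and grow it lazily; transforms of the same
// size then reuse the existing allocation instead of creating a fresh view
// on every call.
template <typename T>
void ensureScratch(Kokkos::View<T***>& scratch,
                   std::size_t n0, std::size_t n1, std::size_t n2) {
    if (scratch.size() < n0 * n1 * n2) {
        Kokkos::realloc(scratch, n0, n1, n2);
    }
}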
fftParams.add("gpu_kerevalmeth", 1); fftParams.add("tolerance", 1e-6); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index c951e17fd..9cb817dd5 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -165,7 +165,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { ippl::ParameterList fftParams; fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 1); + fftParams.add("gpu_sort", 0); fftParams.add("gpu_kerevalmeth", 1); fftParams.add("tolerance", 1e-6); diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 2b41a9495..1b0fad70e 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -36,6 +36,7 @@ #include #include +#include "Types/IpplTypes.h" #include "FieldLayout/FieldLayout.h" #include "Field/Field.h" //#include "Particle/ParticleAttrib.h" @@ -175,6 +176,7 @@ namespace ippl { using heffteBackend = typename detail::HeffteBackendType::backend; using workspace_t = typename heffte::fft3d::template buffer_container; + using view_type = typename detail::ViewType::view_type; /** Create a new FFT object with the layout for the input Field and * parameters for heffte. @@ -202,6 +204,7 @@ namespace ippl { std::shared_ptr> heffte_m; workspace_t workspace_m; + view_type tempField_m; }; @@ -220,6 +223,8 @@ namespace ippl { using heffteBackend = typename detail::HeffteBackendType::backend; typedef Kokkos::complex Complex_t; using workspace_t = typename heffte::fft3d_r2c::template buffer_container; + using view_real_type = typename detail::ViewType::view_type; + using view_complex_type = typename detail::ViewType::view_type; typedef Field ComplexField_t; @@ -253,6 +258,8 @@ namespace ippl { std::shared_ptr> heffte_m; workspace_t workspace_m; + view_real_type tempFieldf_m; + view_complex_type tempFieldg_m; }; @@ -269,6 +276,7 @@ namespace ippl { using heffteBackend = typename detail::HeffteBackendType::backendSine; using workspace_t = typename heffte::fft3d::template buffer_container; + using view_type = typename detail::ViewType::view_type; /** Create a new FFT object with the layout for the input Field and * parameters for heffte. @@ -294,6 +302,7 @@ namespace ippl { std::shared_ptr> heffte_m; workspace_t workspace_m; + view_type tempField_m; }; /** @@ -309,6 +318,7 @@ namespace ippl { using heffteBackend = typename detail::HeffteBackendType::backendCos; using workspace_t = typename heffte::fft3d::template buffer_container; + using view_type = typename detail::ViewType::view_type; /** Create a new FFT object with the layout for the input Field and * parameters for heffte. @@ -334,6 +344,7 @@ namespace ippl { std::shared_ptr> heffte_m; workspace_t workspace_m; + view_type tempField_m; }; @@ -353,20 +364,23 @@ namespace ippl { using complexType = typename detail::CufinufftType::complexType; using plan_t = typename detail::CufinufftType::plan_t; + using view_field_type = typename detail::ViewType::view_type; + using view_particle_real_type = typename detail::ViewType::view_type; + using view_particle_complex_type = typename detail::ViewType::view_type; /** Create a new FFT object with the layout for the input Field, type * (1 or 2) for the NUFFT and parameters for cuFINUFFT. */ - FFT(const Layout_t& layout, int type, const ParameterList& params); + FFT(const Layout_t& layout, const detail::size_type& localNp, int type, const ParameterList& params); // Destructor ~FFT(); /** Do the NUFFT. */ - template - void transform(const ParticleAttrib< Vector, Properties... 
>& R, - ParticleAttrib& Q, ComplexField_t& f); + template + void transform(const ParticleAttrib< Vector, Properties... >& R, + ParticleAttrib& Q, ComplexField_t& f); private: @@ -382,6 +396,10 @@ namespace ippl { int ier_m; T tol_m; int type_m; + view_field_type tempField_m; + view_particle_real_type tempR_m[3] = {}; + view_particle_complex_type tempQ_m; + }; diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 0698e1ceb..2c36113fa 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -74,6 +74,9 @@ namespace ippl { high[d] = static_cast(lDom[d].length() + lDom[d].first() - 1); } + if(tempField_m.size() < lDom.size()) { + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + } setup(low, high, params); } @@ -148,11 +151,12 @@ namespace ippl { *2) heffte accepts data in layout left (by default) eventhough this *can be changed during heffte box creation */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); + //Kokkos::View + // tempField("tempField", fview.extent(0) - 2*nghost, + // fview.extent(1) - 2*nghost, + // fview.extent(2) - 2*nghost); + auto tempField = tempField_m; using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("copy from Kokkos FFT", @@ -259,6 +263,14 @@ namespace ippl { lDomOutput[d].first() - 1); } + + if(tempFieldf_m.size() < lDomInput.size()) { + Kokkos::realloc(tempFieldf_m, lDomInput[0].length(), lDomInput[1].length(), lDomInput[2].length()); + } + if(tempFieldg_m.size() < lDomOutput.size()) { + Kokkos::realloc(tempFieldg_m, lDomOutput[0].length(), lDomOutput[1].length(), lDomOutput[2].length()); + } + setup(lowInput, highInput, lowOutput, highOutput, params); } @@ -337,16 +349,18 @@ namespace ippl { *2) heffte accepts data in layout left (by default) eventhough this *can be changed during heffte box creation */ - Kokkos::View - tempFieldf("tempFieldf", fview.extent(0) - 2*nghostf, - fview.extent(1) - 2*nghostf, - fview.extent(2) - 2*nghostf); - - Kokkos::View - tempFieldg("tempFieldg", gview.extent(0) - 2*nghostg, - gview.extent(1) - 2*nghostg, - gview.extent(2) - 2*nghostg); - + //Kokkos::View + // tempFieldf("tempFieldf", fview.extent(0) - 2*nghostf, + // fview.extent(1) - 2*nghostf, + // fview.extent(2) - 2*nghostf); + + //Kokkos::View + // tempFieldg("tempFieldg", gview.extent(0) - 2*nghostg, + // gview.extent(1) - 2*nghostg, + // gview.extent(2) - 2*nghostg); + + auto tempFieldf = tempFieldf_m; + auto tempFieldg = tempFieldg_m; using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("copy from Kokkos f field in FFT", @@ -463,6 +477,9 @@ namespace ippl { high[d] = static_cast(lDom[d].length() + lDom[d].first() - 1); } + if(tempField_m.size() < lDom.size()) { + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + } setup(low, high, params); } @@ -534,11 +551,12 @@ namespace ippl { *2) heffte accepts data in layout left (by default) eventhough this *can be changed during heffte box creation */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); + //Kokkos::View + // tempField("tempField", fview.extent(0) - 2*nghost, + // fview.extent(1) - 2*nghost, + // fview.extent(2) - 2*nghost); + auto tempField = tempField_m; using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("copy from Kokkos FFT", @@ -624,6 +642,9 @@ namespace ippl { high[d] = static_cast(lDom[d].length() + lDom[d].first() - 1); } + if(tempField_m.size() < 
lDom.size()) { + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + } setup(low, high, params); } @@ -696,11 +717,12 @@ namespace ippl { *2) heffte accepts data in layout left (by default) eventhough this *can be changed during heffte box creation */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); + //Kokkos::View + // tempField("tempField", fview.extent(0) - 2*nghost, + // fview.extent(1) - 2*nghost, + // fview.extent(2) - 2*nghost); + auto tempField = tempField_m; using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("copy from Kokkos FFT", @@ -762,6 +784,7 @@ namespace ippl { template FFT::FFT(const Layout_t& layout, + const detail::size_type& localNp, int type, const ParameterList& params) { @@ -783,6 +806,17 @@ namespace ippl { } type_m = type; + if(tempField_m.size() < lDom.size()) { + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + } + for(size_t d = 0; d < Dim; ++d) { + if(tempR_m[d].size() < localNp) { + Kokkos::realloc(tempR_m[d], localNp); + } + } + if(tempQ_m.size() < localNp) { + Kokkos::realloc(tempQ_m, localNp); + } setup(nmodes, params); } @@ -832,10 +866,10 @@ namespace ippl { template - template + template void - FFT::transform(const ParticleAttrib< Vector, Properties... >& R, - ParticleAttrib& Q, + FFT::transform(const ParticleAttrib< Vector, Properties... >& R, + ParticleAttrib& Q, typename FFT::ComplexField_t& f) { @@ -865,24 +899,30 @@ namespace ippl { * cuFINUFFT's layout is left, hence we allocate the temporary * Kokkos views with the same layout */ - Kokkos::View - tempField("tempField", fview.extent(0) - 2*nghost, - fview.extent(1) - 2*nghost, - fview.extent(2) - 2*nghost); + //Kokkos::View + // tempField("tempField", fview.extent(0) - 2*nghost, + // fview.extent(1) - 2*nghost, + // fview.extent(2) - 2*nghost); - //Initialize the pointers to NULL and fill only relevant dimensions - //CUFINUFFT requires the input like this. - Kokkos::View tempR[3] = {}; + ////Initialize the pointers to NULL and fill only relevant dimensions + ////CUFINUFFT requires the input like this. 
+ //Kokkos::View tempR[3] = {}; - for(size_t d = 0; d < Dim; ++d) { - Kokkos::realloc(tempR[d], localNp); - } + //for(size_t d = 0; d < Dim; ++d) { + // Kokkos::realloc(tempR[d], localNp); + //} - Kokkos::View tempQ("tempQ", localNp); - + //Kokkos::View tempQ("tempQ", localNp); + + auto tempField = tempField_m; + auto tempQ = tempQ_m; + Kokkos::View tempR[3] = {}; + for(size_t d = 0; d < Dim; ++d) { + tempR[d] = tempR_m[d]; + } using mdrange_type = Kokkos::MDRangePolicy>; Kokkos::parallel_for("copy from field data NUFFT", diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 7e45c5071..93f8620b3 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -497,7 +497,7 @@ namespace ippl { template void ParticleAttrib::initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams) { - fftType_mp = std::make_shared>(layout, type, fftParams); + fftType_mp = std::make_shared>(layout, *(this->localNum_mp), type, fftParams); } diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index a020c4c79..0e261b035 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -73,7 +73,7 @@ int main(int argc, char *argv[]) { typedef Bunch bunch_type; - ippl::Vector pt = {32, 32, 32}; + ippl::Vector pt = {512, 512, 512}; ippl::Index I(pt[0]); ippl::Index J(pt[1]); ippl::Index K(pt[2]); @@ -91,10 +91,16 @@ int main(int argc, char *argv[]) { 2.0 * pi / double(pt[2]), }; + //std::array dx = { + // 25.0 / double(pt[0]), + // 25.0 / double(pt[1]), + // 25.0 / double(pt[2]), + //}; typedef ippl::Vector Vector_t; Vector_t hx = {dx[0], dx[1], dx[2]}; Vector_t origin = {-pi, -pi, -pi}; + //Vector_t origin = {0, 0, 0}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); @@ -105,7 +111,7 @@ int main(int argc, char *argv[]) { using size_type = ippl::detail::size_type; - size_type Np = std::pow(32,3) * 20; + size_type Np = std::pow(512,3) * 5; typedef ippl::Field, dim> field_type; @@ -115,9 +121,9 @@ int main(int argc, char *argv[]) { ippl::ParameterList fftParams; fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 1); + fftParams.add("gpu_sort", 0); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-10); + fftParams.add("tolerance", 1e-6); fftParams.add("use_cufinufft_defaults", false); @@ -127,15 +133,17 @@ int main(int argc, char *argv[]) { int type = 1; - fft = std::make_unique(layout, type, fftParams); Vector_t minU = {-pi, -pi, -pi}; Vector_t maxU = {pi, pi, pi}; + //Vector_t minU = {0.0, 0.0, 0.0}; + //Vector_t maxU = {25.0, 25.0, 25.0}; size_type nloc = Np/Ippl::Comm->size(); bunch.create(nloc); + fft = std::make_unique(layout, nloc, type, fftParams); Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42)); Kokkos::parallel_for(nloc, generate_random, dim>( @@ -167,82 +175,82 @@ int main(int argc, char *argv[]) { auto Qview = bunch.Q.getView(); Kokkos::complex imag = {0.0, 1.0}; - size_t flatN = pt[0] * pt[1] * pt[2]; - auto fview = field_dft.getView(); + //size_t flatN = pt[0] * pt[1] * pt[2]; + //auto fview = field_dft.getView(); - typedef Kokkos::TeamPolicy<> team_policy; - typedef Kokkos::TeamPolicy<>::member_type member_type; - - Kokkos::parallel_for("NUDFT type 1", - team_policy(flatN, Kokkos::AUTO), - KOKKOS_LAMBDA(const member_type& teamMember) { - const size_t flatIndex = teamMember.league_rank(); - - const int k = (int)(flatIndex / (pt[0] * pt[1])); - const int flatIndex2D = flatIndex - (k * pt[0] * pt[1]); - const int i = flatIndex2D % pt[0]; - const int j = 
(int)(flatIndex2D / pt[0]); - - Kokkos::complex reducedValue = 0.0; - ippl::Vector iVec = {i, j, k}; - ippl::VectorkVec; - for(size_t d = 0; d < 3; ++d) { - kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - (pt[d] / 2)); - } - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nloc), - [=](const size_t idx, Kokkos::complex& innerReduce) - { - double arg = 0.0; - for(size_t d = 0; d < 3; ++d) { - arg += kVec[d]*Rview(idx)[d]; - } - const double& val = Qview(idx); - - innerReduce += (Kokkos::cos(arg) - - imag * Kokkos::sin(arg)) * val; - }, Kokkos::Sum>(reducedValue)); - - if(teamMember.team_rank() == 0) { - fview(i+nghost,j+nghost,k+nghost) = reducedValue; - } - - }); - - typename field_type::HostMirror rhoNUDFT_host = field_dft.getHostMirror(); - Kokkos::deep_copy(rhoNUDFT_host, field_dft.getView()); - std::stringstream pname; - pname << "data/FieldFFT_"; - pname << Ippl::Comm->rank(); - pname << ".csv"; - Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); - pcsvout.precision(10); - pcsvout.setf(std::ios::scientific, std::ios::floatfield); - pcsvout << "rho" << endl; - for (int i = 0; i< pt[0]; i++) { - for (int j = 0; j< pt[1]; j++) { - for (int k = 0; k< pt[2]; k++) { - pcsvout << field_result(i+nghost,j+nghost, k+nghost) << endl; - } - } - } - std::stringstream pname2; - pname2 << "data/FieldDFT_"; - pname2 << Ippl::Comm->rank(); - pname2 << ".csv"; - Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); - pcsvout2.precision(10); - pcsvout2.setf(std::ios::scientific, std::ios::floatfield); - pcsvout2 << "rho" << endl; - for (int i = 0; i< pt[0]; i++) { - for (int j = 0; j< pt[1]; j++) { - for (int k = 0; k< pt[2]; k++) { - pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; - } - } - } - Ippl::Comm->barrier(); + //typedef Kokkos::TeamPolicy<> team_policy; + //typedef Kokkos::TeamPolicy<>::member_type member_type; + + //Kokkos::parallel_for("NUDFT type 1", + // team_policy(flatN, Kokkos::AUTO), + // KOKKOS_LAMBDA(const member_type& teamMember) { + // const size_t flatIndex = teamMember.league_rank(); + // + // const int k = (int)(flatIndex / (pt[0] * pt[1])); + // const int flatIndex2D = flatIndex - (k * pt[0] * pt[1]); + // const int i = flatIndex2D % pt[0]; + // const int j = (int)(flatIndex2D / pt[0]); + // + // Kokkos::complex reducedValue = 0.0; + // ippl::Vector iVec = {i, j, k}; + // ippl::VectorkVec; + // for(size_t d = 0; d < 3; ++d) { + // kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - (pt[d] / 2)); + // } + // Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nloc), + // [=](const size_t idx, Kokkos::complex& innerReduce) + // { + // double arg = 0.0; + // for(size_t d = 0; d < 3; ++d) { + // arg += kVec[d]*Rview(idx)[d]; + // } + // const double& val = Qview(idx); + + // innerReduce += (Kokkos::cos(arg) + // - imag * Kokkos::sin(arg)) * val; + // }, Kokkos::Sum>(reducedValue)); + + // if(teamMember.team_rank() == 0) { + // fview(i+nghost,j+nghost,k+nghost) = reducedValue; + // } + + // }); + // + //typename field_type::HostMirror rhoNUDFT_host = field_dft.getHostMirror(); + //Kokkos::deep_copy(rhoNUDFT_host, field_dft.getView()); + //std::stringstream pname; + //pname << "data/FieldFFT_"; + //pname << Ippl::Comm->rank(); + //pname << ".csv"; + //Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + //pcsvout.precision(10); + //pcsvout.setf(std::ios::scientific, std::ios::floatfield); + //pcsvout << "rho" << endl; + //for (int i = 0; i< 
pt[0]; i++) { + // for (int j = 0; j< pt[1]; j++) { + // for (int k = 0; k< pt[2]; k++) { + // pcsvout << field_result(i+nghost,j+nghost, k+nghost) << endl; + // } + // } + //} + //std::stringstream pname2; + //pname2 << "data/FieldDFT_"; + //pname2 << Ippl::Comm->rank(); + //pname2 << ".csv"; + //Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + //pcsvout2.precision(10); + //pcsvout2.setf(std::ios::scientific, std::ios::floatfield); + //pcsvout2 << "rho" << endl; + //for (int i = 0; i< pt[0]; i++) { + // for (int j = 0; j< pt[1]; j++) { + // for (int k = 0; k< pt[2]; k++) { + // pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; + // } + // } + // } + // Ippl::Comm->barrier(); From 3b744169fd4e650d2f8c7d02c2f01e641412daf5 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 31 Jul 2023 15:19:05 +0200 Subject: [PATCH 095/117] Uncommited changes committed and pushed --- alpine/ElectrostaticPIC/ChargedParticles.hpp | 2 +- alpine/ElectrostaticPIC/LandauDamping.cpp | 2 +- alpine/PinT/LandauDampingPinT.cpp | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 61730648d..67b8f738f 100644 --- a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -341,7 +341,7 @@ class ChargedParticles : public ippl::ParticleBase { rhoNorm_m = norm(rho_m); IpplTimings::stopTimer(sumTimer); - //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); //rho = rho_e - rho_i rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); diff --git a/alpine/ElectrostaticPIC/LandauDamping.cpp b/alpine/ElectrostaticPIC/LandauDamping.cpp index fde09c024..0aed5ebc8 100644 --- a/alpine/ElectrostaticPIC/LandauDamping.cpp +++ b/alpine/ElectrostaticPIC/LandauDamping.cpp @@ -202,7 +202,7 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; - double alpha = 0.05; + double alpha = 0.5; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; double dx = rmax[0] / nr[0]; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index ead8c38c7..4c2fe9e5f 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -463,7 +463,8 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; //double alpha = 0.05; - Vector_t alpha = {0.05, 0.05, 0.05}; + //Vector_t alpha = {0.05, 0.05, 0.05}; + Vector_t alpha = {0.5, 0.5, 0.5}; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; Vector_t length = rmax - rmin; From 061854f00a38ec40c3d68b81400b079486dc0b2f Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 5 Sep 2023 17:43:51 +0200 Subject: [PATCH 096/117] Uncommited changes pushed --- alpine/PinT/LandauDampingPinT.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 4c2fe9e5f..93ced88d4 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -463,8 +463,8 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; //double alpha = 0.05; - //Vector_t alpha = {0.05, 0.05, 
0.05}; - Vector_t alpha = {0.5, 0.5, 0.5}; + Vector_t alpha = {0.05, 0.05, 0.05}; + //Vector_t alpha = {0.5, 0.5, 0.5}; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; Vector_t length = rmax - rmin; From 7497f8eecec808fe979896ad5f0a14cbfa22f8b6 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 20 Dec 2023 05:43:00 -0800 Subject: [PATCH 097/117] State corresponding to Perlmutter full system scaling study --- alpine/PinT/BumponTailInstabilityPinT.cpp | 16 +++---- alpine/PinT/ChargedParticlesPinT.hpp | 54 +++++++++++------------ alpine/PinT/LandauDampingPinT.cpp | 14 +++--- alpine/PinT/PenningTrapPinT.cpp | 8 ++-- src/FFT/FFT.h | 22 ++++----- src/FFT/FFT.hpp | 20 +++++---- test/FFT/TestNUFFT2.cpp | 8 ++-- 7 files changed, 73 insertions(+), 69 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index e965bf997..fe5d4c3fa 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -436,7 +436,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); - static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); @@ -884,13 +884,13 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); - //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { - //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - //} - IpplTimings::stopTimer(dumpData); + //IpplTimings::startTimer(dumpData); + ////Pcoarse->writeError(Rerror, Perror, it+1); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + ////if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { + ////Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); + ////} + //IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 9cb817dd5..0e2d5912b 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -975,11 +975,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, int rankTime, int rankSpace, + const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, + const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, MPI_Comm& spaceComm) { - static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); @@ -995,13 +995,13 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - if((time_m == 0.0)) { - IpplTimings::startTimer(dumpData); - 
//dumpLandau(iter); - dumpBumponTail(nc, iter, rankTime, rankSpace); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - IpplTimings::stopTimer(dumpData); - } + //if((time_m == 0.0)) { + // IpplTimings::startTimer(dumpData); + // //dumpLandau(iter); + // dumpBumponTail(nc, iter, rankTime, rankSpace); + // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + // IpplTimings::stopTimer(dumpData); + //} for (unsigned int it=0; it { time_m += dt; - IpplTimings::startTimer(dumpData); - //dumpLandau(iter); - dumpBumponTail(nc, iter, rankTime, rankSpace); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - IpplTimings::stopTimer(dumpData); + //IpplTimings::startTimer(dumpData); + ////dumpLandau(iter); + //dumpBumponTail(nc, iter, rankTime, rankSpace); + //dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //IpplTimings::stopTimer(dumpData); } } @@ -1044,12 +1044,12 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, const double& Bext, - int rankTime, int rankSpace, + const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, + const unsigned int& /*iter*/, const double& Bext, + int /*rankTime*/, int /*rankSpace*/, MPI_Comm& spaceComm) { - static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); @@ -1065,11 +1065,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - if((time_m == 0.0)) { - IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - IpplTimings::stopTimer(dumpData); - } + //if((time_m == 0.0)) { + // IpplTimings::startTimer(dumpData); + // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + // IpplTimings::stopTimer(dumpData); + //} double alpha = -0.5 * dt; double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); Vector_t rmax = rmax_m; @@ -1144,9 +1144,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; - IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - IpplTimings::stopTimer(dumpData); + //IpplTimings::startTimer(dumpData); + //dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //IpplTimings::stopTimer(dumpData); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 4c2fe9e5f..e2419b146 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -419,7 +419,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); - static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef 
initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); @@ -463,8 +463,8 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; //double alpha = 0.05; - //Vector_t alpha = {0.05, 0.05, 0.05}; - Vector_t alpha = {0.5, 0.5, 0.5}; + Vector_t alpha = {0.05, 0.05, 0.05}; + //Vector_t alpha = {0.5, 0.5, 0.5}; Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw ; Vector_t length = rmax - rmin; @@ -814,10 +814,10 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); - IpplTimings::stopTimer(dumpData); + //IpplTimings::startTimer(dumpData); + ////Pcoarse->writeError(Rerror, Perror, it+1); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 3f7d2b587..3733bfca7 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -440,7 +440,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); - static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); static IpplTimings::TimerRef initializeCycles = IpplTimings::getTimer("initializeCycles"); @@ -859,11 +859,11 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - IpplTimings::startTimer(dumpData); + //IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - IpplTimings::stopTimer(dumpData); + //IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 1b0fad70e..434495c91 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -130,11 +130,11 @@ namespace ippl { template <> struct CufinufftType { - std::function makeplan = cufinufftf_makeplan; - std::function setpts = cufinufftf_setpts; - std::function execute = cufinufftf_execute; + std::function makeplan = cufinufftf_makeplan; + std::function setpts = cufinufftf_setpts; + std::function execute = cufinufftf_execute; std::function destroy = cufinufftf_destroy; using complexType = cuFloatComplex; @@ -143,11 +143,11 @@ namespace ippl { template <> struct CufinufftType { - std::function makeplan = cufinufft_makeplan; - std::function setpts = cufinufft_setpts; - std::function execute = cufinufft_execute; + std::function makeplan = cufinufft_makeplan; + std::function setpts = cufinufft_setpts; + std::function execute = cufinufft_execute; std::function destroy = cufinufft_destroy; using complexType = cuDoubleComplex; @@ -388,7 +388,7 @@ namespace ippl { /** setup performs the initialization necessary. 
*/ - void setup(std::array& nmodes, + void setup(std::array& nmodes, const ParameterList& params); detail::CufinufftType nufft_m; diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index 2c36113fa..b87f9f5fb 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -795,7 +795,7 @@ namespace ippl { * where we fill 0. */ - std::array nmodes; + std::array nmodes; const NDIndex& lDom = layout.getLocalNDIndex(); @@ -826,7 +826,7 @@ namespace ippl { */ template void - FFT::setup(std::array& nmodes, + FFT::setup(std::array& nmodes, const ParameterList& params) { @@ -841,8 +841,9 @@ namespace ippl { opts.gpu_kerevalmeth = params.get("gpu_kerevalmeth"); } - int maxbatchsize = 0; //default option. ignored for ntransf = 1 which - // is our case + opts.gpu_maxbatchsize = 0; //default option. ignored for ntransf = 1 which + // is our case + opts.gpu_device_id = (int)(Ippl::Comm->rank() % 4); int iflag; @@ -859,7 +860,7 @@ namespace ippl { //dim in cufinufft is int int dim = static_cast(Dim); ier_m = nufft_m.makeplan(type_m, dim, nmodes.data(), iflag, 1, tol_m, - maxbatchsize, &plan_m, &opts); + &plan_m, &opts); } @@ -884,6 +885,7 @@ namespace ippl { const Layout_t& layout = f.getLayout(); const UniformCartesian& mesh = f.get_mesh(); const Vector& dx = mesh.getMeshSpacing(); + const Vector& origin = mesh.getOrigin(); const auto& domain = layout.getDomain(); Vector Len; Vector N; @@ -947,16 +949,16 @@ namespace ippl { KOKKOS_LAMBDA(const size_t i) { for(size_t d = 0; d < Dim; ++d) { - tempR[d](i) = Rview(i)[d] * (2.0 * pi / Len[d]); + tempR[d](i) = (Rview(i)[d] - origin[d]) * (2.0 * pi / Len[d]); } tempQ(i).x = Qview(i); tempQ(i).y = 0.0; }); - ier_m = nufft_m.setpts(localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, - NULL, NULL, NULL, plan_m); + ier_m = nufft_m.setpts(plan_m, localNp, tempR[0].data(), tempR[1].data(), tempR[2].data(), 0, + NULL, NULL, NULL); - ier_m = nufft_m.execute(tempQ.data(), tempField.data(), plan_m); + ier_m = nufft_m.execute(plan_m, tempQ.data(), tempField.data()); Kokkos::fence(); diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp index 8ffaf6827..f55351db7 100644 --- a/test/FFT/TestNUFFT2.cpp +++ b/test/FFT/TestNUFFT2.cpp @@ -148,14 +148,16 @@ int main(int argc, char *argv[]) { int type = 2; - fft = std::make_unique(layout, type, fftParams); - - Vector_t minU = {-pi, -pi, -pi}; Vector_t maxU = {pi, pi, pi}; size_type nloc = Np/Ippl::Comm->size(); + + fft = std::make_unique(layout, nloc, type, fftParams); + + + const int nghost = field.getNghost(); using mdrange_type = Kokkos::MDRangePolicy>; auto fview = field.getView(); From 1343b2a60f622176d5fa5b4b61f077c6370afd78 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 3 Jan 2024 06:51:10 +0100 Subject: [PATCH 098/117] Tweaks needed for finufft 2.2.0 and stages 2024 --- CMakeLists.txt | 3 ++- alpine/ElectrostaticPIC/ChargedParticles.hpp | 2 +- src/FFT/FFT.hpp | 8 +++++--- src/Types/ViewTypes.h | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f15ec370..f27d10ac4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,8 @@ endif () add_compile_options (-Wall) add_compile_options (-Wunused) add_compile_options (-Wextra) -add_compile_options (-Werror) +#add_compile_options (-Werror) + # allow deprecated functions add_compile_options (-Wno-deprecated-declarations) diff --git a/alpine/ElectrostaticPIC/ChargedParticles.hpp b/alpine/ElectrostaticPIC/ChargedParticles.hpp index 67b8f738f..61730648d 100644 --- 
a/alpine/ElectrostaticPIC/ChargedParticles.hpp +++ b/alpine/ElectrostaticPIC/ChargedParticles.hpp @@ -341,7 +341,7 @@ class ChargedParticles : public ippl::ParticleBase { rhoNorm_m = norm(rho_m); IpplTimings::stopTimer(sumTimer); - dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); + //dumpVTK(rho_m,nr_m[0],nr_m[1],nr_m[2],iteration,hrField[0],hrField[1],hrField[2]); //rho = rho_e - rho_i rho_m = rho_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index b87f9f5fb..acb06d7bd 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -831,7 +831,7 @@ namespace ippl { { cufinufft_opts opts; - ier_m = cufinufft_default_opts(type_m, Dim, &opts); + cufinufft_default_opts(&opts); tol_m = 1e-6; if(!params.get("use_cufinufft_defaults")) { @@ -841,9 +841,11 @@ namespace ippl { opts.gpu_kerevalmeth = params.get("gpu_kerevalmeth"); } - opts.gpu_maxbatchsize = 0; //default option. ignored for ntransf = 1 which + opts.gpu_maxbatchsize = 0; //default option. ignored for ntransf = 1 which // is our case - opts.gpu_device_id = (int)(Ippl::Comm->rank() % 4); + //For Perlmutter since the mask to hide the other GPUs in the node is + //somehow not working there + //opts.gpu_device_id = (int)(Ippl::Comm->rank() % 4); int iflag; diff --git a/src/Types/ViewTypes.h b/src/Types/ViewTypes.h index 7cfc4238d..a8877926d 100644 --- a/src/Types/ViewTypes.h +++ b/src/Types/ViewTypes.h @@ -19,6 +19,7 @@ #define IPPL_VIEW_TYPES_H #include +#include namespace ippl { /** From 1cf99a32015bee0800eaba9e3fada507077f5afa Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 4 Jan 2024 08:24:01 +0100 Subject: [PATCH 099/117] Code modified for using NUFFT of higher tolerance also as coarse propagator --- alpine/PinT/ChargedParticlesPinT.hpp | 121 ++++++++++++++++++++++----- alpine/PinT/LandauDampingPinT.cpp | 20 ++++- src/FFT/FFT.h | 3 + src/Particle/ParticleAttrib.h | 16 ++-- src/Particle/ParticleAttrib.hpp | 26 +++--- 5 files changed, 145 insertions(+), 41 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 0e2d5912b..7c6d6bcad 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -52,6 +52,9 @@ extern const char* TestName; template class ChargedParticlesPinT : public ippl::ParticleBase { public: + + //using nufft_t = typename ippl::FFT; + CxField_t rhoPIF_m; Field_t Sk_m; Field_t rhoPIC_m; @@ -79,6 +82,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { int shapedegree_m; + //nufft_t nufftType1Fine_m,nufftType2Fine_m,nufftType1Coarse_m,nufftType2Coarse_m; + std::shared_ptr> nufftType1Fine_mp,nufftType2Fine_mp,nufftType1Coarse_mp,nufftType2Coarse_mp; + public: ParticleAttrib q; // charge typename ippl::ParticleBase::particle_position_type P; // G(P^(k)_n) @@ -161,20 +167,51 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - void initNUFFT(FieldLayout_t& FLPIF) { - ippl::ParameterList fftParams; + //void initNUFFT(FieldLayout_t& FLPIF, double& tol) { + // ippl::ParameterList fftParams; - fftParams.add("gpu_method", 1); - fftParams.add("gpu_sort", 0); - fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-6); + // fftParams.add("gpu_method", 1); + // fftParams.add("gpu_sort", 0); + // fftParams.add("gpu_kerevalmeth", 1); + // //fftParams.add("tolerance", 1e-6); + // fftParams.add("tolerance", tol); - fftParams.add("use_cufinufft_defaults", false); + // 
fftParams.add("use_cufinufft_defaults", false); - q.initializeNUFFT(FLPIF, 1, fftParams); - E.initializeNUFFT(FLPIF, 2, fftParams); - } + // q.initializeNUFFT(FLPIF, 1, fftParams); + // E.initializeNUFFT(FLPIF, 2, fftParams); + //} + + void initNUFFTs(FieldLayout_t& FLPIF, double& coarseTol, + double& fineTol) { + + ippl::ParameterList fftCoarseParams,fftFineParams; + + fftFineParams.add("gpu_method", 1); + fftFineParams.add("gpu_sort", 0); + fftFineParams.add("gpu_kerevalmeth", 1); + fftFineParams.add("tolerance", fineTol); + + fftCoarseParams.add("gpu_method", 1); + fftCoarseParams.add("gpu_sort", 0); + fftCoarseParams.add("gpu_kerevalmeth", 1); + fftCoarseParams.add("tolerance", coarseTol); + fftFineParams.add("use_cufinufft_defaults", false); + fftCoarseParams.add("use_cufinufft_defaults", false); + + //nufftType1Fine_m = nufft_t(FLPIF, this->getLocalNum(), 1, fftFineParams); + //nufftType2Fine_m = nufft_t(FLPIF, this->getLocalNum(), 2, fftFineParams); + + //nufftType1Coarse_m = nufft_t(FLPIF, this->getLocalNum(), 1, fftCoarseParams); + //nufftType2Coarse_m = nufft_t(FLPIF, this->getLocalNum(), 2, fftCoarseParams); + nufftType1Fine_mp = std::make_shared>(FLPIF, this->getLocalNum(), 1, fftFineParams); + nufftType2Fine_mp = std::make_shared>(FLPIF, this->getLocalNum(), 2, fftFineParams); + + nufftType1Coarse_mp = std::make_shared>(FLPIF, this->getLocalNum(), 1, fftCoarseParams); + nufftType2Coarse_mp = std::make_shared>(FLPIF, this->getLocalNum(), 2, fftCoarseParams); + } + void initializeParareal(ParticleAttrib& Rbegin, ParticleAttrib& Pbegin, ParticleAttrib& Rcoarse, @@ -977,19 +1014,31 @@ class ChargedParticlesPinT : public ippl::ParticleBase { ParticleAttrib& Ptemp, const unsigned int& nt, const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, - MPI_Comm& spaceComm) { + const std::string& propagator, MPI_Comm& spaceComm) { //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); + if(propagator == "Coarse") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm); + } + else if(propagator == "Fine") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm); + } + rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + if(propagator == "Coarse") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q); + } + else if(propagator == "Fine") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q); + } + //gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); q = Q_m / Np_m; @@ -1018,12 +1067,24 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); + if(propagator == "Coarse") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm); + } + else if(propagator == "Fine") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm); + } + //scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - 
gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + if(propagator == "Coarse") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q); + } + else if(propagator == "Fine") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q); + } + //gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); q = Q_m / Np_m; @@ -1047,19 +1108,29 @@ class ChargedParticlesPinT : public ippl::ParticleBase { const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, const unsigned int& /*iter*/, const double& Bext, int /*rankTime*/, int /*rankSpace*/, - MPI_Comm& spaceComm) { + const std::string& propagator, MPI_Comm& spaceComm) { //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); + if(propagator == "Coarse") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm); + } + else if(propagator == "Fine") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm); + } rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + if(propagator == "Coarse") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q); + } + else if(propagator == "Fine") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q); + } q = Q_m / Np_m; @@ -1113,12 +1184,22 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); + if(propagator == "Coarse") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm); + } + else if(propagator == "Fine") { + scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm); + } rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); // Solve for and gather E field - gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + if(propagator == "Coarse") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Coarse_mp.get(), q); + } + else if(propagator == "Fine") { + gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q); + } q = Q_m / Np_m; //kick diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index e2419b146..3a17b804a 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -547,7 +547,12 @@ int main(int argc, char *argv[]){ Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - Pcoarse->initNUFFT(FLPIF); + //Pcoarse->initNUFFT(FLPIF); + double coarseTol = 1e-2; + double fineTol = 1e-6; + Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + std::string coarse = "Coarse"; + std::string fine = "Fine"; #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial @@ -609,7 +614,9 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); + //Pcoarse->initNUFFT(FLPIF, coarseTol); Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); 
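        // Pend currently holds this slice's coarse sweep; it is saved into
        // Pcoarse->R/P (and handed to the next time rank below), which the
        // Parareal correction in the iteration loop then treats as the
        // previous coarse value.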
Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -739,7 +746,8 @@ int main(int argc, char *argv[]){ while (!isConverged) { //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, spaceComm); + //Pcoarse->initNUFFT(FLPIF, fineTol); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, fine, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -778,7 +786,9 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->initNUFFT(FLPIF, coarseTol); + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -862,7 +872,9 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->initNUFFT(FLPIF, coarseTol); + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 434495c91..816ae8e4b 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -368,6 +368,9 @@ namespace ippl { using view_particle_real_type = typename detail::ViewType::view_type; using view_particle_complex_type = typename detail::ViewType::view_type; + + FFT() = default; + /** Create a new FFT object with the layout for the input Field, type * (1 or 2) for the NUFFT and parameters for cuFINUFFT. */ diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index aeb0df5f0..10b391d69 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -176,19 +176,21 @@ namespace ippl { const ParticleAttrib, Properties... >& pp); #ifdef KOKKOS_ENABLE_CUDA - template - void initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams); + //template + //void initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams); template void scatterPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib, Properties... >& pp, + FFT* nufft, const MPI_Comm& spaceComm) const; template void gatherPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib, Properties... >& pp, + FFT* nufft, ParticleAttrib& q); #endif @@ -199,11 +201,11 @@ namespace ippl { private: view_type dview_m; -#ifdef KOKKOS_ENABLE_CUDA - //TODO: Remove hard-coded dimension by having Dim as template - //parameter. Does this need to be in CUDA ifdefs? - std::shared_ptr> fftType_mp; -#endif +//#ifdef KOKKOS_ENABLE_CUDA +// //TODO: Remove hard-coded dimension by having Dim as template +// //parameter. Does this need to be in CUDA ifdefs? 
+// std::shared_ptr> fftType_mp; +//#endif }; } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 93f8620b3..41b11f220 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -493,12 +493,12 @@ namespace ippl { #ifdef KOKKOS_ENABLE_CUDA - template - template - void ParticleAttrib::initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams) { - - fftType_mp = std::make_shared>(layout, *(this->localNum_mp), type, fftParams); - } + //template + //template + //void ParticleAttrib::initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams) { + // + // fftType_mp = std::make_shared>(layout, *(this->localNum_mp), type, fftParams); + //} @@ -506,6 +506,7 @@ namespace ippl { template void ParticleAttrib::scatterPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp, + FFT* nufft, const MPI_Comm& spaceComm) const { @@ -524,7 +525,8 @@ namespace ippl { tempField = 0.0; - fftType_mp->transform(pp, q, tempField); + //fftType_mp->transform(pp, q, tempField); + nufft->transform(pp, q, tempField); //fftType_mp->transform(pp, q, f); @@ -566,6 +568,7 @@ namespace ippl { template void ParticleAttrib::gatherPIFNUFFT(Field& f, Field& Sk, const ParticleAttrib< Vector, Properties... >& pp, + FFT* nufft, ParticleAttrib& q) { static IpplTimings::TimerRef gatherPIFNUFFTTimer = IpplTimings::getTimer("GatherPIFNUFFT"); @@ -631,7 +634,8 @@ namespace ippl { tempview(i, j, k) *= -Skview(i, j, k) * (imag * kVec[gd] * factor); }); - fftType_mp->transform(pp, q, tempField); + //fftType_mp->transform(pp, q, tempField); + nufft->transform(pp, q, tempField); Kokkos::parallel_for("Assign E gather NUFFT", Np, @@ -651,10 +655,11 @@ namespace ippl { inline void scatterPIFNUFFT(const ParticleAttrib& attrib, Field& f, Field& Sk, const ParticleAttrib, Properties...>& pp, + FFT* nufft, const MPI_Comm& spaceComm = MPI_COMM_WORLD) { #ifdef KOKKOS_ENABLE_CUDA - attrib.scatterPIFNUFFT(f, Sk, pp, spaceComm); + attrib.scatterPIFNUFFT(f, Sk, pp, nufft, spaceComm); #else //throw IpplException("scatterPIFNUFFT", "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to // be compiled with CUDA. 
Otherwise use scatterPIFNUDFT."); @@ -665,10 +670,11 @@ namespace ippl { inline void gatherPIFNUFFT(ParticleAttrib& attrib, Field& f, Field& Sk, const ParticleAttrib, Properties...>& pp, + FFT* nufft, ParticleAttrib& q) { #ifdef KOKKOS_ENABLE_CUDA - attrib.gatherPIFNUFFT(f, Sk, pp, q); + attrib.gatherPIFNUFFT(f, Sk, pp, nufft, q); #else //throw IpplException("gatherPIFNUFFT", // "The NUFFT library cuFINUFFT currently only works with CUDA and hence Kokkos needs to From f67d5ae3313ab99a2e0e670db2d0b18a776e737a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 9 Jan 2024 09:32:38 +0100 Subject: [PATCH 100/117] In the middle of trying to understand why particles creation take so much time --- alpine/PinT/ChargedParticlesPinT.hpp | 6 ++- alpine/PinT/LandauDampingPinT.cpp | 7 ++- alpine/PinT/PenningTrapPinT.cpp | 80 ++++++++++++++++++++++++---- 3 files changed, 79 insertions(+), 14 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 7c6d6bcad..235341cfb 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -1136,7 +1136,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - //if((time_m == 0.0)) { + //if((time_m == 0.0) && (propagator == "Fine")) { // IpplTimings::startTimer(dumpData); // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); // IpplTimings::stopTimer(dumpData); @@ -1226,7 +1226,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; //IpplTimings::startTimer(dumpData); - //dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //if(propagator == "Fine") { + // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //} //IpplTimings::stopTimer(dumpData); } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 3a17b804a..e42c13a39 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -615,8 +615,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); //Pcoarse->initNUFFT(FLPIF, coarseTol); - Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); - //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -787,6 +787,9 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(coarsePropagator); //Pcoarse->initNUFFT(FLPIF, coarseTol); + //double coarseTol = (double)(std::pow(0.1,std::min((int)(it+2),3))); + //double fineTol = 1e-6; + //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(coarsePropagator); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 3733bfca7..759027652 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -440,10 +440,13 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); static 
IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); - //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); static IpplTimings::TimerRef initializeCycles = IpplTimings::getTimer("initializeCycles"); + static IpplTimings::TimerRef initialComm = IpplTimings::getTimer("initialComm"); + static IpplTimings::TimerRef initialCoarse = IpplTimings::getTimer("initialCoarse"); + static IpplTimings::TimerRef warmupStep = IpplTimings::getTimer("warmupStep"); IpplTimings::startTimer(mainTimer); @@ -552,7 +555,6 @@ int main(int argc, char *argv[]){ Pcoarse->initFFTSolver(); - IpplTimings::startTimer(particleCreation); Vector_t minU, maxU; for (unsigned d = 0; d initNUFFT(FLPIF); + double coarseTol = 1e-3; + double fineTol = 1e-6; + Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + std::string coarse = "Coarse"; + std::string fine = "Fine"; + + //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + //IpplTimings::startTimer(warmupStep); + //if(rankTime == 0) { + // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); + // Kokkos::parallel_for(nloc, + // generate_random, Dim>( + // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, + // minU, maxU)); + + + // Kokkos::fence(); + //} + //else { + // size_type bufSize = Pbegin->packedSize(nloc); + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); + // Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); + // buf->resetReadPos(); + //} + + //Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); + //Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); + ////Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + //Pcoarse->BorisPIF(Pend->R, Pend->P, 45, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + // + //Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); + //Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); + //if(rankTime < sizeTime-1) { + // size_type bufSize = Pend->packedSize(nloc); + // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); + // MPI_Request request; + // Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); + // buf->resetWritePos(); + // MPI_Wait(&request, MPI_STATUS_IGNORE); + //} + //IpplTimings::stopTimer(warmupStep); + + + + IpplTimings::startTimer(particleCreation); + + //Pcoarse->initNUFFT(FLPIF); #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial @@ -614,6 +665,7 @@ int main(int argc, char *argv[]){ //condition is not the same on different GPUs tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + IpplTimings::startTimer(initialComm); if(rankTime == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); Kokkos::parallel_for(nloc, @@ -630,6 +682,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + 
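        // Every rank except the first blocks here until the previous time rank has
        // finished its coarse sweep and sent the slice boundary state, so this
        // start-up phase is effectively serial across the time ranks; the new
        // initialComm / initialCoarse timers bracket exactly that cost.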
IpplTimings::stopTimer(initialComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); @@ -638,13 +691,17 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + IpplTimings::startTimer(initialCoarse); + //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + IpplTimings::stopTimer(initialCoarse); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(initialComm); if(rankTime < sizeTime-1) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -653,6 +710,7 @@ int main(int argc, char *argv[]){ buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } + IpplTimings::stopTimer(initialComm); #else Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, @@ -776,7 +834,7 @@ int main(int argc, char *argv[]){ //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, - Bext, rankTime, rankSpace, spaceComm); + Bext, rankTime, rankSpace, fine, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -819,7 +877,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + //Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -859,11 +918,11 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - //IpplTimings::startTimer(dumpData); + IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - //IpplTimings::stopTimer(dumpData); + IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); @@ -915,7 +974,8 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); From b417e289b9892177cfcf2204271a1b097f8819b8 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 10 Jan 2024 15:42:15 +0100 Subject: [PATCH 101/117] Reason for initial communication to be expensive identified. 
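[Editorial note, not part of the original commit message] The timers added in the previous patch ("initialComm", "initialCoarse", "warmupStep") bracket the individual start-up phases so that the cost of exchanging the initial particle state along the time communicator shows up as its own entry in the timing report, and this patch reorders the start-up accordingly: particle creation is moved after the shape-function and NUFFT initialization in all three PinT drivers. The bracketing idiom used throughout these files is, schematically (the guarded region is illustrative, and the final report is assumed to be written at the end of main(), e.g. via IpplTimings::print(), outside the hunks shown here):

    static IpplTimings::TimerRef initialComm = IpplTimings::getTimer("initialComm");
    IpplTimings::startTimer(initialComm);
    // ... receive or forward the initial particle state along the time communicator ...
    IpplTimings::stopTimer(initialComm);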
--- alpine/PinT/BumponTailInstabilityPinT.cpp | 33 +++++++---- alpine/PinT/LandauDampingPinT.cpp | 23 ++++---- alpine/PinT/PenningTrapPinT.cpp | 72 ++++++----------------- 3 files changed, 52 insertions(+), 76 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index fe5d4c3fa..0a72b851d 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -574,22 +574,13 @@ int main(int argc, char *argv[]){ //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); - - IpplTimings::startTimer(particleCreation); - + Vector_t minU, maxU; for (unsigned d = 0; d create(nloc); - Pbegin->create(nloc); - Pend->create(nloc); - - Pcoarse->q = Pcoarse->Q_m/Total_particles; - using buffer_type = ippl::Communicate::buffer_type; int tag; @@ -599,8 +590,23 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); + + double coarseTol = 1e-3; + double fineTol = 1e-6; + Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + std::string coarse = "Coarse"; + std::string fine = "Fine"; + + IpplTimings::startTimer(particleCreation); + + Pcoarse->create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); + + Pcoarse->q = Pcoarse->Q_m/Total_particles; + - Pcoarse->initNUFFT(FLPIF); + //Pcoarse->initNUFFT(FLPIF); #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs @@ -665,6 +671,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -800,7 +807,7 @@ int main(int argc, char *argv[]){ while (!isConverged) { //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, spaceComm); + Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, fine, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -843,6 +850,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(coarsePropagator); Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -936,6 +944,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index e42c13a39..bb0b12240 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -522,22 +522,13 @@ int main(int argc, char *argv[]){ //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); Pcoarse->initFFTSolver(); - - 
IpplTimings::startTimer(particleCreation); - + Vector_t minU, maxU; for (unsigned d = 0; d create(nloc); - Pbegin->create(nloc); - Pend->create(nloc); - - Pcoarse->q = Pcoarse->Q_m/Total_particles; - using buffer_type = ippl::Communicate::buffer_type; int tag; @@ -548,12 +539,22 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(initializeShapeFunctionPIF); //Pcoarse->initNUFFT(FLPIF); - double coarseTol = 1e-2; + double coarseTol = 1e-3; double fineTol = 1e-6; Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; + + IpplTimings::startTimer(particleCreation); + + Pcoarse->create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); + + Pcoarse->q = Pcoarse->Q_m/Total_particles; + + #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 759027652..967a284ec 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -563,11 +563,6 @@ int main(int argc, char *argv[]){ } - Pcoarse->create(nloc); - Pbegin->create(nloc); - Pend->create(nloc); - - Pcoarse->q = Pcoarse->Q_m/Total_particles; using buffer_type = ippl::Communicate::buffer_type; int tag; @@ -584,48 +579,15 @@ int main(int argc, char *argv[]){ std::string coarse = "Coarse"; std::string fine = "Fine"; - //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - //IpplTimings::startTimer(warmupStep); - //if(rankTime == 0) { - // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); - // Kokkos::parallel_for(nloc, - // generate_random, Dim>( - // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, - // minU, maxU)); - - - // Kokkos::fence(); - //} - //else { - // size_type bufSize = Pbegin->packedSize(nloc); - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - // Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); - // buf->resetReadPos(); - //} - - //Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); - //Kokkos::deep_copy(Pcoarse->R0.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); - ////Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - //Pcoarse->BorisPIF(Pend->R, Pend->P, 45, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - // - //Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - //if(rankTime < sizeTime-1) { - // size_type bufSize = Pend->packedSize(nloc); - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); - // MPI_Request request; - // Ippl::Comm->isend(rankTime+1, tag, *Pend, *buf, request, nloc, timeComm); - // buf->resetWritePos(); - // MPI_Wait(&request, MPI_STATUS_IGNORE); - //} - //IpplTimings::stopTimer(warmupStep); + IpplTimings::startTimer(particleCreation); + + Pcoarse->create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); + Pcoarse->q = Pcoarse->Q_m/Total_particles; - IpplTimings::startTimer(particleCreation); //Pcoarse->initNUFFT(FLPIF); @@ -692,8 +654,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(initialCoarse); - //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - Pcoarse->BorisPIF(Pend->R, 
Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + //Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(initialCoarse); IpplTimings::startTimer(deepCopy); @@ -797,6 +759,8 @@ int main(int argc, char *argv[]){ int sign = 1; + //coarseTol = 1e-3; + //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); for (unsigned int nc=0; nc < nCycles; nc++) { double tStartMySlice; @@ -877,8 +841,10 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - //Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + //coarseTol = 1e-4;//(double)(std::pow(0.1,std::min((int)(it+2),4))); + //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + //Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -918,11 +884,11 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - IpplTimings::startTimer(dumpData); + //IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - IpplTimings::stopTimer(dumpData); + //IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); @@ -974,8 +940,8 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + //Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); From ccc064f859cbe4e9e12679a36db8ee3b234633c2 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Mon, 29 Jan 2024 15:32:41 +0100 Subject: [PATCH 102/117] Uncommitted changes committed --- alpine/PinT/BumponTailInstabilityPinT.cpp | 36 ++++++++------ alpine/PinT/ChargedParticlesPinT.hpp | 60 ++++++++++++----------- alpine/PinT/LandauDampingPinT.cpp | 43 +++++++++------- alpine/PinT/PenningTrapPinT.cpp | 24 +++++---- 4 files changed, 90 insertions(+), 73 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 0a72b851d..1d3b81545 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -436,7 +436,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); - 
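// [Editor's note, not part of the patch] finePropagator and coarsePropagator time the two halves of
// the parareal iteration these PinT drivers implement: each time slice is advanced from Pbegin with
// the accurate fine propagator, the coarse propagator is then re-run from the corrected slice start,
// and the slice end state Pend is updated with the usual parareal correction, schematically
// U_{n+1}^{k+1} = G(U_n^{k+1}) + F(U_n^k) - G(U_n^k), where the previous coarse result G(U_n^k) is
// held in the RprevIter/PprevIter attributes of ChargedParticlesPinT. The while (!isConverged) loop
// repeats this until the relative errors Rerror and Perror fall below the command-line tolerance.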
//static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); @@ -591,19 +591,25 @@ int main(int argc, char *argv[]){ Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - double coarseTol = 1e-3; - double fineTol = 1e-6; - Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - std::string coarse = "Coarse"; - std::string fine = "Fine"; - IpplTimings::startTimer(particleCreation); Pcoarse->create(nloc); Pbegin->create(nloc); Pend->create(nloc); - + Pcoarse->q = Pcoarse->Q_m/Total_particles; + + IpplTimings::stopTimer(particleCreation); + + + double coarseTol = std::atof(argv[17]); + double fineTol = 1e-12; + Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + std::string coarse = "Coarse"; + std::string fine = "Fine"; + + IpplTimings::startTimer(particleCreation); + //Pcoarse->initNUFFT(FLPIF); @@ -892,13 +898,13 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - //IpplTimings::startTimer(dumpData); - ////Pcoarse->writeError(Rerror, Perror, it+1); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); - ////if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { - ////Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - ////} - //IpplTimings::stopTimer(dumpData); + IpplTimings::startTimer(dumpData); + //Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { + //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); + //} + IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 235341cfb..862b097e3 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -1012,11 +1012,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, - const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, int rankTime, int rankSpace, const std::string& propagator, MPI_Comm& spaceComm) { - //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); @@ -1044,13 +1044,13 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - //if((time_m == 0.0)) { - // IpplTimings::startTimer(dumpData); - // //dumpLandau(iter); - // dumpBumponTail(nc, iter, rankTime, rankSpace); - // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - // IpplTimings::stopTimer(dumpData); - //} + if((time_m == 0.0) && (propagator == "Fine")) { + IpplTimings::startTimer(dumpData); + //dumpLandau(iter); + dumpBumponTail(nc, iter, rankTime, rankSpace); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + IpplTimings::stopTimer(dumpData); + } for (unsigned int it=0; it 
{ time_m += dt; - //IpplTimings::startTimer(dumpData); - ////dumpLandau(iter); - //dumpBumponTail(nc, iter, rankTime, rankSpace); - //dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - //IpplTimings::stopTimer(dumpData); + if(propagator == "Fine") { + IpplTimings::startTimer(dumpData); + //dumpLandau(iter); + dumpBumponTail(nc, iter, rankTime, rankSpace); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + IpplTimings::stopTimer(dumpData); + } } } @@ -1105,12 +1107,12 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, - const unsigned int& /*iter*/, const double& Bext, - int /*rankTime*/, int /*rankSpace*/, + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, const double& Bext, + int rankTime, int rankSpace, const std::string& propagator, MPI_Comm& spaceComm) { - //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); //checkBounds(Rtemp); @@ -1136,11 +1138,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - //if((time_m == 0.0) && (propagator == "Fine")) { - // IpplTimings::startTimer(dumpData); - // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - // IpplTimings::stopTimer(dumpData); - //} + if((time_m == 0.0) && (propagator == "Fine")) { + IpplTimings::startTimer(dumpData); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + IpplTimings::stopTimer(dumpData); + } double alpha = -0.5 * dt; double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); Vector_t rmax = rmax_m; @@ -1225,11 +1227,11 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m += dt; - //IpplTimings::startTimer(dumpData); - //if(propagator == "Fine") { - // dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); - //} - //IpplTimings::stopTimer(dumpData); + if(propagator == "Fine") { + IpplTimings::startTimer(dumpData); + dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + IpplTimings::stopTimer(dumpData); + } } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index bb0b12240..994519005 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -419,7 +419,7 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef deepCopy = IpplTimings::getTimer("deepCopy"); static IpplTimings::TimerRef finePropagator = IpplTimings::getTimer("finePropagator"); static IpplTimings::TimerRef coarsePropagator = IpplTimings::getTimer("coarsePropagator"); - //static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); @@ -537,14 +537,6 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(initializeShapeFunctionPIF); Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - - //Pcoarse->initNUFFT(FLPIF); - double 
coarseTol = 1e-3; - double fineTol = 1e-6; - Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - std::string coarse = "Coarse"; - std::string fine = "Fine"; - IpplTimings::startTimer(particleCreation); @@ -553,6 +545,19 @@ int main(int argc, char *argv[]){ Pend->create(nloc); Pcoarse->q = Pcoarse->Q_m/Total_particles; + + IpplTimings::stopTimer(particleCreation); + + //Pcoarse->initNUFFT(FLPIF); + double coarseTol = std::atof(argv[17]); + double fineTol = 1e-12; + Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + std::string coarse = "Coarse"; + std::string fine = "Fine"; + + + IpplTimings::startTimer(particleCreation); + #ifdef KOKKOS_ENABLE_CUDA @@ -616,8 +621,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); //Pcoarse->initNUFFT(FLPIF, coarseTol); - //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); - Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -791,8 +796,8 @@ int main(int argc, char *argv[]){ //double coarseTol = (double)(std::pow(0.1,std::min((int)(it+2),3))); //double fineTol = 1e-6; //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -828,10 +833,10 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - //IpplTimings::startTimer(dumpData); - ////Pcoarse->writeError(Rerror, Perror, it+1); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); - //IpplTimings::stopTimer(dumpData); + IpplTimings::startTimer(dumpData); + //Pcoarse->writeError(Rerror, Perror, it+1); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); @@ -876,9 +881,9 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); //Pcoarse->initNUFFT(FLPIF, coarseTol); - Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 967a284ec..7cc1af833 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -572,9 +572,18 @@ int main(int argc, char *argv[]){ Pcoarse->initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); + 
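// [Editor's note, not part of the patch] From this patch onward the PinT drivers keep two NUFFT
// plans with different accuracies, built through Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol):
// a loose tolerance for the coarse parareal propagator and a tight one for the fine propagator.
// The coarse tolerance is no longer hard-coded but read from the command line as
// std::atof(argv[17]), following the existing positional-argument layout of these drivers, and
// particle creation now sits in its own timed block before the NUFFT plans are built.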
IpplTimings::startTimer(particleCreation); + + Pcoarse->create(nloc); + Pbegin->create(nloc); + Pend->create(nloc); + + Pcoarse->q = Pcoarse->Q_m/Total_particles; + + IpplTimings::stopTimer(particleCreation); - double coarseTol = 1e-3; - double fineTol = 1e-6; + double coarseTol = std::atof(argv[17]); + double fineTol = 1e-12; Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; @@ -582,11 +591,6 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(particleCreation); - Pcoarse->create(nloc); - Pbegin->create(nloc); - Pend->create(nloc); - - Pcoarse->q = Pcoarse->Q_m/Total_particles; //Pcoarse->initNUFFT(FLPIF); @@ -884,11 +888,11 @@ int main(int argc, char *argv[]){ << " Perror: " << Perror << endl; - //IpplTimings::startTimer(dumpData); + IpplTimings::startTimer(dumpData); //Pcoarse->writeError(Rerror, Perror, it+1); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - //IpplTimings::stopTimer(dumpData); + IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); From 4688555a546b20ada62a3dde3a1186b3f1538973 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 1 Mar 2024 17:03:59 +0100 Subject: [PATCH 103/117] In the middle of cleanup before running speedup studies --- CMakeLists.txt | 5 +- .../BumponTailInstabilityPIF.cpp | 49 +++- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 260 +++++++++--------- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 54 +++- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 52 +++- alpine/PinT/BumponTailInstabilityPinT.cpp | 14 +- alpine/PinT/ChargedParticlesPinT.hpp | 91 +++++- alpine/PinT/LandauDampingPinT.cpp | 191 +++++-------- alpine/PinT/PenningTrapPinT.cpp | 32 ++- src/FFT/FFT.hpp | 4 +- 10 files changed, 449 insertions(+), 303 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f27d10ac4..db77c210f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,11 +21,14 @@ endif () add_compile_options (-Wall) add_compile_options (-Wunused) add_compile_options (-Wextra) -#add_compile_options (-Werror) +add_compile_options (-Werror) # allow deprecated functions add_compile_options (-Wno-deprecated-declarations) +add_compile_options (-Wno-stringop-overflow) +add_compile_options (-Wno-array-bounds) +add_compile_options (-Wno-restrict) option (USE_STATIC_LIBRARIES "Link with static libraries if available" ON) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index dcd059bdf..3ef320c57 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -1,6 +1,6 @@ // Electrostatic Two-stream/Bump-on-tail instability test with Particle-in-Fourier schemes // Usage: -// srun ./BumponTailInstabilityPIF
<nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> --info 5 +// srun ./BumponTailInstabilityPIF <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol>
--info 5 // nx = No. of Fourier modes in the x-direction // ny = No. of Fourier modes in the y-direction // nz = No. of Fourier modes in the z-direction @@ -9,8 +9,9 @@ // dt = Time stepsize // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) +// tol = tolerance of NUFFT // Example: -// srun ./BumponTailInstabilityPIF 32 32 32 655360 20 0.05 B-spline 1 --info 5 +// srun ./BumponTailInstabilityPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5 // // Copyright (c) 2023, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -262,6 +263,42 @@ int main(int argc, char *argv[]){ P->rho_m.initialize(mesh, FL); P->Sk_m.initialize(mesh, FL); + //////////////////////////////////////////////////////////// + //Initialize an FFT object for getting rho in real space and + //doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for(unsigned d = 0; d < Dim; ++d) { + if(fftParams.template get("r2c_direction") == (int)d) + domainPIFhalf[d] = ippl::Index(domain[d].length()/2 + 1); + else + domainPIFhalf[d] = ippl::Index(domain[d].length()); + } + + + FieldLayout_t FLPIFhalf(domainPIFhalf, decomp); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + P->rhoPIFreal_m.initialize(mesh, FL); + P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + + P->fft_mp = std::make_shared(FL, FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// + + P->time_m = 0.0; P->shapetype_m = argv[7]; @@ -269,11 +306,7 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(particleCreation); - //typedef ippl::detail::RegionLayout RegionLayout_t; - //const RegionLayout_t& RLayout = PL.getRegionLayout(); - //const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); Vector_t minU, maxU; - //int myRank = Ippl::Comm->rank(); for (unsigned d = 0; d initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - P->initNUFFT(FL); + + double tol = std::atof(argv[9]); + P->initNUFFT(FL,tol); P->scatter(); diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index b271372bc..bb762d408 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -42,16 +42,58 @@ typedef Field Field_t; typedef Field, Dim> CxField_t; typedef Field VField_t; +typedef ippl::FFT FFT_t; const double pi = std::acos(-1.0); // Test programs have to define this variable for VTK dump purposes extern const char* TestName; +void dumpVTK(Field_t& rho, int nx, int ny, int nz, int iteration, + double dx, double dy, double dz) { + + typename Field_t::view_type::host_mirror_type host_view = rho.getHostMirror(); + + std::stringstream fname; + fname << "data/scalar_"; + fname << std::setw(4) << std::setfill('0') << iteration; + fname << ".vtk"; + + Kokkos::deep_copy(host_view, rho.getView()); + + Inform vtkout(NULL, fname.str().c_str(), Inform::OVERWRITE); + vtkout.precision(10); + vtkout.setf(std::ios::scientific, std::ios::floatfield); + + // start with header + vtkout << "# vtk DataFile Version 2.0" << 
endl; + vtkout << TestName << endl; + vtkout << "ASCII" << endl; + vtkout << "DATASET STRUCTURED_POINTS" << endl; + vtkout << "DIMENSIONS " << nx+3 << " " << ny+3 << " " << nz+3 << endl; + vtkout << "ORIGIN " << -dx << " " << -dy << " " << -dz << endl; + vtkout << "SPACING " << dx << " " << dy << " " << dz << endl; + vtkout << "CELL_DATA " << (nx+2)*(ny+2)*(nz+2) << endl; + + vtkout << "SCALARS Rho float" << endl; + vtkout << "LOOKUP_TABLE default" << endl; + for (int z=0; z class ChargedParticlesPIF : public ippl::ParticleBase { public: CxField_t rho_m; + CxField_t rhoPIFhalf_m; + Field_t rhoPIFreal_m; CxField_t rhoDFT_m; Field_t Sk_m; @@ -74,7 +116,9 @@ class ChargedParticlesPIF : public ippl::ParticleBase { std::string shapetype_m; int shapedegree_m; + std::shared_ptr fft_mp; + std::shared_ptr> nufftType1_mp,nufftType2_mp; public: ParticleAttrib q; // charge @@ -124,29 +168,29 @@ class ChargedParticlesPIF : public ippl::ParticleBase { setBCAllPeriodic(); } - void initNUFFT(FieldLayout_t& FL) { + void initNUFFT(FieldLayout_t& FL, double& tol) { ippl::ParameterList fftParams; fftParams.add("gpu_method", 1); fftParams.add("gpu_sort", 0); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-6); + fftParams.add("tolerance", tol); fftParams.add("use_cufinufft_defaults", false); //fftParams.add("use_cufinufft_defaults", true); - q.initializeNUFFT(FL, 1, fftParams); - E.initializeNUFFT(FL, 2, fftParams); + nufftType1_mp = std::make_shared>(FL, this->getLocalNum(), 1, fftParams); + nufftType2_mp = std::make_shared>(FL, this->getLocalNum(), 2, fftParams); } void gather() { - gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, this->q); + gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, nufftType2_mp.get(), q); //gatherPIFNUDFT(this->E, rho_m, Sk_m, this->R); //Set the charge back to original as we used this view as a //temporary buffer during gather - this->q = Q_m / Np_m; + q = Q_m / Np_m; } @@ -154,7 +198,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Inform m("scatter "); rho_m = {0.0, 0.0}; - scatterPIFNUFFT(q, rho_m, Sk_m, this->R); + scatterPIFNUFFT(q, rho_m, Sk_m, this->R, nufftType1_mp.get()); //rhoDFT_m = {0.0, 0.0}; //scatterPIFNUDFT(q, rho_m, Sk_m, this->R); @@ -205,8 +249,6 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -231,32 +273,6 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); fieldEnergy *= volume; - //auto Eview = E.getView(); - - //double fieldEnergy, ExAmp; - //double temp = 0.0; - - //Kokkos::parallel_reduce("Ex energy", this->getLocalNum(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = Eview(i)[0] * Eview(i)[0]; - // valL += myVal; - // }, Kokkos::Sum(temp)); - - //double globaltemp = 0.0; - //MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //fieldEnergy = globaltemp * volume / totalP ; - - //double tempMax = 0.0; - //Kokkos::parallel_reduce("Ex max norm", this->getLocalNum(), - // KOKKOS_LAMBDA(const size_t i, double& valL) - // { - // double myVal = std::fabs(Eview(i)[0]); - // if(myVal > valL) valL = myVal; - // }, Kokkos::Max(tempMax)); - //ExAmp = 
0.0; - //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - if (Ippl::Comm->rank() == 0) { std::stringstream fname; @@ -378,16 +394,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double potentialEnergy, kineticEnergy; double temp = 0.0; - - //auto Eview = E.getView(); - //Kokkos::parallel_reduce("Potential energy", this->getLocalNum(), - // KOKKOS_LAMBDA(const int i, double& valL){ - // double myVal = dot(Eview(i), Eview(i)).apply(); - // valL += myVal; - // }, Kokkos::Sum(temp)); - - - + auto rhoview = rho_m.getView(); const int nghost = rho_m.getNghost(); using mdrange_type = Kokkos::MDRangePolicy>; @@ -422,8 +429,6 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -438,14 +443,6 @@ class ChargedParticlesPIF : public ippl::ParticleBase { myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); } - //double myVal = rhoview(i,j,k).real() * rhoview(i,j,k).real() + - // rhoview(i,j,k).imag() * rhoview(i,j,k).imag(); - //if(Dr != 0.0) { - // myVal /= Dr; - //} - //else { - // myVal = 0.0; - //} valL += myVal; }, Kokkos::Sum(temp)); @@ -472,6 +469,77 @@ class ChargedParticlesPIF : public ippl::ParticleBase { kineticEnergy = globaltemp; + auto rhoPIFhalfview = rhoPIFhalf_m.getView(); + const int nghostHalf = rhoPIFhalf_m.getNghost(); + + const FieldLayout_t& layoutHalf = rhoPIFhalf_m.getLayout(); + const auto& domainHalf = layoutHalf.getDomain(); + + Vector Nhalf; + for (unsigned d=0; d < Dim; ++d) { + Nhalf[d] = domainHalf[d].length(); + } + + Kokkos::parallel_for("Transfer complex rho to half domain", + mdrange_type({0, 0, 0}, + {Nhalf[0], + Nhalf[1], + Nhalf[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k) + { + Vector iVec = {i, j, k}; + int shift; + for(size_t d = 0; d < Dim; ++d) { + bool isLessThanHalf = (iVec[d] < (Nhalf[d]/2)); + shift = ((int)isLessThanHalf * 2) - 1; + iVec[d] = (iVec[d] + shift * (Nhalf[d]/2)) + nghostHalf; + } + rhoPIFhalfview(Nhalf[0]-1-i+nghostHalf, iVec[1], iVec[2]) = rhoview(i+nghostHalf,j+nghostHalf,k+nghostHalf); + }); + + + rhoPIFreal_m = 0.0; + fft_mp->transform(-1, rhoPIFreal_m, rhoPIFhalf_m); + + + rhoPIFreal_m = (1.0/(nr_m[0]*nr_m[1]*nr_m[2])) * volume * rhoPIFreal_m; + auto rhoPIFrealview = rhoPIFreal_m.getView(); + temp = 0.0; + Kokkos::parallel_reduce("Rho real sum", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& valL) + { + + valL += rhoPIFrealview(i+nghost, j+nghost, k+nghost); + }, Kokkos::Sum(temp)); + + double charge = temp; + + Vector_t totalMomentum = 0.0; + + Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, Vector_t& valL){ + valL += (-qView(i)) * Pview(i); + }, Kokkos::Sum>(totalMomentum)); + + Vector_t globalMom; + + double magMomentum = 0.0; + for(size_t d = 0; d < Dim; ++d) { + MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + magMomentum += globalMom[d] * globalMom[d]; + } + + magMomentum = std::sqrt(magMomentum); + if (Ippl::Comm->rank() == 0) { std::stringstream fname; fname << "data/Energy_"; @@ -480,17 +548,19 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); + csvout.precision(17); 
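// [Editor's note, not part of the patch] The extra columns written below are conservation
// diagnostics added by this patch: charge is the real-space density, obtained by passing the
// spectral rho through the half-domain r2c FFT above and inverse-transforming it, summed over the
// grid (the volume / number-of-modes factor turns that sum into a total charge, which should stay
// equal to Q_m), and magMomentum is the magnitude of the globally reduced particle momentum,
// reported as a further conservation check.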
csvout.setf(std::ios::scientific, std::ios::floatfield); if(time_m == 0.0) { - csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; + csvout << "time, Potential energy, Kinetic energy, Total energy Total charge Total Momentum" << endl; } csvout << time_m << " " << potentialEnergy << " " << kineticEnergy << " " - << potentialEnergy + kineticEnergy << endl; + << potentialEnergy + kineticEnergy << " " + << charge << " " + << magMomentum << endl; } @@ -529,8 +599,6 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); double kh = kVec[d] * dx[d]; bool isNotZero = (kh != 0.0); @@ -551,75 +619,15 @@ class ChargedParticlesPIF : public ippl::ParticleBase { } - //void dumpBumponTail() { - - // const int nghostE = E_m.getNghost(); - // auto Eview = E_m.getView(); - // double fieldEnergy, EzAmp; - // using mdrange_type = Kokkos::MDRangePolicy>; - - // double temp = 0.0; - // Kokkos::parallel_reduce("Ex inner product", - // mdrange_type({nghostE, nghostE, nghostE}, - // {Eview.extent(0) - nghostE, - // Eview.extent(1) - nghostE, - // Eview.extent(2) - nghostE}), - // KOKKOS_LAMBDA(const size_t i, const size_t j, - // const size_t k, double& valL) - // { - // double myVal = std::pow(Eview(i, j, k)[2], 2); - // valL += myVal; - // }, Kokkos::Sum(temp)); - // double globaltemp = 0.0; - // MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - // fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; - - // double tempMax = 0.0; - // Kokkos::parallel_reduce("Ex max norm", - // mdrange_type({nghostE, nghostE, nghostE}, - // {Eview.extent(0) - nghostE, - // Eview.extent(1) - nghostE, - // Eview.extent(2) - nghostE}), - // KOKKOS_LAMBDA(const size_t i, const size_t j, - // const size_t k, double& valL) - // { - // double myVal = std::fabs(Eview(i, j, k)[2]); - // if(myVal > valL) valL = myVal; - // }, Kokkos::Max(tempMax)); - // EzAmp = 0.0; - // MPI_Reduce(&tempMax, &EzAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - - // if (Ippl::Comm->rank() == 0) { - // std::stringstream fname; - // fname << "data/FieldBumponTail_"; - // fname << Ippl::Comm->size(); - // fname << ".csv"; - - - // Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - // csvout.precision(10); - // csvout.setf(std::ios::scientific, std::ios::floatfield); - - // if(time_m == 0.0) { - // csvout << "time, Ez_field_energy, Ez_max_norm" << endl; - // } - - // csvout << time_m << " " - // << fieldEnergy << " " - // << EzAmp << endl; - - // } - // - // Ippl::Comm->barrier(); - //} void dumpFieldData() { typename CxField_t::HostMirror rhoNUFFT_host = rho_m.getHostMirror(); - typename CxField_t::HostMirror rhoNUDFT_host = rhoDFT_m.getHostMirror(); + typename Field_t::HostMirror rhoNUFFT_real = rhoPIFreal_m.getHostMirror(); + //typename CxField_t::HostMirror rhoNUDFT_host = rhoDFT_m.getHostMirror(); Kokkos::deep_copy(rhoNUFFT_host, rho_m.getView()); - Kokkos::deep_copy(rhoNUDFT_host, rhoDFT_m.getView()); + Kokkos::deep_copy(rhoNUFFT_real, rhoPIFreal_m.getView()); + //Kokkos::deep_copy(rhoNUDFT_host, rhoDFT_m.getView()); const int nghost = rho_m.getNghost(); std::stringstream pname; pname << "data/FieldFFT_"; @@ -637,7 +645,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { } } std::stringstream pname2; - pname2 << "data/FieldDFT_"; + pname2 << "data/Fieldreal_"; pname2 
<< Ippl::Comm->rank(); pname2 << ".csv"; Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); @@ -647,7 +655,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { for (int i = 0; i< nr_m[0]; i++) { for (int j = 0; j< nr_m[1]; j++) { for (int k = 0; k< nr_m[2]; k++) { - pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; + pcsvout2 << rhoNUFFT_real(i+nghost,j+nghost, k+nghost) << endl; } } } diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index 2373d3f15..fe5e8b68c 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -1,6 +1,6 @@ // Electrostatic Landau damping test with Particle-in-Fourier schemes // Usage: -// srun ./LandauDampingPIF
<nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> --info 5 +// srun ./LandauDampingPIF <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol>
--info 5 // nx = No. of Fourier modes in the x-direction // ny = No. of Fourier modes in the y-direction // nz = No. of Fourier modes in the z-direction @@ -9,8 +9,9 @@ // dt = Time stepsize // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) +// tol = tolerance of NUFFT // Example: -// srun ./LandauDampingPIF 32 32 32 655360 20 0.05 B-spline 1 --info 5 +// srun ./LandauDampingPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -200,10 +201,7 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; - //Vector_t kw = {1.0, 1.0, 1.0}; double alpha = 0.05; - //Vector_t rmin(-pi); - //Vector_t rmax(pi); Vector_t rmin(0.0); Vector_t rmax = 2 * pi / kw; Vector_t length = rmax - rmin; @@ -221,7 +219,6 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; - //double Q = -64.0 * pi * pi * pi; P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,Total_particles); P->nr_m = nr; @@ -230,6 +227,42 @@ int main(int argc, char *argv[]){ P->rhoDFT_m.initialize(mesh, FL); P->Sk_m.initialize(mesh, FL); + //////////////////////////////////////////////////////////// + //Initialize an FFT object for getting rho in real space and + //doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for(unsigned d = 0; d < Dim; ++d) { + if(fftParams.template get("r2c_direction") == (int)d) + domainPIFhalf[d] = ippl::Index(domain[d].length()/2 + 1); + else + domainPIFhalf[d] = ippl::Index(domain[d].length()); + } + + + FieldLayout_t FLPIFhalf(domainPIFhalf, decomp); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + P->rhoPIFreal_m.initialize(mesh, FL); + P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + + P->fft_mp = std::make_shared(FL, FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// + + P->time_m = 0.0; P->shapetype_m = argv[7]; @@ -237,16 +270,10 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(particleCreation); - //typedef ippl::detail::RegionLayout RegionLayout_t; - //const RegionLayout_t& RLayout = PL.getRegionLayout(); - //const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); Vector_t minU, maxU; - //int myRank = Ippl::Comm->rank(); for (unsigned d = 0; d initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - P->initNUFFT(FL); + double tol = std::atof(argv[9]); + P->initNUFFT(FL,tol); P->scatter(); diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index f70ddaa08..54984352e 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -1,6 +1,6 @@ // Electrostatic Penning trap test with Particle-in-Fourier schemes // Usage: -// srun ./PenningTrapPIF
<nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> --info 5 +// srun ./PenningTrapPIF <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol>
--info 5 // nx = No. of Fourier modes in the x-direction // ny = No. of Fourier modes in the y-direction // nz = No. of Fourier modes in the z-direction @@ -9,8 +9,9 @@ // dt = Time stepsize // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) +// tol = tolerance of NUFFT // Example: -// srun ./PenningTrapPIF 32 32 32 655360 20 0.05 B-spline 1 --info 5 +// srun ./PenningTrapPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5 // // Copyright (c) 2023, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -191,7 +192,6 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t rmin(0.0); - //Vector_t rmax(20.0); Vector_t rmax(25.0); double dx = rmax[0] / nr[0]; double dy = rmax[1] / nr[1]; @@ -228,6 +228,42 @@ int main(int argc, char *argv[]){ P->rho_m.initialize(mesh, FL); P->Sk_m.initialize(mesh, FL); + //////////////////////////////////////////////////////////// + //Initialize an FFT object for getting rho in real space and + //doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for(unsigned d = 0; d < Dim; ++d) { + if(fftParams.template get("r2c_direction") == (int)d) + domainPIFhalf[d] = ippl::Index(domain[d].length()/2 + 1); + else + domainPIFhalf[d] = ippl::Index(domain[d].length()); + } + + + FieldLayout_t FLPIFhalf(domainPIFhalf, decomp); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + P->rhoPIFreal_m.initialize(mesh, FL); + P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + + P->fft_mp = std::make_shared(FL, FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// + + P->time_m = 0.0; P->shapetype_m = argv[7]; @@ -235,9 +271,6 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(particleCreation); - //typedef ippl::detail::RegionLayout RegionLayout_t; - //const RegionLayout_t& RLayout = PL.getRegionLayout(); - //const typename RegionLayout_t::host_mirror_type Regions = RLayout.gethLocalRegions(); Vector_t minU, maxU; for (unsigned d = 0; d initializeShapeFunctionPIF(); IpplTimings::stopTimer(initializeShapeFunctionPIF); - P->initNUFFT(FL); + double tol = std::atof(argv[9]); + P->initNUFFT(FL,tol); P->scatter(); P->gather(); IpplTimings::startTimer(dumpDataTimer); - P->dumpEnergy(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); double alpha = -0.5 * dt; @@ -353,7 +387,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - P->dumpEnergy(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 1d3b81545..8bf8547f9 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -603,7 +603,7 @@ int main(int argc, char *argv[]){ double coarseTol = std::atof(argv[17]); - double fineTol = 1e-12; + double fineTol = 1e-3;//1e-12; Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; @@ -676,8 
+676,8 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); - //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -855,8 +855,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - //Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -949,8 +949,8 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 862b097e3..9e6abddb5 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -44,6 +44,8 @@ typedef Field, Dim> CxField_t; typedef Field VField_t; typedef ippl::FFTPeriodicPoissonSolver Solver_t; +typedef ippl::FFT FFT_t; + const double pi = std::acos(-1.0); // Test programs have to define this variable for VTK dump purposes @@ -56,6 +58,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { //using nufft_t = typename ippl::FFT; CxField_t rhoPIF_m; + CxField_t rhoPIFhalf_m; + Field_t rhoPIFreal_m; Field_t Sk_m; Field_t rhoPIC_m; VField_t EfieldPIC_m; @@ -75,6 +79,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { size_type Np_m; std::shared_ptr solver_mp; + std::shared_ptr fft_mp; double time_m; @@ -96,6 +101,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { typename ippl::ParticleBase::particle_position_type RprevIter; // G(R^(k-1)_n) typename ippl::ParticleBase::particle_position_type PprevIter; // G(P^(k-1)_n) + //typename ippl::ParticleBase::particle_position_type Rfine; + //typename ippl::ParticleBase::particle_position_type Pfine; + /* This constructor is mandatory for all derived classes from ParticleBase as the bunch buffer uses this @@ -111,6 +119,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { this->addAttribute(P0); this->addAttribute(RprevIter); this->addAttribute(PprevIter); + //this->addAttribute(Rfine); + //this->addAttribute(Pfine); } ChargedParticlesPinT(PLayout& pl, @@ -135,6 +145,8 @@ class ChargedParticlesPinT : 
public ippl::ParticleBase { this->addAttribute(P0); this->addAttribute(RprevIter); this->addAttribute(PprevIter); + //this->addAttribute(Rfine); + //this->addAttribute(Pfine); setupBCs(); for (unsigned int i = 0; i < Dim; i++) decomp_m[i]=decomp[i]; @@ -624,7 +636,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), KOKKOS_LAMBDA(const int i, double& valL){ double myVal = dot(Pview(i), Pview(i)).apply(); - myVal *= -qView(i); + myVal *= -qView(i); //q/(q/m) where q/m=-1 for us valL += myVal; }, Kokkos::Sum(temp)); @@ -635,6 +647,77 @@ class ChargedParticlesPinT : public ippl::ParticleBase { kineticEnergy = globaltemp; + auto rhoPIFhalfview = rhoPIFhalf_m.getView(); + const int nghostHalf = rhoPIFhalf_m.getNghost(); + + const FieldLayout_t& layoutHalf = rhoPIFhalf_m.getLayout(); + const auto& domainHalf = layoutHalf.getDomain(); + + Vector Nhalf; + for (unsigned d=0; d < Dim; ++d) { + Nhalf[d] = domainHalf[d].length(); + } + + Kokkos::parallel_for("Transfer complex rho to half domain", + mdrange_type({0, 0, 0}, + {Nhalf[0], + Nhalf[1], + Nhalf[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k) + { + Vector iVec = {i, j, k}; + int shift; + for(size_t d = 0; d < Dim; ++d) { + bool isLessThanHalf = (iVec[d] < (Nhalf[d]/2)); + shift = ((int)isLessThanHalf * 2) - 1; + iVec[d] = (iVec[d] + shift * (Nhalf[d]/2)) + nghostHalf; + } + rhoPIFhalfview(Nhalf[0]-1-i+nghostHalf, iVec[1], iVec[2]) = + rhoview(i+nghostHalf,j+nghostHalf,k+nghostHalf); + }); + + + rhoPIFreal_m = 0.0; + fft_mp->transform(-1, rhoPIFreal_m, rhoPIFhalf_m); + + rhoPIFreal_m = (1.0/(N[0]*N[1]*N[2])) * volume * rhoPIFreal_m; + auto rhoPIFrealview = rhoPIFreal_m.getView(); + temp = 0.0; + Kokkos::parallel_reduce("Rho real sum", + mdrange_type({0, 0, 0}, + {N[0], + N[1], + N[2]}), + KOKKOS_LAMBDA(const int i, + const int j, + const int k, + double& valL) + { + valL += rhoPIFrealview(i+nghost, j+nghost, k+nghost); + }, Kokkos::Sum(temp)); + + + double chargeTotal = temp; + + Vector_t totalMomentum = 0.0; + + Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, Vector_t& valL){ + valL += (-qView(i)) * Pview(i); + }, Kokkos::Sum>(totalMomentum)); + + Vector_t globalMom; + + double magMomentum = 0.0; + for(size_t d = 0; d < Dim; ++d) { + MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, spaceComm); + magMomentum += globalMom[d] * globalMom[d]; + } + + magMomentum = std::sqrt(magMomentum); + if(rankSpace == 0) { std::stringstream fname; fname << "data/Energy_rank_"; @@ -647,7 +730,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); + csvout.precision(17); csvout.setf(std::ios::scientific, std::ios::floatfield); //csvout << "time, Potential energy, Kinetic energy, Total energy" << endl; @@ -655,7 +738,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { csvout << time_m << " " << potentialEnergy << " " << kineticEnergy << " " - << potentialEnergy + kineticEnergy << endl; + << potentialEnergy + kineticEnergy << " " + << chargeTotal << " " + << magMomentum << endl; } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 994519005..a522f7824 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -39,8 +39,6 @@ #include "ChargedParticlesPinT.hpp" #include "StatesBeginSlice.hpp" #include "StatesEndSlice.hpp" 
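// [Editor's sketch, not part of the patch] The error helpers changed below (computeRL2Error,
// computeRLinfError) all unwrap position differences across the periodic boundary before forming
// norms; as the added comments explain, the hard-coded thresholds (e.g. 10.0) are ad-hoc values
// tied to the domain length. Distilled into a standalone helper (the helper name is illustrative),
// the per-component correction is:
inline double unwrapPeriodicDiff(double diff, double boxLength, double threshold) {
    // A particle that crossed the periodic boundary between two iterates differs by roughly
    // +/- boxLength even when the physical error is small, so shift the difference back into
    // the primary image before measuring it.
    if (diff <= -threshold) { diff += boxLength; }
    if (diff >= threshold)  { diff -= boxLength; }
    return diff;
}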
-//#include "LeapFrogPIC.cpp" -//#include "LeapFrogPIF.cpp" #include #include #include @@ -156,6 +154,11 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + //This is just to undo the effect of periodic BCs during the + //error calculation. Otherwise even though the actual error is + //small the computed error might be very large. + //The values (e.g. 10) mentioned here are just an adhoc + //value depending on the domain length. for (unsigned d = 0; d < 3; ++d) { bool isLeft = (diff[d] <= -10.0); bool isRight = (diff[d] >= 10.0); @@ -211,8 +214,7 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp } double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - Vector_t& length) { + Vector_t& length, MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -223,6 +225,11 @@ double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + //This is just to undo the effect of periodic BCs during the + //error calculation. Otherwise even though the actual error is + //small the computed error might be very large. + //The values (e.g. 10) mentioned here are just an adhoc + //value depending on the domain length. for (unsigned d = 0; d < 3; ++d) { bool isLeft = (diff[d] <= -10.0); bool isRight = (diff[d] >= 10.0); @@ -235,33 +242,30 @@ double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& myValError = std::sqrt(myValError); - //bool isIncluded = (myValError < 10.0); - - //myValError *= isIncluded; - if(myValError > valLError) valLError = myValError; double myValnorm = dot(Qview(i), Qview(i)).apply(); myValnorm = std::sqrt(myValnorm); - //myValnorm *= isIncluded; - if(myValnorm > valLnorm) valLnorm = myValnorm; - //excluded += (!isIncluded); }, Kokkos::Max(localError), Kokkos::Max(localNorm)); Kokkos::fence(); - lError = localError/localNorm; - double relError = lError; + double globalError = 0.0; + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_MAX, spaceComm); + double globalNorm = 0.0; + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_MAX, spaceComm); + + double relError = globalError/globalNorm; return relError; } double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { + MPI_Comm& spaceComm) { auto Qview = Q.getView(); auto QprevIterView = QprevIter.getView(); @@ -283,97 +287,17 @@ double computePLinfError(ParticleAttrib& Q, ParticleAttrib& }, Kokkos::Max(localError), Kokkos::Max(localNorm)); Kokkos::fence(); - lError = localError/localNorm; - - double relError = lError; - - return relError; - -} - - -double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { - - auto rhoview = rhoPIF.getView(); - auto rhoprevview = rhoPIFprevIter.getView(); - const int nghost = rhoPIF.getNghost(); - using mdrange_type = Kokkos::MDRangePolicy>; - - const FieldLayout_t& layout = rhoPIF.getLayout(); - const Mesh_t& mesh = rhoPIF.get_mesh(); - const Vector& dx = mesh.getMeshSpacing(); - const auto& domain = layout.getDomain(); - Vector Len; - Vector N; - - for (unsigned d=0; d < Dim; ++d) { - N[d] = domain[d].length(); - Len[d] = dx[d] * N[d]; - } - - double AbsError = 0.0; - double Enorm = 0.0; - Kokkos::complex imag = {0.0, 
1.0}; - double pi = std::acos(-1.0); - Kokkos::parallel_reduce("Ex field error", - mdrange_type({0, 0, 0}, - {N[0], - N[1], - N[2]}), - KOKKOS_LAMBDA(const int i, - const int j, - const int k, - double& errorSum, - double& fieldSum) - { - - Vector iVec = {i, j, k}; - Vector kVec; - double Dr = 0.0; - for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - Dr += kVec[d] * kVec[d]; - } - - double myError = 0.0; - double myField = 0.0; - Kokkos::complex Ek = {0.0, 0.0}; - Kokkos::complex Ekprev = {0.0, 0.0}; - for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) { - Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); - } - Ekprev = Ekprev - Ek; - myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); - myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); - } - errorSum += myError; - fieldSum += myField; - //Kokkos::complex rhok = rhoview(i+nghost,j+nghost,k+nghost); - //Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); - //rhokprev = rhokprev - rhok; - //myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); - //errorSum += myError; - //myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); - //fieldSum += myField; - - }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); - Kokkos::fence(); double globalError = 0.0; - MPI_Allreduce(&AbsError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_MAX, spaceComm); double globalNorm = 0.0; - MPI_Allreduce(&Enorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //fieldEnergy *= volume; - - double relError = std::sqrt(globalError)/std::sqrt(globalNorm); - + MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_MAX, spaceComm); + + double relError = globalError/globalNorm; + return relError; -} +} const char* TestName = "LandauDampingPinT"; @@ -436,9 +360,6 @@ int main(int argc, char *argv[]){ const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); - //const double tStartMySlice = Ippl::Comm->rank() * dtSlice; - //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; - using bunch_type = ChargedParticlesPinT; using states_begin_type = StatesBeginSlice; @@ -462,7 +383,6 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; - //double alpha = 0.05; Vector_t alpha = {0.05, 0.05, 0.05}; //Vector_t alpha = {0.5, 0.5, 0.5}; Vector_t rmin(0.0); @@ -493,15 +413,6 @@ int main(int argc, char *argv[]){ size_type Total_particles = 0; - //MPI_Allreduce(&nloc, &Total_particles, 1, - // MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); - - //int rest = (int) (totalP - Total_particles); - - //if ( (rankTime == 0) && (rankSpace < rest) ) { - // ++nloc; - //} - MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); @@ -516,10 +427,44 @@ int main(int argc, char *argv[]){ Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); - //Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); - //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); + + + 
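[Note] The periodic-BC correction in the R error norms above relies on a hand-tuned threshold (the ±10, chosen for this domain length) to detect particles that were wrapped around the box between parareal iterations. A threshold-free sketch of the same idea is the usual minimum-image convention, which folds each component of the difference back into [-L/2, L/2]; length[d] plays the role of L as in the surrounding code, the helper itself is only illustrative:

#include <Kokkos_Core.hpp>

// Fold one component of a position difference back into [-L/2, L/2], so a
// particle wrapped by the periodic BCs does not show up as an O(L) error.
// Assumes |diff| < 1.5*L, which holds when both positions lie inside the box.
KOKKOS_INLINE_FUNCTION double minimumImage(double diff, double L) {
    if (diff >  0.5 * L) diff -= L;
    if (diff < -0.5 * L) diff += L;
    return diff;
}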
//////////////////////////////////////////////////////////// + //Initialize an FFT object for getting rho in real space and + //doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for(unsigned d = 0; d < Dim; ++d) { + if(fftParams.template get("r2c_direction") == (int)d) + domainPIFhalf[d] = ippl::Index(domainPIF[d].length()/2 + 1); + else + domainPIFhalf[d] = ippl::Index(domainPIF[d].length()); + } + + + FieldLayout_t FLPIFhalf(domainPIFhalf, decomp); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + Pcoarse->rhoPIFreal_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + + Pcoarse->fft_mp = std::make_shared(FLPIF, FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// Pcoarse->initFFTSolver(); @@ -550,7 +495,7 @@ int main(int argc, char *argv[]){ //Pcoarse->initNUFFT(FLPIF); double coarseTol = std::atof(argv[17]); - double fineTol = 1e-12; + double fineTol = std::atof(argv[18]); Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; @@ -596,7 +541,7 @@ int main(int argc, char *argv[]){ //IpplTimings::stopTimer(deepCopy); - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(rankTime == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); @@ -621,8 +566,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); //Pcoarse->initNUFFT(FLPIF, coarseTol); - Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); - //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -796,8 +741,8 @@ int main(int argc, char *argv[]){ //double coarseTol = (double)(std::pow(0.1,std::min((int)(it+2),3))); //double fineTol = 1e-6; //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - //Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -881,9 +826,9 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); //Pcoarse->initNUFFT(FLPIF, coarseTol); - //Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, 
tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 7cc1af833..95ae8387a 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -599,22 +599,20 @@ int main(int argc, char *argv[]){ //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - //if(Ippl::Comm->rank() == 0) { - // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); + //if(rankTime == 0) { + // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); // Kokkos::parallel_for(nloc, // generate_random, Dim>( // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, // minU, maxU)); - - // Kokkos::fence(); // size_type bufSize = Pbegin->packedSize(nloc); // std::vector requests(0); // int sends = 0; - // for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { + // for(int rank = 1; rank < sizeTime; ++rank) { // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); // requests.resize(requests.size() + 1); - // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); + // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc, timeComm); // buf->resetWritePos(); // ++sends; // } @@ -623,10 +621,14 @@ int main(int argc, char *argv[]){ //else { // size_type bufSize = Pbegin->packedSize(nloc); // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); + // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc, timeComm); // buf->resetReadPos(); //} + //Kokkos::deep_copy(Pcoarse->Rfine.getView(), Pbegin->R.getView()); + //Kokkos::deep_copy(Pcoarse->Pfine.getView(), Pbegin->P.getView()); + + //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); @@ -658,8 +660,8 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(initialCoarse); - Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - //Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); IpplTimings::stopTimer(initialCoarse); IpplTimings::startTimer(deepCopy); @@ -765,6 +767,8 @@ int main(int argc, char *argv[]){ int sign = 1; //coarseTol = 1e-3; //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); + //Pcoarse->BorisPIF(Pcoarse->Rfine, Pcoarse->Pfine, (rankTime+1)*ntFine, dtFine, 0, 0, 0, + // Bext, rankTime, rankSpace, fine, spaceComm); for (unsigned int nc=0; nc < nCycles; nc++) { double tStartMySlice; @@ -847,8 +851,8 @@ int main(int argc, char *argv[]){ IpplTimings::startTimer(coarsePropagator); //coarseTol = 1e-4;//(double)(std::pow(0.1,std::min((int)(it+2),4))); //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - //Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 
0, coarse, spaceComm); - Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + //Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; @@ -861,6 +865,8 @@ int main(int argc, char *argv[]){ //double localRerror, localPerror; double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); + //double Rerror = computeRL2Error(Pcoarse->Rfine, Pend->R, length, spaceComm); + //double Perror = computePL2Error(Pcoarse->Pfine, Pend->P, spaceComm); IpplTimings::stopTimer(computeErrors); @@ -944,8 +950,8 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - //Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index acb06d7bd..c8f28c8bf 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -124,8 +124,10 @@ namespace ippl { } } + //heffte_m = std::make_shared> + // (inbox, outbox, Ippl::getComm(), heffteOptions); heffte_m = std::make_shared> - (inbox, outbox, Ippl::getComm(), heffteOptions); + (inbox, outbox, MPI_COMM_SELF, heffteOptions); //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); if(workspace_m.size() < heffte_m->size_workspace()) From 380e3e4210101e5c95ad999d3195002fb4d37d5d Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 1 Mar 2024 17:06:08 +0100 Subject: [PATCH 104/117] Unwanted files removed --- alpine/PinT/LeapFrogPIC.cpp | 60 ---------------------------------- alpine/PinT/LeapFrogPIF.cpp | 65 ------------------------------------- 2 files changed, 125 deletions(-) delete mode 100644 alpine/PinT/LeapFrogPIC.cpp delete mode 100644 alpine/PinT/LeapFrogPIF.cpp diff --git a/alpine/PinT/LeapFrogPIC.cpp b/alpine/PinT/LeapFrogPIC.cpp deleted file mode 100644 index d719a423e..000000000 --- a/alpine/PinT/LeapFrogPIC.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, -// Paul Scherrer Institut, Villigen PSI, Switzerland -// All rights reserved -// -// This file is part of IPPL. -// -// IPPL is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// You should have received a copy of the GNU General Public License -// along with IPPL. If not, see . 
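[Note] For reference, the correction assembled above from the fine and coarse sweeps (fine result minus the previous coarse result, plus the freshly recomputed coarse result) is the standard parareal update. With F the fine and G the coarse propagator over one time slice, the iteration reads

    U_{n+1}^{k+1} = G(U_n^{k+1}) + F(U_n^k) - G(U_n^k)

which the code evaluates piecewise across the fine and coarse sweeps before the final update Pend->R = Pend->R + Pcoarse->R.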
-// - -//#include "ChargedParticlesPinT.hpp" - -void LeapFrogPIC(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, - ParticleAttrib& Ptemp, const unsigned int nt, - const double dt) { - - PLayout_t& PL = P.getLayout(); - - const auto& hr = P.hr_m; - const auto& rmax = P.rmax_m; - const auto& rmin = P.rmin_m; - for (unsigned int it=0; itsolve(); - - // gather E field - gather(P.E, P.EfieldPIC_m, Rtemp); - - //kick - Ptemp = Ptemp - 0.5 * dt * P.E; - } - -} diff --git a/alpine/PinT/LeapFrogPIF.cpp b/alpine/PinT/LeapFrogPIF.cpp deleted file mode 100644 index b7473237f..000000000 --- a/alpine/PinT/LeapFrogPIF.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// -// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, -// Paul Scherrer Institut, Villigen PSI, Switzerland -// All rights reserved -// -// This file is part of IPPL. -// -// IPPL is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// You should have received a copy of the GNU General Public License -// along with IPPL. If not, see . -// - -//#include "ChargedParticlesPinT.hpp" - -void LeapFrogPIF(ChargedParticlesPinT& P, ParticleAttrib& Rtemp, - ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const bool& isConverged, - const double& tStartMySlice) { - - auto& PL = P.getLayout(); - const auto& rmax = P.rmax_m; - const auto& rmin = P.rmin_m; - - P.time_m = tStartMySlice; - - for (unsigned int it=0; it Date: Fri, 1 Mar 2024 17:11:52 +0100 Subject: [PATCH 105/117] Still in the middle of cleanup --- alpine/PinT/LandauDampingPinT.cpp | 14 ++++----- alpine/PinT/StatesBeginSlice.hpp | 31 ------------------- .../{StatesEndSlice.hpp => StatesSlice.hpp} | 6 ++-- 3 files changed, 9 insertions(+), 42 deletions(-) delete mode 100644 alpine/PinT/StatesBeginSlice.hpp rename alpine/PinT/{StatesEndSlice.hpp => StatesSlice.hpp} (86%) diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index a522f7824..cb1a7a76e 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -37,8 +37,7 @@ // #include "ChargedParticlesPinT.hpp" -#include "StatesBeginSlice.hpp" -#include "StatesEndSlice.hpp" +#include "StatesSlice.hpp" #include #include #include @@ -362,12 +361,11 @@ int main(int argc, char *argv[]){ using bunch_type = ChargedParticlesPinT; - using states_begin_type = StatesBeginSlice; - using states_end_type = StatesEndSlice; + using states_type = StatesSlice; std::unique_ptr Pcoarse; - std::unique_ptr Pbegin; - std::unique_ptr Pend; + std::unique_ptr Pbegin; + std::unique_ptr Pend; ippl::NDIndex domainPIC; ippl::NDIndex domainPIF; @@ -419,8 +417,8 @@ int main(int argc, char *argv[]){ //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles); - Pbegin = std::make_unique(PL); - Pend = std::make_unique(PL); + Pbegin = std::make_unique(PL); + Pend = std::make_unique(PL); Pcoarse->nr_m = nrPIC; Pcoarse->nm_m = nmPIF; diff --git a/alpine/PinT/StatesBeginSlice.hpp b/alpine/PinT/StatesBeginSlice.hpp deleted file mode 100644 index 621e88038..000000000 --- a/alpine/PinT/StatesBeginSlice.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland -// All rights reserved -// -// This file is part of IPPL. 
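[Note] The LeapFrogPIC free function removed above (its class-method counterparts remain in ChargedParticlesPinT.hpp) is the usual kick-drift-kick splitting: half velocity kick, full position drift, field solve at the new positions, second half kick. A self-contained single-particle sketch with a user-supplied field evaluation, using the q/m = -1 convention of the surrounding code (the efield callback and all names here are illustrative):

#include <array>
#include <functional>

using Vec3 = std::array<double, 3>;

// One kick-drift-kick leapfrog step of size dt for a single particle.
void leapFrogStep(Vec3& r, Vec3& v, double dt,
                  const std::function<Vec3(const Vec3&)>& efield) {
    Vec3 E = efield(r);
    for (int d = 0; d < 3; ++d) v[d] -= 0.5 * dt * E[d];  // half kick (q/m = -1)
    for (int d = 0; d < 3; ++d) r[d] += dt * v[d];         // drift
    E = efield(r);                                          // field solve + gather
    for (int d = 0; d < 3; ++d) v[d] -= 0.5 * dt * E[d];   // second half kick
}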
-// -// IPPL is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// You should have received a copy of the GNU General Public License -// along with IPPL. If not, see . -// - - -template -class StatesBeginSlice : public ippl::ParticleBase { - -public: - typename ippl::ParticleBase::particle_position_type P; - - StatesBeginSlice(PLayout& pl) - : ippl::ParticleBase(pl) - { - // register the particle attributes - this->addAttribute(P); - } - - ~StatesBeginSlice(){ } - -}; diff --git a/alpine/PinT/StatesEndSlice.hpp b/alpine/PinT/StatesSlice.hpp similarity index 86% rename from alpine/PinT/StatesEndSlice.hpp rename to alpine/PinT/StatesSlice.hpp index 6b69996a1..206f8746c 100644 --- a/alpine/PinT/StatesEndSlice.hpp +++ b/alpine/PinT/StatesSlice.hpp @@ -14,18 +14,18 @@ template -class StatesEndSlice : public ippl::ParticleBase { +class StatesSlice : public ippl::ParticleBase { public: typename ippl::ParticleBase::particle_position_type P; - StatesEndSlice(PLayout& pl) + StatesSlice(PLayout& pl) : ippl::ParticleBase(pl) { // register the particle attributes this->addAttribute(P); } - ~StatesEndSlice(){ } + ~StatesSlice(){ } }; From eb6fccd9224c0cfd258318bd831251fae4f76724 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 1 Mar 2024 17:40:43 +0100 Subject: [PATCH 106/117] Landaudamping cleaned, chargedparticles as well as others need to be cleaned as well --- alpine/PinT/ChargedParticlesPinT.hpp | 2 + alpine/PinT/LandauDampingPinT.cpp | 242 +++++---------------------- 2 files changed, 47 insertions(+), 197 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 9e6abddb5..fe46f837f 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -85,6 +85,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { std::string shapetype_m; + std::string coarsetype_m; + int shapedegree_m; //nufft_t nufftType1Fine_m,nufftType2Fine_m,nufftType1Coarse_m,nufftType2Coarse_m; diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index cb1a7a76e..c6bbd3536 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -6,20 +6,27 @@ // European Conference on Parallel Processing. Springer, Cham, 2017. // // Usage: -// srun ./LandauDampingPinT -// --info 5 +// srun ./LandauDampingPinT +// +// --info 5 // nmx = No. of Fourier modes in the x-direction // nmy = No. of Fourier modes in the y-direction // nmz = No. of Fourier modes in the z-direction -// nx = No. of grid points in the x-direction -// ny = No. of grid points in the y-direction -// nz = No. of grid points in the z-direction +// nx = No. of grid points in the x-direction (not used if PIF is also used as coarse propagator) +// ny = No. of grid points in the y-direction (not used if PIF is also used as coarse propagator) +// nz = No. of grid points in the z-direction (not used if PIF is also used as coarse propagator) // Np = Total no. of macro-particles in the simulation +// tolParareal = Parareal tolerance // nCycles = No. of Parareal blocks/cycles // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) +// No. of space procs = Number of MPI ranks to be used in the spatial parallelization +// No. 
of time procs = Number of MPI ranks to be used in the time parallelization +// coarseTol = Coarse tolerance for PIF if we use PIF as a coarse propagator (will not be used when PIC is used) +// fineTol = fine tolerance for PIF +// coarseType = Type of coarse propagator (PIF or PIC) // Example: -// srun ./LandauDampingPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 4 B-spline 1 --info 5 +// srun ./LandauDampingPinT 32 32 32 16 16 16 655360 19.2 0.05 0.05 1e-5 1 B-spline 1 4 16 1e-2 1e-4 PIC --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -212,92 +219,6 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp } -double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - Vector_t& length, MPI_Comm& spaceComm) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - - //This is just to undo the effect of periodic BCs during the - //error calculation. Otherwise even though the actual error is - //small the computed error might be very large. - //The values (e.g. 10) mentioned here are just an adhoc - //value depending on the domain length. - for (unsigned d = 0; d < 3; ++d) { - bool isLeft = (diff[d] <= -10.0); - bool isRight = (diff[d] >= 10.0); - bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0)); - diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) - +(isRight * (diff[d] - length[d])); - } - - double myValError = dot(diff, diff).apply(); - - myValError = std::sqrt(myValError); - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - if(myValnorm > valLnorm) valLnorm = myValnorm; - - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - - double globalError = 0.0; - MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_MAX, spaceComm); - double globalNorm = 0.0; - MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_MAX, spaceComm); - - double relError = globalError/globalNorm; - - return relError; - -} - -double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - MPI_Comm& spaceComm) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - double myValError = dot(diff, diff).apply(); - myValError = std::sqrt(myValError); - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - if(myValnorm > valLnorm) valLnorm = myValnorm; - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - - double globalError = 0.0; - MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_MAX, spaceComm); - double globalNorm = 0.0; - MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_MAX, spaceComm); - - double relError = globalError/globalNorm; - - return relError; - -} - const char* TestName = "LandauDampingPinT"; int main(int argc, char *argv[]){ @@ -425,9 +346,14 @@ int main(int argc, char *argv[]){ Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); - Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); - Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); - + + Pcoarse->coarsetype_m = argv[19]; + + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->initFFTSolver(); + } //////////////////////////////////////////////////////////// //Initialize an FFT object for getting rho in real space and @@ -464,7 +390,6 @@ int main(int argc, char *argv[]){ //////////////////////////////////////////////////////////// - Pcoarse->initFFTSolver(); Vector_t minU, maxU; for (unsigned d = 0; d initNUFFT(FLPIF); double coarseTol = std::atof(argv[17]); double fineTol = std::atof(argv[18]); Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; - IpplTimings::startTimer(particleCreation); - - #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs - //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - //if(Ippl::Comm->rank() == 0) { - // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); - // Kokkos::parallel_for(nloc, - // generate_random, Dim>( - // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); - - // Kokkos::fence(); - // size_type bufSize = Pbegin->packedSize(nloc); - // std::vector requests(0); - // int sends = 0; - // for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); - // requests.resize(requests.size() + 1); - // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); - // buf->resetWritePos(); - // ++sends; - // } - // MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); - //} - //else { - // size_type bufSize = Pbegin->packedSize(nloc); - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); - // buf->resetReadPos(); - //} - //Ippl::Comm->barrier(); - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - //IpplTimings::stopTimer(deepCopy); - + //For some reason using the next_tag with multiple cycles is not + //working so we use static tags here tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(rankTime == 0) { @@ -563,9 +454,14 
@@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - //Pcoarse->initNUFFT(FLPIF, coarseTol); - //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); - Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + } + else { + //PIF with coarse tolerance as coarse propagator + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + } IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -581,6 +477,7 @@ int main(int argc, char *argv[]){ MPI_Wait(&request, MPI_STATUS_IGNORE); } #else + //Note the CPU version has not been tested. Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( @@ -612,57 +509,6 @@ int main(int argc, char *argv[]){ msg << "particles created and initial conditions assigned " << endl; - //Copy initial conditions as they are needed later - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - ////Get initial guess for ranks other than 0 by propagating the coarse solver - //IpplTimings::startTimer(coarsePropagator); - //if (Ippl::Comm->rank() > 0) { - // Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); - //} - // - //Ippl::Comm->barrier(); - // - //IpplTimings::stopTimer(coarsePropagator); - - //msg << "First Leap frog PIC done " << endl; - - // - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - ////Run the coarse integrator to get the values at the end of the time slice - //IpplTimings::startTimer(coarsePropagator); - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - //IpplTimings::stopTimer(coarsePropagator); - //msg << "Second Leap frog PIC done " << endl; - - - ////The following might not be needed - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - //msg << "Starting parareal iterations ..." 
<< endl; - //bool isConverged = false; - //bool isPreviousDomainConverged; - //if(Ippl::Comm->rank() == 0) { - // isPreviousDomainConverged = true; - //} - //else { - // isPreviousDomainConverged = false; - //} - - int sign = 1; for (unsigned int nc=0; nc < nCycles; nc++) { double tStartMySlice; @@ -695,7 +541,6 @@ int main(int argc, char *argv[]){ while (!isConverged) { //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); - //Pcoarse->initNUFFT(FLPIF, fineTol); Pcoarse->LeapFrogPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, rankTime, rankSpace, fine, spaceComm); IpplTimings::stopTimer(finePropagator); @@ -735,18 +580,19 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - //Pcoarse->initNUFFT(FLPIF, coarseTol); - //double coarseTol = (double)(std::pow(0.1,std::min((int)(it+2),3))); - //double fineTol = 1e-6; - //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + } + else { + Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + } IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); + IpplTimings::startTimer(computeErrors); double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); @@ -777,7 +623,6 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); @@ -824,9 +669,12 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - //Pcoarse->initNUFFT(FLPIF, coarseTol); - Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + } + else { + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + } IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); From 67d2fd59fd63d933ade7157f42217811148f9732 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 2 Mar 2024 10:07:21 +0100 Subject: [PATCH 107/117] ChargedParticles also cleaned --- alpine/PinT/ChargedParticlesPinT.hpp | 507 +++------------------------ 1 file changed, 42 insertions(+), 465 deletions(-) diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index fe46f837f..31835f9e4 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -55,15 +55,12 @@ template class ChargedParticlesPinT : public ippl::ParticleBase { public: - //using nufft_t = typename ippl::FFT; - CxField_t rhoPIF_m; CxField_t rhoPIFhalf_m; Field_t rhoPIFreal_m; Field_t 
Sk_m; Field_t rhoPIC_m; VField_t EfieldPIC_m; - //VField_t EfieldPICprevIter_m; Vector nr_m; Vector nm_m; @@ -89,8 +86,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { int shapedegree_m; - //nufft_t nufftType1Fine_m,nufftType2Fine_m,nufftType1Coarse_m,nufftType2Coarse_m; - std::shared_ptr> nufftType1Fine_mp,nufftType2Fine_mp,nufftType1Coarse_mp,nufftType2Coarse_mp; + std::shared_ptr> nufftType1Fine_mp,nufftType2Fine_mp, + nufftType1Coarse_mp,nufftType2Coarse_mp; public: ParticleAttrib q; // charge @@ -103,27 +100,24 @@ class ChargedParticlesPinT : public ippl::ParticleBase { typename ippl::ParticleBase::particle_position_type RprevIter; // G(R^(k-1)_n) typename ippl::ParticleBase::particle_position_type PprevIter; // G(P^(k-1)_n) - //typename ippl::ParticleBase::particle_position_type Rfine; - //typename ippl::ParticleBase::particle_position_type Pfine; - - /* - This constructor is mandatory for all derived classes from - ParticleBase as the bunch buffer uses this - */ - ChargedParticlesPinT(PLayout& pl) - : ippl::ParticleBase(pl) - { - // register the particle attributes - this->addAttribute(q); - this->addAttribute(P); - this->addAttribute(E); - this->addAttribute(R0); - this->addAttribute(P0); - this->addAttribute(RprevIter); - this->addAttribute(PprevIter); - //this->addAttribute(Rfine); - //this->addAttribute(Pfine); - } + ///* + // This constructor is mandatory for all derived classes from + // ParticleBase as the bunch buffer uses this + //*/ + //ChargedParticlesPinT(PLayout& pl) + //: ippl::ParticleBase(pl) + //{ + // // register the particle attributes + // this->addAttribute(q); + // this->addAttribute(P); + // this->addAttribute(E); + // this->addAttribute(R0); + // this->addAttribute(P0); + // this->addAttribute(RprevIter); + // this->addAttribute(PprevIter); + // //this->addAttribute(Rfine); + // //this->addAttribute(Pfine); + //} ChargedParticlesPinT(PLayout& pl, Vector_t hr, @@ -147,8 +141,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { this->addAttribute(P0); this->addAttribute(RprevIter); this->addAttribute(PprevIter); - //this->addAttribute(Rfine); - //this->addAttribute(Pfine); setupBCs(); for (unsigned int i = 0; i < Dim; i++) decomp_m[i]=decomp[i]; @@ -181,21 +173,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } - //void initNUFFT(FieldLayout_t& FLPIF, double& tol) { - // ippl::ParameterList fftParams; - - // fftParams.add("gpu_method", 1); - // fftParams.add("gpu_sort", 0); - // fftParams.add("gpu_kerevalmeth", 1); - // //fftParams.add("tolerance", 1e-6); - // fftParams.add("tolerance", tol); - - // fftParams.add("use_cufinufft_defaults", false); - - // q.initializeNUFFT(FLPIF, 1, fftParams); - // E.initializeNUFFT(FLPIF, 2, fftParams); - //} - void initNUFFTs(FieldLayout_t& FLPIF, double& coarseTol, double& fineTol) { @@ -214,11 +191,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { fftFineParams.add("use_cufinufft_defaults", false); fftCoarseParams.add("use_cufinufft_defaults", false); - //nufftType1Fine_m = nufft_t(FLPIF, this->getLocalNum(), 1, fftFineParams); - //nufftType2Fine_m = nufft_t(FLPIF, this->getLocalNum(), 2, fftFineParams); - - //nufftType1Coarse_m = nufft_t(FLPIF, this->getLocalNum(), 1, fftCoarseParams); - //nufftType2Coarse_m = nufft_t(FLPIF, this->getLocalNum(), 2, fftCoarseParams); nufftType1Fine_mp = std::make_shared>(FLPIF, this->getLocalNum(), 1, fftFineParams); nufftType2Fine_mp = std::make_shared>(FLPIF, this->getLocalNum(), 2, fftFineParams); @@ -226,247 +198,8 @@ class 
ChargedParticlesPinT : public ippl::ParticleBase { nufftType2Coarse_mp = std::make_shared>(FLPIF, this->getLocalNum(), 2, fftCoarseParams); } - void initializeParareal(ParticleAttrib& Rbegin, - ParticleAttrib& Pbegin, - ParticleAttrib& Rcoarse, - ParticleAttrib& Pcoarse, - ParticleAttrib& Rtemp, - ParticleAttrib& Ptemp, - bool& isConverged, - bool& isPreviousDomainConverged, - const unsigned int& ntCoarse, - const double& dtCoarse, - const double& tStartMySlice, - const double& Bext, - const int& rankTime, - MPI_Comm& spaceComm) { - - //Copy initial conditions as they are needed later - //Kokkos::deep_copy(R0.getView(), this->R.getView()); - //Kokkos::deep_copy(P0.getView(), P.getView()); - Kokkos::deep_copy(Rtemp.getView(), Rcoarse.getView()); - Kokkos::deep_copy(Ptemp.getView(), Pcoarse.getView()); - - //Get initial guess for ranks other than 0 by propagating the coarse solver - if (rankTime > 0) { - //BorisPIC(this->R, P, rankTime*ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); - BorisPIC(Rcoarse, Pcoarse, rankTime*ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); - } - - //Copy initial conditions as they are needed later - //Kokkos::deep_copy(R0.getView(), this->R.getView()); - //Kokkos::deep_copy(P0.getView(), P.getView()); - - - //Ippl::Comm->barrier(); - - //Kokkos::deep_copy(Rbegin.getView(), this->R.getView()); - //Kokkos::deep_copy(Pbegin.getView(), P.getView()); - Kokkos::deep_copy(Rbegin.getView(), Rcoarse.getView()); - Kokkos::deep_copy(Pbegin.getView(), Pcoarse.getView()); - - - //Run the coarse integrator to get the values at the end of the time slice - //BorisPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); - BorisPIC(Rcoarse, Pcoarse, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); - - isConverged = false; - if(rankTime == 0) { - isPreviousDomainConverged = true; - } - else { - isPreviousDomainConverged = false; - } - } - - void initializeParareal(ParticleAttrib& Rbegin, - ParticleAttrib& Pbegin, - bool& isConverged, - bool& isPreviousDomainConverged, - const unsigned int& ntCoarse, - const double& dtCoarse, - const double& tStartMySlice) { - - //Copy initial conditions as they are needed later - Kokkos::deep_copy(R0.getView(), this->R.getView()); - Kokkos::deep_copy(P0.getView(), P.getView()); - - //Get initial guess for ranks other than 0 by propagating the coarse solver - if (Ippl::Comm->rank() > 0) { - LeapFrogPIC(this->R, P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); - } - - //Ippl::Comm->barrier(); - - Kokkos::deep_copy(Rbegin.getView(), this->R.getView()); - Kokkos::deep_copy(Pbegin.getView(), P.getView()); - - - //Run the coarse integrator to get the values at the end of the time slice - LeapFrogPIC(this->R, P, ntCoarse, dtCoarse, tStartMySlice); - - isConverged = false; - if(Ippl::Comm->rank() == 0) { - isPreviousDomainConverged = true; - } - else { - isPreviousDomainConverged = false; - } - } - - void dumpLandauPIC() { - - const int nghostE = EfieldPIC_m.getNghost(); - auto Eview = EfieldPIC_m.getView(); - double fieldEnergy, ExAmp; - using mdrange_type = Kokkos::MDRangePolicy>; - - double temp = 0.0; - Kokkos::parallel_reduce("Ex inner product", - mdrange_type({nghostE, nghostE, nghostE}, - {Eview.extent(0) - nghostE, - Eview.extent(1) - nghostE, - Eview.extent(2) - nghostE}), - KOKKOS_LAMBDA(const size_t i, const size_t j, - const size_t k, double& valL) - { - double myVal = std::pow(Eview(i, j, k)[0], 2); - valL += myVal; - }, Kokkos::Sum(temp)); - double globaltemp = temp; - //MPI_Reduce(&temp, &globaltemp, 1, 
MPI_DOUBLE, MPI_SUM, 0, Ippl::getComm()); - fieldEnergy = globaltemp * hr_m[0] * hr_m[1] * hr_m[2]; - - double tempMax = 0.0; - Kokkos::parallel_reduce("Ex max norm", - mdrange_type({nghostE, nghostE, nghostE}, - {Eview.extent(0) - nghostE, - Eview.extent(1) - nghostE, - Eview.extent(2) - nghostE}), - KOKKOS_LAMBDA(const size_t i, const size_t j, - const size_t k, double& valL) - { - double myVal = std::fabs(Eview(i, j, k)[0]); - if(myVal > valL) valL = myVal; - }, Kokkos::Max(tempMax)); - ExAmp = tempMax; - //MPI_Reduce(&tempMax, &ExAmp, 1, MPI_DOUBLE, MPI_MAX, 0, Ippl::getComm()); - - - if (Ippl::Comm->rank() == 0) { - std::stringstream fname; - fname << "data/FieldLandau_"; - fname << Ippl::Comm->size(); - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(time_m == 0.0) { - csvout << "time, Ex_field_energy, Ex_max_norm" << endl; - } - - csvout << time_m << " " - << fieldEnergy << " " - << ExAmp << endl; - - } - - //Ippl::Comm->barrier(); - } - - - - void dumpLandau(const unsigned int& iter) { + void dumpFieldEnergy(const unsigned int& nc, const unsigned int& iter, int rankTime, int rankSpace) { - - double fieldEnergy = 0.0; - double ExAmp = 0.0; - - auto rhoview = rhoPIF_m.getView(); - const int nghost = rhoPIF_m.getNghost(); - using mdrange_type = Kokkos::MDRangePolicy>; - - const FieldLayout_t& layout = rhoPIF_m.getLayout(); - const Mesh_t& mesh = rhoPIF_m.get_mesh(); - const Vector& dx = mesh.getMeshSpacing(); - const auto& domain = layout.getDomain(); - Vector Len; - Vector N; - - for (unsigned d=0; d < Dim; ++d) { - N[d] = domain[d].length(); - Len[d] = dx[d] * N[d]; - } - - - Kokkos::complex imag = {0.0, 1.0}; - double pi = std::acos(-1.0); - Kokkos::parallel_reduce("Ex energy and Max", - mdrange_type({0, 0, 0}, - {N[0], - N[1], - N[2]}), - KOKKOS_LAMBDA(const int i, - const int j, - const int k, - double& tlSum, - double& tlMax) - { - - Vector iVec = {i, j, k}; - Vector kVec; - double Dr = 0.0; - for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); - Dr += kVec[d] * kVec[d]; - } - - Kokkos::complex Ek = {0.0, 0.0}; - bool isNotZero = (Dr != 0.0); - double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); - Ek = -(imag * kVec[0] * rhoview(i+nghost,j+nghost,k+nghost) * factor); - double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); - - tlSum += myVal; - - double myValMax = std::sqrt(myVal); - - if(myValMax > tlMax) tlMax = myValMax; - - }, Kokkos::Sum(fieldEnergy), Kokkos::Max(ExAmp)); - - - Kokkos::fence(); - double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - fieldEnergy *= volume; - - - std::stringstream fname; - fname << "data/FieldLandau_"; - fname << Ippl::Comm->rank(); - fname << "_iter_"; - fname << iter; - fname << ".csv"; - - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - - csvout << time_m << " " - << fieldEnergy << " " - << ExAmp << endl; - } - - void dumpBumponTail(const unsigned int& nc, const unsigned int& iter, int rankTime, int rankSpace) { - - double fieldEnergy = 0.0; double EzAmp = 0.0; @@ -505,8 +238,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - //bool 
shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -553,13 +284,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } } - - - - void dumpEnergy(size_type /*totalP*/, const unsigned int& nc, - const unsigned int& iter, ParticleAttrib& Ptemp, + void dumpEnergy(const unsigned int& nc, const unsigned int& iter, ParticleAttrib& Ptemp, int rankTime, int rankSpace, const MPI_Comm& spaceComm = MPI_COMM_WORLD) { - double potentialEnergy, kineticEnergy; double temp = 0.0; @@ -598,10 +324,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); - //kVec[d] = 2 * pi / Len[d] * iVec[d]; Dr += kVec[d] * kVec[d]; } @@ -615,14 +338,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); } - //double myVal = rhoview(i,j,k).real() * rhoview(i,j,k).real() + - // rhoview(i,j,k).imag() * rhoview(i,j,k).imag(); - //if(Dr != 0.0) { - // myVal /= Dr; - //} - //else { - // myVal = 0.0; - //} valL += myVal; }, Kokkos::Sum(temp)); @@ -638,13 +353,12 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Kokkos::parallel_reduce("Kinetic Energy", this->getLocalNum(), KOKKOS_LAMBDA(const int i, double& valL){ double myVal = dot(Pview(i), Pview(i)).apply(); - myVal *= -qView(i); //q/(q/m) where q/m=-1 for us + myVal *= -qView(i); //q/(q/m) where q/m=-1 valL += myVal; }, Kokkos::Sum(temp)); temp *= 0.5; double globaltemp = 0.0; - //double globaltemp = temp; MPI_Allreduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, spaceComm); kineticEnergy = globaltemp; @@ -660,6 +374,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Nhalf[d] = domainHalf[d].length(); } + //Heffte needs FFTshifted field whereas the field from cuFINUFFT + //is not shifted. Hence, here we do the shift. 
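[Note] The transfer kernel that follows this comment is, apart from the ghost offsets and the reversal along the r2c direction, an fftshift: cuFINUFFT and heFFTe simply order the Fourier modes differently (zero mode in the centre of the array in one convention, zero mode first in the other), so every index has to move by N/2. A one-dimensional sketch of that remapping (for even N the map is its own inverse, so it converts in either direction; illustrative only):

// Swap between the "zero mode first" and "zero mode centred" orderings of an
// even-length spectrum: indices below N/2 move up by N/2, the rest move down.
inline int fftShiftIndex(int i, int N) {
    return (i + N / 2) % N;
}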
Kokkos::parallel_for("Transfer complex rho to half domain", mdrange_type({0, 0, 0}, {Nhalf[0], @@ -700,7 +416,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { valL += rhoPIFrealview(i+nghost, j+nghost, k+nghost); }, Kokkos::Sum(temp)); - double chargeTotal = temp; Vector_t totalMomentum = 0.0; @@ -746,71 +461,19 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } } - - - void dumpParticleData(const unsigned int& iter, ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const char* fname) { - - typename ParticleAttrib::HostMirror R_host = Rtemp.getHostMirror(); - typename ParticleAttrib::HostMirror P_host = Ptemp.getHostMirror(); - Kokkos::deep_copy(R_host, Rtemp.getView()); - Kokkos::deep_copy(P_host, Ptemp.getView()); - std::stringstream pname; - pname << "data/"; - pname << fname; - pname << "_rank_"; - pname << Ippl::Comm->rank(); - pname << "_iter_"; - pname << iter; - pname << ".csv"; - Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); - pcsvout.precision(10); - pcsvout.setf(std::ios::scientific, std::ios::floatfield); - pcsvout << "R_x, R_y, R_z, V_x, V_y, V_z" << endl; - for (size_type i = 0; i< this->getLocalNum(); i++) { - pcsvout << R_host(i)[0] << " " - << R_host(i)[1] << " " - << R_host(i)[2] << " " - << P_host(i)[0] << " " - << P_host(i)[1] << " " - << P_host(i)[2] << endl; - } - } - - void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter, int rankTime, int rankSpace) { - - //if(Ippl::Comm->rank() == 0) { - if(rankSpace == 0) { - std::stringstream fname; - fname << "data/localError_rank_"; - fname << rankTime; - fname << "_nc_"; - fname << nc; - fname << ".csv"; - - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); - csvout.precision(10); - csvout.setf(std::ios::scientific, std::ios::floatfield); - - if(iter == 1) { - csvout << "Iter, Rerror, Perror" << endl; - } - - csvout << iter << " " - << Rerror << " " - << Perror << endl; - } - - } - - void writeError(double Rerror, double Perror, unsigned int iter) { + void writelocalError(double Rerror, double Perror, unsigned int nc, unsigned int iter, int rankTime, int rankSpace) { - if(Ippl::Comm->rank() == 0) { + if(rankSpace == 0) { std::stringstream fname; - fname << "data/Error_Vs_Iter.csv"; + fname << "data/localError_rank_"; + fname << rankTime; + fname << "_nc_"; + fname << nc; + fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); - csvout.precision(10); + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND, Ippl::Comm->rank()); + csvout.precision(17); csvout.setf(std::ios::scientific, std::ios::floatfield); if(iter == 1) { @@ -820,55 +483,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { csvout << iter << " " << Rerror << " " << Perror << endl; - } - - Ippl::Comm->barrier(); - - } - void checkBounds(ParticleAttrib& R) { - - auto Rview = R.getView(); - double xMin = 0.0; - double yMin = 0.0; - double zMin = 0.0; - double xMax = 0.0; - double yMax = 0.0; - double zMax = 0.0; - Kokkos::parallel_reduce("Bounds calculation", R.size(), - KOKKOS_LAMBDA(const int i, - double& xlMin, - double& ylMin, - double& zlMin, - double& xlMax, - double& ylMax, - double& zlMax){ - - if(Rview(i)[0] < xlMin) xlMin = Rview(i)[0]; - if(Rview(i)[1] < ylMin) ylMin = Rview(i)[1]; - if(Rview(i)[2] < zlMin) zlMin = Rview(i)[2]; - - if(Rview(i)[0] > xlMax) xlMax = Rview(i)[0]; - if(Rview(i)[1] > ylMax) ylMax = Rview(i)[1]; - if(Rview(i)[2] > zlMax) zlMax = Rview(i)[2]; - - }, Kokkos::Min(xMin), 
Kokkos::Min(yMin), Kokkos::Min(zMin), - Kokkos::Max(xMax), Kokkos::Max(yMax), Kokkos::Max(zMax)); - - Kokkos::fence(); - - Vector_t Rmin = {xMin, yMin, zMin}; - Vector_t Rmax = {xMax, yMax, zMax}; - - for (unsigned d = 0; d < 3; ++d) { - if(Rmin[d] < rmin_m[d]) { - std::cout << "Invalid particles with min. in rank: " << Ippl::Comm->rank() << " Rmin: " << Rmin << std::endl; - } - if(Rmax[d] > rmax_m[d]) { - std::cout << "Invalid particles with max. in rank: " << Ippl::Comm->rank() << " Rmax: " << Rmax << std::endl; - } - } } void initializeShapeFunctionPIF() { @@ -903,8 +519,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector kVec; double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { - //bool shift = (iVec[d] > (N[d]/2)); - //kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); double kh = kVec[d] * dx[d]; bool isNotZero = (kh != 0.0); @@ -916,8 +530,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { } Skview(i+nghost, j+nghost, k+nghost) = Sk; }); - - } else { throw IpplException("initializeShapeFunctionPIF", @@ -932,8 +544,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); - //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp, spaceComm); @@ -948,8 +558,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - //dumpLandauPIC(); - for (unsigned int it=0; it { //Apply particle BC PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); //scatter the charge onto the underlying grid rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp, spaceComm); - rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); @@ -982,7 +588,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Ptemp = Ptemp - 0.5 * dt * E; time_m += dt; - //dumpLandauPIC(); } } @@ -992,8 +597,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef fieldSolvePIC = IpplTimings::getTimer("fieldSolvePIC"); PLayout& PL = this->getLayout(); - //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp, spaceComm); @@ -1001,7 +604,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); //Field solve - EfieldPIC_m = 0.0; solver_mp->solve(); // gather E field @@ -1009,12 +611,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { time_m = tStartMySlice; - //dumpLandauPIC(); double alpha = -0.5 * dt; double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); Vector_t rmax = rmax_m; - for (unsigned int it=0; it { //Apply particle BC PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); //scatter the charge onto the underlying grid rhoPIC_m = 0.0; scatter(q, rhoPIC_m, Rtemp, spaceComm); - rhoPIC_m = rhoPIC_m / (hr_m[0] * hr_m[1] * hr_m[2]); rhoPIC_m = rhoPIC_m - (Q_m/((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]))); //Field solve IpplTimings::startTimer(fieldSolvePIC); - EfieldPIC_m = 0.0; solver_mp->solve(); IpplTimings::stopTimer(fieldSolvePIC); @@ -1090,13 +687,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { }); time_m += dt; - 
//dumpLandauPIC(); } } - - void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, const double& dt, const double& tStartMySlice, const unsigned& nc, @@ -1105,8 +699,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); - //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; if(propagator == "Coarse") { scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm); @@ -1114,7 +706,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { else if(propagator == "Fine") { scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm); } - rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -1125,32 +716,29 @@ class ChargedParticlesPinT : public ippl::ParticleBase { else if(propagator == "Fine") { gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q); } - //gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); + //Reset the value of q here as we used it as a temporary object in gather to + //save memory q = Q_m / Np_m; time_m = tStartMySlice; if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - //dumpLandau(iter); - dumpBumponTail(nc, iter, rankTime, rankSpace); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpFieldEnergy(nc, iter, rankTime, rankSpace); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { else if(propagator == "Fine") { scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Fine_mp.get(), spaceComm); } - //scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, spaceComm); rhoPIF_m = rhoPIF_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); @@ -1171,7 +758,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { else if(propagator == "Fine") { gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, nufftType2Fine_mp.get(), q); } - //gatherPIFNUFFT(E, rhoPIF_m, Sk_m, Rtemp, q); q = Q_m / Np_m; @@ -1182,12 +768,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - //dumpLandau(iter); - dumpBumponTail(nc, iter, rankTime, rankSpace); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpFieldEnergy(nc, iter, rankTime, rankSpace); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } - } } @@ -1201,8 +785,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); PLayout& PL = this->getLayout(); - //PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); rhoPIF_m = {0.0, 0.0}; if(propagator == "Coarse") { scatterPIFNUFFT(q, rhoPIF_m, Sk_m, Rtemp, nufftType1Coarse_mp.get(), spaceComm); @@ -1227,7 +809,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -1235,8 +817,6 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector_t rmax = rmax_m; for (unsigned int it=0; it { //Apply particle BC 
PL.applyBC(Rtemp, PL.getRegionLayout().getDomain()); - //checkBounds(Rtemp); //scatter the charge onto the underlying grid rhoPIF_m = {0.0, 0.0}; @@ -1316,16 +895,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - dumpEnergy(this->getLocalNum(), nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } - } } private: void setBCAllPeriodic() { - this->setParticleBC(ippl::BC::PERIODIC); } From c19865559ca902f2d7c3e6ad2748b19082c5a36e Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 2 Mar 2024 10:19:51 +0100 Subject: [PATCH 108/117] FFT files cleaned a bit --- src/FFT/FFT.h | 1 - src/FFT/FFT.hpp | 26 +------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 816ae8e4b..6807e8ba3 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -39,7 +39,6 @@ #include "Types/IpplTypes.h" #include "FieldLayout/FieldLayout.h" #include "Field/Field.h" -//#include "Particle/ParticleAttrib.h" #include "Utility/ParameterList.h" #include "Utility/IpplException.h" diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index c8f28c8bf..e33552322 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -124,10 +124,8 @@ namespace ippl { } } - //heffte_m = std::make_shared> - // (inbox, outbox, Ippl::getComm(), heffteOptions); heffte_m = std::make_shared> - (inbox, outbox, MPI_COMM_SELF, heffteOptions); + (inbox, outbox, Ippl::getComm(), heffteOptions); //heffte::gpu::device_set(Ippl::Comm->rank() % heffte::gpu::device_count()); if(workspace_m.size() < heffte_m->size_workspace()) @@ -901,28 +899,6 @@ namespace ippl { const double pi = std::acos(-1.0); - /** - * cuFINUFFT's layout is left, hence we allocate the temporary - * Kokkos views with the same layout - */ - //Kokkos::View - // tempField("tempField", fview.extent(0) - 2*nghost, - // fview.extent(1) - 2*nghost, - // fview.extent(2) - 2*nghost); - - - ////Initialize the pointers to NULL and fill only relevant dimensions - ////CUFINUFFT requires the input like this. - //Kokkos::View tempR[3] = {}; - - - //for(size_t d = 0; d < Dim; ++d) { - // Kokkos::realloc(tempR[d], localNp); - //} - - - //Kokkos::View tempQ("tempQ", localNp); - auto tempField = tempField_m; auto tempQ = tempQ_m; Kokkos::View tempR[3] = {}; From 5e571f7ad1d5f9148c33489dd03a5239e62ad1ad Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 2 Mar 2024 11:04:58 +0100 Subject: [PATCH 109/117] Particle Attribute also cleaned a bit --- src/Particle/ParticleAttrib.h | 8 ---- src/Particle/ParticleAttrib.hpp | 72 ++++++++++----------------------- 2 files changed, 21 insertions(+), 59 deletions(-) diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 10b391d69..0053dcca7 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -176,9 +176,6 @@ namespace ippl { const ParticleAttrib, Properties... >& pp); #ifdef KOKKOS_ENABLE_CUDA - //template - //void initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams); - template void scatterPIFNUFFT(Field& f, Field& Sk, @@ -201,11 +198,6 @@ namespace ippl { private: view_type dview_m; -//#ifdef KOKKOS_ENABLE_CUDA -// //TODO: Remove hard-coded dimension by having Dim as template -// //parameter. Does this need to be in CUDA ifdefs? 
-// std::shared_ptr> fftType_mp; -//#endif }; } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 41b11f220..a2e33e334 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -185,12 +185,6 @@ namespace ippl { const int j = index[1] - lDom[1].first() + nghost; const int k = index[2] - lDom[2].first() + nghost; - //if((i < 1) || (i > lDom[0].last() + 2) || (j < 1) || (j > lDom[1].last() + 2) - // || (k < 1) || (k > lDom[0].last() + 2)) { - // std::cout << "i: " << i << ", j: " << j << ", k: " << k << std::endl; - // std::cout << "Invalid particle co-ordinates: " << pp(idx) << std::endl; - //} - // scatter const value_type& val = dview_m(idx); Kokkos::atomic_add(&viewLocal(i-1, j-1, k-1), wlo[0] * wlo[1] * wlo[2] * val); @@ -203,12 +197,10 @@ namespace ippl { Kokkos::atomic_add(&viewLocal(i, j, k ), whi[0] * whi[1] * whi[2] * val); } ); - IpplTimings::stopTimer(scatterPICTimer); - //static IpplTimings::TimerRef accumulateHaloTimer = IpplTimings::getTimer("AccumulateHalo"); - //IpplTimings::startTimer(accumulateHaloTimer); tempField.accumulateHalo(); - //IpplTimings::stopTimer(accumulateHaloTimer); + + IpplTimings::stopTimer(scatterPICTimer); static IpplTimings::TimerRef scatterAllReducePICTimer = IpplTimings::getTimer("scatterAllReducePIC"); IpplTimings::startTimer(scatterAllReducePICTimer); @@ -222,7 +214,8 @@ namespace ippl { template template void ParticleAttrib::scatterPIFNUDFT(Field& f, Field& Sk, - const ParticleAttrib< Vector, Properties... >& pp) + const ParticleAttrib< Vector, Properties... >& pp, + const MPI_Comm& spaceComm) const { @@ -251,11 +244,6 @@ namespace ippl { typedef Kokkos::TeamPolicy<> team_policy; typedef Kokkos::TeamPolicy<>::member_type member_type; - - //using view_type_temp = typename detail::ViewType::view_type; - - //view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); - double pi = std::acos(-1.0); Kokkos::complex imag = {0.0, 1.0}; @@ -304,8 +292,8 @@ namespace ippl { }, Kokkos::Sum(reducedValue)); if(teamMember.team_rank() == 0) { - //viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; - fview(i+nghost,j+nghost,k+nghost) = reducedValue; + viewLocal(i+nghost,j+nghost,k+nghost) = reducedValue; + //fview(i+nghost,j+nghost,k+nghost) = reducedValue; } } @@ -313,12 +301,12 @@ namespace ippl { IpplTimings::stopTimer(scatterPIFNUDFTTimer); - //static IpplTimings::TimerRef scatterAllReduceTimer = IpplTimings::getTimer("scatterAllReduce"); - //IpplTimings::startTimer(scatterAllReduceTimer); - //int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); - //MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, - // MPI_C_DOUBLE_COMPLEX, MPI_SUM, Ippl::getComm()); - //IpplTimings::stopTimer(scatterAllReduceTimer); + static IpplTimings::TimerRef scatterAllReducePIFTimer = IpplTimings::getTimer("scatterAllReducePIF"); + IpplTimings::startTimer(scatterAllReducePIFTimer); + int viewSize = fview.extent(0)*fview.extent(1)*fview.extent(2); + MPI_Allreduce(viewLocal.data(), fview.data(), viewSize, + MPI_C_DOUBLE_COMPLEX, MPI_SUM, spaceComm); + IpplTimings::stopTimer(scatterAllReducePIFTimer); } @@ -329,13 +317,10 @@ namespace ippl { const ParticleAttrib, Properties...>& pp) { - //static IpplTimings::TimerRef fillHaloTimer = IpplTimings::getTimer("FillHalo"); - //IpplTimings::startTimer(fillHaloTimer); - f.fillHalo(); - //IpplTimings::stopTimer(fillHaloTimer); - static IpplTimings::TimerRef gatherPICTimer = IpplTimings::getTimer("GatherPIC"); 
IpplTimings::startTimer(gatherPICTimer); + + f.fillHalo(); const typename Field::view_type view = f.getView(); @@ -408,8 +393,6 @@ namespace ippl { Len[d] = dx[d] * N[d]; } - - typedef Kokkos::TeamPolicy<> team_policy; typedef Kokkos::TeamPolicy<>::member_type member_type; @@ -492,15 +475,6 @@ namespace ippl { } #ifdef KOKKOS_ENABLE_CUDA - - //template - //template - //void ParticleAttrib::initializeNUFFT(FieldLayout& layout, int type, ParameterList& fftParams) { - // - // fftType_mp = std::make_shared>(layout, *(this->localNum_mp), type, fftParams); - //} - - template template @@ -525,10 +499,7 @@ namespace ippl { tempField = 0.0; - //fftType_mp->transform(pp, q, tempField); nufft->transform(pp, q, tempField); - //fftType_mp->transform(pp, q, f); - using view_type = typename Field::view_type; view_type fview = f.getView(); @@ -622,7 +593,6 @@ namespace ippl { double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); - //kVec[d] = (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } @@ -634,7 +604,6 @@ namespace ippl { tempview(i, j, k) *= -Skview(i, j, k) * (imag * kVec[gd] * factor); }); - //fftType_mp->transform(pp, q, tempField); nufft->transform(pp, q, tempField); Kokkos::parallel_for("Assign E gather NUFFT", @@ -644,13 +613,17 @@ namespace ippl { dview_m(i)[gd] = qview(i); }); } - IpplTimings::stopTimer(gatherPIFNUFFTTimer); } #endif + /* + * Non-class functions + * + */ + template inline void scatterPIFNUFFT(const ParticleAttrib& attrib, Field& f, @@ -682,10 +655,6 @@ namespace ippl { #endif } - /* - * Non-class function - * - */ template @@ -700,7 +669,8 @@ namespace ippl { template inline void scatterPIFNUDFT(const ParticleAttrib& attrib, Field& f, - Field& Sk, const ParticleAttrib, Properties...>& pp) + Field& Sk, const ParticleAttrib, Properties...>& pp, + const MPI_Comm& spaceComm = MPI_COMM_WORLD) { attrib.scatterPIFNUDFT(f, Sk, pp); } From 8efb99ee9ef0aded347aae8a26c853eb96a070af Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 2 Mar 2024 12:51:47 +0100 Subject: [PATCH 110/117] Landaudamping results verified after clean up --- src/Particle/ParticleAttrib.h | 3 ++- src/Particle/ParticleAttrib.hpp | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 0053dcca7..bfa089d29 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -163,7 +163,8 @@ namespace ippl { template void scatterPIFNUDFT(Field& f, Field& Sk, - const ParticleAttrib, Properties... >& pp) const; + const ParticleAttrib, Properties... 
>& pp, + const MPI_Comm& spaceComm) const; template void diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index a2e33e334..724bc2913 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -244,6 +244,10 @@ namespace ippl { typedef Kokkos::TeamPolicy<> team_policy; typedef Kokkos::TeamPolicy<>::member_type member_type; + using view_type_temp = typename detail::ViewType::view_type; + + view_type_temp viewLocal("viewLocal",fview.extent(0),fview.extent(1),fview.extent(2)); + double pi = std::acos(-1.0); Kokkos::complex imag = {0.0, 1.0}; @@ -672,7 +676,7 @@ namespace ippl { Field& Sk, const ParticleAttrib, Properties...>& pp, const MPI_Comm& spaceComm = MPI_COMM_WORLD) { - attrib.scatterPIFNUDFT(f, Sk, pp); + attrib.scatterPIFNUDFT(f, Sk, pp, spaceComm); } From fc8252939373ecdf2ec74dd8235d84b049582f93 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 2 Mar 2024 13:26:50 +0100 Subject: [PATCH 111/117] TSI and Penning trap also cleaned. Need to test and see if it works --- alpine/PinT/BumponTailInstabilityPinT.cpp | 416 +++++-------------- alpine/PinT/LandauDampingPinT.cpp | 15 +- alpine/PinT/PenningTrapPinT.cpp | 483 +++++----------------- 3 files changed, 218 insertions(+), 696 deletions(-) diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 8bf8547f9..0e405720a 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -6,20 +6,28 @@ // European Conference on Parallel Processing. Springer, Cham, 2017. // // Usage: -// srun ./BumponTailInstabilityPinT -// --info 5 +// Usage: +// srun ./BumponTailInstabilityPinT +// +// --info 5 // nmx = No. of Fourier modes in the x-direction // nmy = No. of Fourier modes in the y-direction // nmz = No. of Fourier modes in the z-direction -// nx = No. of grid points in the x-direction -// ny = No. of grid points in the y-direction -// nz = No. of grid points in the z-direction +// nx = No. of grid points in the x-direction (not used if PIF is also used as coarse propagator) +// ny = No. of grid points in the y-direction (not used if PIF is also used as coarse propagator) +// nz = No. of grid points in the z-direction (not used if PIF is also used as coarse propagator) // Np = Total no. of macro-particles in the simulation +// tolParareal = Parareal tolerance // nCycles = No. of Parareal blocks/cycles // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) +// No. of space procs = Number of MPI ranks to be used in the spatial parallelization +// No. of time procs = Number of MPI ranks to be used in the time parallelization +// coarseTol = Coarse tolerance for PIF if we use PIF as a coarse propagator (will not be used when PIC is used) +// fineTol = fine tolerance for PIF +// coarseType = Type of coarse propagator (PIF or PIC) // Example: -// srun ./BumponTailInstabilityPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 4 B-spline 1 --info 5 +// srun ./BumponTailInstabilityPinT 32 32 32 16 16 16 655360 19.2 0.05 0.05 1e-5 1 B-spline 1 4 16 1e-2 1e-4 PIC --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. 
@@ -39,8 +47,6 @@ #include "ChargedParticlesPinT.hpp" #include "StatesBeginSlice.hpp" #include "StatesEndSlice.hpp" -//#include "LeapFrogPIC.cpp" -//#include "LeapFrogPIF.cpp" #include #include #include @@ -171,6 +177,11 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + //This is just to undo the effect of periodic BCs during the + //error calculation. Otherwise even though the actual error is + //small the computed error might be very large. + //The values (e.g. 10) mentioned here are just an adhoc + //value depending on the domain length. for (unsigned d = 0; d < 3; ++d) { bool isLeft = (diff[d] <= -10.0); bool isRight = (diff[d] >= 10.0); @@ -225,171 +236,6 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp } -double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - Vector_t& length) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - - for (unsigned d = 0; d < 3; ++d) { - bool isLeft = (diff[d] <= -10.0); - bool isRight = (diff[d] >= 10.0); - bool isInside = ((diff[d] > -10.0) && (diff[d] < 10.0)); - diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) - +(isRight * (diff[d] - length[d])); - } - - double myValError = dot(diff, diff).apply(); - - myValError = std::sqrt(myValError); - - //bool isIncluded = (myValError < 10.0); - - //myValError *= isIncluded; - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - //myValnorm *= isIncluded; - - if(myValnorm > valLnorm) valLnorm = myValnorm; - - //excluded += (!isIncluded); - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - lError = localError/localNorm; - - double relError = lError; - - return relError; - -} - - -double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - double myValError = dot(diff, diff).apply(); - myValError = std::sqrt(myValError); - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - if(myValnorm > valLnorm) valLnorm = myValnorm; - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - lError = localError/localNorm; - - double relError = lError; - - return relError; - -} - -double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { - - auto rhoview = rhoPIF.getView(); - auto rhoprevview = rhoPIFprevIter.getView(); - const int nghost = rhoPIF.getNghost(); - using mdrange_type = Kokkos::MDRangePolicy>; - - const FieldLayout_t& layout = rhoPIF.getLayout(); - const Mesh_t& mesh = rhoPIF.get_mesh(); - const Vector& dx = mesh.getMeshSpacing(); - const auto& domain = layout.getDomain(); - Vector Len; - Vector N; - - for (unsigned d=0; d < Dim; ++d) { - N[d] = domain[d].length(); - Len[d] = dx[d] * N[d]; - } - - double AbsError = 0.0; - double Enorm = 0.0; - Kokkos::complex imag = {0.0, 1.0}; - double pi = std::acos(-1.0); - Kokkos::parallel_reduce("Ex field error", - mdrange_type({0, 0, 0}, - {N[0], - N[1], - N[2]}), - KOKKOS_LAMBDA(const int i, - const int j, - const int k, - double& errorSum, - double& fieldSum) - { - - Vector iVec = {i, j, k}; - Vector kVec; - double Dr = 0.0; - for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - Dr += kVec[d] * kVec[d]; - } - - double myError = 0.0; - double myField = 0.0; - Kokkos::complex Ek = {0.0, 0.0}; - Kokkos::complex Ekprev = {0.0, 0.0}; - for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) { - Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); - } - Ekprev = Ekprev - Ek; - myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); - myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); - } - errorSum += myError; - fieldSum += myField; - //Kokkos::complex rhok = rhoview(i+nghost,j+nghost,k+nghost); - //Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); - //rhokprev = rhokprev - rhok; - //myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); - //errorSum += myError; - //myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); - //fieldSum += myField; - - }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); - - Kokkos::fence(); - double globalError = 0.0; - MPI_Allreduce(&AbsError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - double globalNorm = 0.0; - MPI_Allreduce(&Enorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //fieldEnergy *= volume; - - double relError = std::sqrt(globalError)/std::sqrt(globalNorm); - - return relError; -} - - const char* TestName = "TwoStreamInstability"; //const char* TestName = "BumponTailInstability"; @@ -452,20 +298,13 @@ int main(int argc, char *argv[]){ const unsigned int ntFine = std::ceil(dtSlice / dtFine); const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); - //const unsigned int maxIter = std::atoi(argv[12]); - - - //const double tStartMySlice = Ippl::Comm->rank() * dtSlice; - //const double 
tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; - using bunch_type = ChargedParticlesPinT; - using states_begin_type = StatesBeginSlice; - using states_end_type = StatesEndSlice; + using states_type = StatesSlice; std::unique_ptr Pcoarse; - std::unique_ptr Pbegin; - std::unique_ptr Pend; + std::unique_ptr Pbegin; + std::unique_ptr Pend; ippl::NDIndex domainPIC; ippl::NDIndex domainPIF; @@ -543,38 +382,62 @@ int main(int argc, char *argv[]){ size_type Total_particles = 0; - //MPI_Allreduce(&nloc, &Total_particles, 1, - // MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); - - //int rest = (int) (totalP - Total_particles); - - //if ( (rankTime == 0) && (rankSpace < rest) ) { - // ++nloc; - //} - MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM, spaceComm); - - //Q = -\int\int f dx dv double Q = -length[0] * length[1] * length[2]; Pcoarse = std::make_unique(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles); - Pbegin = std::make_unique(PL); - Pend = std::make_unique(PL); + Pbegin = std::make_unique(PL); + Pend = std::make_unique(PL); Pcoarse->nr_m = nrPIC; Pcoarse->nm_m = nmPIF; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); - //Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); - Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); - Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); - //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); - Pcoarse->initFFTSolver(); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->initFFTSolver(); + } + + //////////////////////////////////////////////////////////// + //Initialize an FFT object for getting rho in real space and + //doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for(unsigned d = 0; d < Dim; ++d) { + if(fftParams.template get("r2c_direction") == (int)d) + domainPIFhalf[d] = ippl::Index(domainPIF[d].length()/2 + 1); + else + domainPIFhalf[d] = ippl::Index(domainPIF[d].length()); + } + + FieldLayout_t FLPIFhalf(domainPIFhalf, decomp); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + Pcoarse->rhoPIFreal_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + + Pcoarse->fft_mp = std::make_shared(FLPIF, FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// + Vector_t minU, maxU; for (unsigned d = 0; d shapetype_m = argv[13]; Pcoarse->shapedegree_m = std::atoi(argv[14]); IpplTimings::startTimer(initializeShapeFunctionPIF); @@ -603,54 +465,22 @@ int main(int argc, char *argv[]){ double coarseTol = std::atof(argv[17]); - double fineTol = 1e-3;//1e-12; + double fineTol = std::atof(argv[18]); Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; IpplTimings::startTimer(particleCreation); - - - //Pcoarse->initNUFFT(FLPIF); #ifdef KOKKOS_ENABLE_CUDA + //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs - //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - //if(Ippl::Comm->rank() == 0) { - // Kokkos::Random_XorShift64_Pool<> 
rand_pool64((size_type)(42 + 100*Ippl::Comm->rank())); - // Kokkos::parallel_for(nloc, - // generate_random, Dim>( - // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, delta, kw, - // sigma, muBulk, muBeam, nlocBulk, minU, maxU)); - - - // Kokkos::fence(); - // size_type bufSize = Pbegin->packedSize(nloc); - // std::vector requests(0); - // int sends = 0; - // for(int rank = 1; rank < Ippl::Comm->size(); ++rank) { - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); - // requests.resize(requests.size() + 1); - // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc); - // buf->resetWritePos(); - // ++sends; - // } - // MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); - //} - //else { - // size_type bufSize = Pbegin->packedSize(nloc); - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc); - // buf->resetReadPos(); - //} - //Ippl::Comm->barrier(); - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pbegin->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + + IpplTimings::startTimer(timeCommunication); + //For some reason using the next_tag with multiple cycles is not + //working so we use static tags here + tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); if(rankTime == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); @@ -668,6 +498,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); @@ -676,14 +507,22 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); - Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + IpplTimings::startTimer(coarsePropagator); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); + } + else { + //PIF with coarse tolerance as coarse propagator + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); + } + IpplTimings::stopTimer(coarsePropagator); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); if(rankTime < sizeTime-1) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -692,8 +531,10 @@ int main(int argc, char *argv[]){ buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } + IpplTimings::stopTimer(timeCommunication); #else + //Note the CPU version has not been tested. 
Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( @@ -705,8 +546,6 @@ int main(int argc, char *argv[]){ Ippl::Comm->barrier(); #endif - //Pcoarse->dumpParticleData(0, Pcoarse->R, Pcoarse->P, "Parareal"); - msg << "Parareal " << TestName << endl @@ -728,57 +567,6 @@ int main(int argc, char *argv[]){ msg << "particles created and initial conditions assigned " << endl; - //Copy initial conditions as they are needed later - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - ////Get initial guess for ranks other than 0 by propagating the coarse solver - //IpplTimings::startTimer(coarsePropagator); - //if (Ippl::Comm->rank() > 0) { - // Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice); - //} - // - //Ippl::Comm->barrier(); - // - //IpplTimings::stopTimer(coarsePropagator); - - //msg << "First Leap frog PIC done " << endl; - - // - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - ////Run the coarse integrator to get the values at the end of the time slice - //IpplTimings::startTimer(coarsePropagator); - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice); - //IpplTimings::stopTimer(coarsePropagator); - //msg << "Second Leap frog PIC done " << endl; - - ////Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - - ////The following might not be needed - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - //msg << "Starting parareal iterations ..." 
<< endl; - //bool isConverged = false; - //bool isPreviousDomainConverged; - //if(Ippl::Comm->rank() == 0) { - // isPreviousDomainConverged = true; - //} - //else { - // isPreviousDomainConverged = false; - //} - - int sign = 1; for (unsigned int nc=0; nc < nCycles; nc++) { @@ -821,9 +609,6 @@ int main(int argc, char *argv[]){ Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); - //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); - IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); @@ -855,30 +640,28 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - //Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->LeapFrogPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + } + else { + Pcoarse->LeapFrogPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + } IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); - - PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); - IpplTimings::stopTimer(computeErrors); - //} if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) { isConverged = true; } - IpplTimings::startTimer(timeCommunication); if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); @@ -899,11 +682,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); - //if(Ippl::Comm->rank() == Ippl::Comm->size()-1) { - //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); - //} IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); @@ -913,7 +692,6 @@ int main(int argc, char *argv[]){ MPI_Barrier(MPI_COMM_WORLD); if((nCycles > 1) && (nc < (nCycles - 1))) { - IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); //send, receive criteria and tStartMySlice are reversed at the end of the cycle @@ -935,12 +713,14 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); if(recvCriteria) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); @@ -949,15 +729,21 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - //Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); - 
Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + IpplTimings::startTimer(coarsePropagator); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); + } + else { + Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); + } + IpplTimings::stopTimer(coarsePropagator); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); - + IpplTimings::startTimer(timeCommunication); if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index c6bbd3536..947a4ed5d 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -428,6 +428,8 @@ int main(int argc, char *argv[]){ //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs + + IpplTimings::startTimer(timeCommunication); //For some reason using the next_tag with multiple cycles is not //working so we use static tags here tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); @@ -447,6 +449,8 @@ int main(int argc, char *argv[]){ Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + IpplTimings::stopTimer(timeCommunication); + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); Kokkos::deep_copy(Pend->P.getView(), Pbegin->P.getView()); @@ -454,7 +458,7 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - + IpplTimings::startTimer(coarsePropagator); if(Pcoarse->coarsetype_m == "PIC") { Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, spaceComm); } @@ -462,12 +466,14 @@ int main(int argc, char *argv[]){ //PIF with coarse tolerance as coarse propagator Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, 0, 0, coarse, spaceComm); } + IpplTimings::stopTimer(coarsePropagator); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); if(rankTime < sizeTime-1) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -476,6 +482,7 @@ int main(int argc, char *argv[]){ buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } + IpplTimings::stopTimer(timeCommunication); #else //Note the CPU version has not been tested. 
Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); @@ -633,7 +640,6 @@ int main(int argc, char *argv[]){ MPI_Barrier(MPI_COMM_WORLD); if((nCycles > 1) && (nc < (nCycles - 1))) { - IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); //send, receive criteria and tStartMySlice are reversed at the end of the cycle @@ -655,12 +661,14 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); if(recvCriteria) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); @@ -669,12 +677,14 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(coarsePropagator); if(Pcoarse->coarsetype_m == "PIC") { Pcoarse->LeapFrogPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, spaceComm); } else { Pcoarse->LeapFrogPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, 0, 0, coarse, spaceComm); } + IpplTimings::stopTimer(coarsePropagator); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); @@ -682,6 +692,7 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 95ae8387a..f5026949c 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -6,20 +6,27 @@ // European Conference on Parallel Processing. Springer, Cham, 2017. // // Usage: -// srun ./PenningTrapPinT -// --info 5 +// srun ./PenningTrapPinT +// +// --info 5 // nmx = No. of Fourier modes in the x-direction // nmy = No. of Fourier modes in the y-direction // nmz = No. of Fourier modes in the z-direction -// nx = No. of grid points in the x-direction -// ny = No. of grid points in the y-direction -// nz = No. of grid points in the z-direction +// nx = No. of grid points in the x-direction (not used if PIF is also used as coarse propagator) +// ny = No. of grid points in the y-direction (not used if PIF is also used as coarse propagator) +// nz = No. of grid points in the z-direction (not used if PIF is also used as coarse propagator) // Np = Total no. of macro-particles in the simulation +// tolParareal = Parareal tolerance // nCycles = No. of Parareal blocks/cycles // ShapeType = Shape function type B-spline only for the moment // degree = B-spline degree (-1 for delta function) +// No. of space procs = Number of MPI ranks to be used in the spatial parallelization +// No. 
of time procs = Number of MPI ranks to be used in the time parallelization +// coarseTol = Coarse tolerance for PIF if we use PIF as a coarse propagator (will not be used when PIC is used) +// fineTol = fine tolerance for PIF +// coarseType = Type of coarse propagator (PIF or PIC) // Example: -// srun ./PenningTrapPinT 32 32 32 32 32 32 655360 20.0 0.05 0.05 1e-5 4 B-spline 1 --info 5 +// srun ./PenningTrapPinT 32 32 32 16 16 16 655360 19.2 0.05 0.05 1e-5 1 B-spline 1 4 16 1e-2 1e-4 PIC --info 5 // // Copyright (c) 2022, Sriramkrishnan Muralikrishnan, // Jülich Supercomputing Centre, Jülich, Germany. @@ -39,8 +46,6 @@ #include "ChargedParticlesPinT.hpp" #include "StatesBeginSlice.hpp" #include "StatesEndSlice.hpp" -//#include "LeapFrogPIC.cpp" -//#include "LeapFrogPIF.cpp" #include #include #include @@ -158,6 +163,11 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ Vector_t diff = Qview(i) - QprevIterView(i); + //This is just to undo the effect of periodic BCs during the + //error calculation. Otherwise even though the actual error is + //small the computed error might be very large. + //The values (e.g. 22) mentioned here are just an adhoc + //value depending on the domain length. for (unsigned d = 0; d < 3; ++d) { bool isLeft = (diff[d] <= -22.0); bool isRight = (diff[d] >= 22.0); @@ -174,12 +184,9 @@ double computeRL2Error(ParticleAttrib& Q, ParticleAttrib& Qp Kokkos::fence(); double globalError = 0.0; - //MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); double globalNorm = 0.0; - //MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); - //lError = std::sqrt(localError)/std::sqrt(localNorm); double relError = std::sqrt(globalError) / std::sqrt(globalNorm); @@ -205,12 +212,9 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp Kokkos::fence(); double globalError = 0.0; - //MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); MPI_Allreduce(&localError, &globalError, 1, MPI_DOUBLE, MPI_SUM, spaceComm); double globalNorm = 0.0; - //MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); MPI_Allreduce(&localNorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, spaceComm); - //lError = std::sqrt(localError)/std::sqrt(localNorm); double relError = std::sqrt(globalError) / std::sqrt(globalNorm); @@ -218,194 +222,19 @@ double computePL2Error(ParticleAttrib& Q, ParticleAttrib& Qp } -double computeRLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError, - Vector_t& length) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. 
max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - - for (unsigned d = 0; d < 3; ++d) { - bool isLeft = (diff[d] <= -22.0); - bool isRight = (diff[d] >= 22.0); - bool isInside = ((diff[d] > -22.0) && (diff[d] < 22.0)); - diff[d] = (isInside * diff[d]) + (isLeft * (diff[d] + length[d])) - +(isRight * (diff[d] - length[d])); - } - - double myValError = dot(diff, diff).apply(); - - myValError = std::sqrt(myValError); - - //bool isIncluded = (myValError < 10.0); - - //myValError *= isIncluded; - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - //myValnorm *= isIncluded; - - if(myValnorm > valLnorm) valLnorm = myValnorm; - - //excluded += (!isIncluded); - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - lError = localError/localNorm; - - double relError = lError; - - return relError; - -} - -double computePLinfError(ParticleAttrib& Q, ParticleAttrib& QprevIter, - const unsigned int& /*iter*/, const int& /*myrank*/, double& lError) { - - auto Qview = Q.getView(); - auto QprevIterView = QprevIter.getView(); - double localError = 0.0; - double localNorm = 0.0; - - Kokkos::parallel_reduce("Abs. max error and norm", Q.size(), - KOKKOS_LAMBDA(const int i, double& valLError, double& valLnorm){ - Vector_t diff = Qview(i) - QprevIterView(i); - double myValError = dot(diff, diff).apply(); - myValError = std::sqrt(myValError); - - if(myValError > valLError) valLError = myValError; - - double myValnorm = dot(Qview(i), Qview(i)).apply(); - myValnorm = std::sqrt(myValnorm); - - if(myValnorm > valLnorm) valLnorm = myValnorm; - }, Kokkos::Max(localError), Kokkos::Max(localNorm)); - - Kokkos::fence(); - lError = localError/localNorm; - - double relError = lError; - - return relError; - -} - - -double computeFieldError(CxField_t& rhoPIF, CxField_t& rhoPIFprevIter) { - - auto rhoview = rhoPIF.getView(); - auto rhoprevview = rhoPIFprevIter.getView(); - const int nghost = rhoPIF.getNghost(); - using mdrange_type = Kokkos::MDRangePolicy>; - - const FieldLayout_t& layout = rhoPIF.getLayout(); - const Mesh_t& mesh = rhoPIF.get_mesh(); - const Vector& dx = mesh.getMeshSpacing(); - const auto& domain = layout.getDomain(); - Vector Len; - Vector N; - - for (unsigned d=0; d < Dim; ++d) { - N[d] = domain[d].length(); - Len[d] = dx[d] * N[d]; - } - - double AbsError = 0.0; - double Enorm = 0.0; - Kokkos::complex imag = {0.0, 1.0}; - double pi = std::acos(-1.0); - Kokkos::parallel_reduce("Ex field error", - mdrange_type({0, 0, 0}, - {N[0], - N[1], - N[2]}), - KOKKOS_LAMBDA(const int i, - const int j, - const int k, - double& errorSum, - double& fieldSum) - { - - Vector iVec = {i, j, k}; - Vector kVec; - double Dr = 0.0; - for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); - Dr += kVec[d] * kVec[d]; - } - - double myError = 0.0; - double myField = 0.0; - Kokkos::complex Ek = {0.0, 0.0}; - Kokkos::complex Ekprev = {0.0, 0.0}; - for(size_t d = 0; d < Dim; ++d) { - if(Dr != 0.0) { - Ek = -(imag * kVec[d] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - Ekprev = -(imag * kVec[d] * rhoprevview(i+nghost,j+nghost,k+nghost) / Dr); - } - Ekprev = Ekprev - Ek; - myError += Ekprev.real() * Ekprev.real() + Ekprev.imag() * Ekprev.imag(); - myField += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); - } - errorSum += myError; - fieldSum += myField; - 
//Kokkos::complex rhok = rhoview(i+nghost,j+nghost,k+nghost); - //Kokkos::complex rhokprev = rhoprevview(i+nghost,j+nghost,k+nghost); - //rhokprev = rhokprev - rhok; - //myError = rhokprev.real() * rhokprev.real() + rhokprev.imag() * rhokprev.imag(); - //errorSum += myError; - //myField = rhok.real() * rhok.real() + rhok.imag() * rhok.imag(); - //fieldSum += myField; - - }, Kokkos::Sum(AbsError), Kokkos::Sum(Enorm)); - - Kokkos::fence(); - double globalError = 0.0; - MPI_Allreduce(&AbsError, &globalError, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - double globalNorm = 0.0; - MPI_Allreduce(&Enorm, &globalNorm, 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - //double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); - //fieldEnergy *= volume; - - double relError = std::sqrt(globalError)/std::sqrt(globalNorm); - - return relError; -} - - const char* TestName = "PenningTrapPinT"; int main(int argc, char *argv[]){ - Ippl ippl(argc, argv); - - //int rankWorld, sizeWorld; - //MPI_Init(&argc, &argv); - //MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); - //MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld); int spaceColor, timeColor; MPI_Comm spaceComm, timeComm; int spaceProcs = std::atoi(argv[15]); int timeProcs = std::atoi(argv[16]); - //spaceColor = rankWorld / spaceProcs; - //timeColor = rankWorld % spaceProcs; spaceColor = Ippl::Comm->rank() / spaceProcs; timeColor = Ippl::Comm->rank() % spaceProcs; - //MPI_Comm_split(MPI_COMM_WORLD, spaceColor, rankWorld, &spaceComm); - //MPI_Comm_split(MPI_COMM_WORLD, timeColor, rankWorld, &timeComm); MPI_Comm_split(Ippl::getComm(), spaceColor, Ippl::Comm->rank(), &spaceComm); MPI_Comm_split(Ippl::getComm(), timeColor, Ippl::Comm->rank(), &timeComm); @@ -416,9 +245,6 @@ int main(int argc, char *argv[]){ MPI_Comm_rank(timeComm, &rankTime); MPI_Comm_size(timeComm, &sizeTime); - //Ippl ippl(argc, argv, spaceComm); - - //Inform msg(TestName, sizeSpace-1); Inform msg(TestName, Ippl::Comm->size()-1); Inform msg2all(TestName,INFORM_ALL_NODES); @@ -443,10 +269,6 @@ int main(int argc, char *argv[]){ static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); static IpplTimings::TimerRef computeErrors = IpplTimings::getTimer("computeErrors"); static IpplTimings::TimerRef initializeShapeFunctionPIF = IpplTimings::getTimer("initializeShapeFunctionPIF"); - static IpplTimings::TimerRef initializeCycles = IpplTimings::getTimer("initializeCycles"); - static IpplTimings::TimerRef initialComm = IpplTimings::getTimer("initialComm"); - static IpplTimings::TimerRef initialCoarse = IpplTimings::getTimer("initialCoarse"); - static IpplTimings::TimerRef warmupStep = IpplTimings::getTimer("warmupStep"); IpplTimings::startTimer(mainTimer); @@ -460,18 +282,13 @@ int main(int argc, char *argv[]){ const unsigned int ntFine = std::ceil(dtSlice / dtFine); const unsigned int ntCoarse = std::ceil(dtSlice / dtCoarse); const double tol = std::atof(argv[11]); - //const unsigned int maxIter = std::atoi(argv[12]); - - //const double tEndMySlice = (Ippl::Comm->rank() + 1) * dtSlice; - using bunch_type = ChargedParticlesPinT; - using states_begin_type = StatesBeginSlice; - using states_end_type = StatesEndSlice; + using states_type = StatesSlice; std::unique_ptr Pcoarse; - std::unique_ptr Pbegin; - std::unique_ptr Pend; + std::unique_ptr Pbegin; + std::unique_ptr Pend; ippl::NDIndex domainPIC; ippl::NDIndex domainPIF; @@ -488,26 +305,20 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t rmin(0.0); Vector_t 
rmax(25.0); - //Vector_t rmax(20.0); Vector_t length = rmax - rmin; double dxPIC = length[0] / nrPIC[0]; double dyPIC = length[1] / nrPIC[1]; double dzPIC = length[2] / nrPIC[2]; - Vector_t mu, sd; for (unsigned d = 0; d(PL,hrPIC,rmin,rmax,decomp,Q,Total_particles); - Pbegin = std::make_unique(PL); - Pend = std::make_unique(PL); + Pbegin = std::make_unique(PL); + Pend = std::make_unique(PL); Pcoarse->nr_m = nrPIC; Pcoarse->nm_m = nmPIF; Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); - //Pcoarse->rhoPIFprevIter_m.initialize(meshPIF, FLPIF); - Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); - Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); - //Pcoarse->EfieldPICprevIter_m.initialize(meshPIC, FLPIC); - Pcoarse->initFFTSolver(); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); + Pcoarse->initFFTSolver(); + } + + //////////////////////////////////////////////////////////// + //Initialize an FFT object for getting rho in real space and + //doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for(unsigned d = 0; d < Dim; ++d) { + if(fftParams.template get("r2c_direction") == (int)d) + domainPIFhalf[d] = ippl::Index(domainPIF[d].length()/2 + 1); + else + domainPIFhalf[d] = ippl::Index(domainPIF[d].length()); + } + + + FieldLayout_t FLPIFhalf(domainPIFhalf, decomp); + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + Pcoarse->rhoPIFreal_m.initialize(meshPIF, FLPIF); + Pcoarse->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + + Pcoarse->fft_mp = std::make_shared(FLPIF, FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// + Vector_t minU, maxU; for (unsigned d = 0; d initNUFFTs(FLPIF, coarseTol, fineTol); std::string coarse = "Coarse"; std::string fine = "Fine"; - IpplTimings::startTimer(particleCreation); - - - - //Pcoarse->initNUFFT(FLPIF); - #ifdef KOKKOS_ENABLE_CUDA //If we don't do the following even with the same seed the initial //condition is not the same on different GPUs - //tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - //if(rankTime == 0) { - // Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); - // Kokkos::parallel_for(nloc, - // generate_random, Dim>( - // Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, mu, sd, - // minU, maxU)); - // Kokkos::fence(); - // size_type bufSize = Pbegin->packedSize(nloc); - // std::vector requests(0); - // int sends = 0; - // for(int rank = 1; rank < sizeTime; ++rank) { - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND + sends, bufSize); - // requests.resize(requests.size() + 1); - // Ippl::Comm->isend(rank, tag, *Pbegin, *buf, requests.back(), nloc, timeComm); - // buf->resetWritePos(); - // ++sends; - // } - // MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); - //} - //else { - // size_type bufSize = Pbegin->packedSize(nloc); - // buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); - // Ippl::Comm->recv(0, tag, *Pbegin, *buf, bufSize, nloc, timeComm); - // buf->resetReadPos(); - //} - - 
//Kokkos::deep_copy(Pcoarse->Rfine.getView(), Pbegin->R.getView()); - //Kokkos::deep_copy(Pcoarse->Pfine.getView(), Pbegin->P.getView()); - - - //If we don't do the following even with the same seed the initial - //condition is not the same on different GPUs - tag = Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); - - IpplTimings::startTimer(initialComm); + + IpplTimings::startTimer(timeCommunication); + //For some reason using the next_tag with multiple cycles is not + //working so we use static tags here + tag = 500;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); + if(rankTime == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); Kokkos::parallel_for(nloc, @@ -650,7 +451,7 @@ int main(int argc, char *argv[]){ Ippl::Comm->recv(rankTime-1, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } - IpplTimings::stopTimer(initialComm); + IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); @@ -659,17 +460,21 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - IpplTimings::startTimer(initialCoarse); - //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); - Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - IpplTimings::stopTimer(initialCoarse); - + IpplTimings::startTimer(coarsePropagator); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, Bext, spaceComm); + } + else { + Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, rankTime * dtSlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + } + IpplTimings::stopTimer(coarsePropagator); + IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); - IpplTimings::startTimer(initialComm); + IpplTimings::startTimer(timeCommunication); if(rankTime < sizeTime-1) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -678,8 +483,9 @@ int main(int argc, char *argv[]){ buf->resetWritePos(); MPI_Wait(&request, MPI_STATUS_IGNORE); } - IpplTimings::stopTimer(initialComm); + IpplTimings::stopTimer(timeCommunication); #else + //Note the CPU version has not been tested. Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(0)); Kokkos::parallel_for(nloc, generate_random, Dim>( @@ -687,11 +493,11 @@ int main(int argc, char *argv[]){ minU, maxU)); Kokkos::fence(); - //Ippl::Comm->barrier(); + Ippl::Comm->barrier(); #endif - - msg << "Parareal Penning trap" + msg << "Parareal " + << TestName << endl << "Slice dT: " << dtSlice << endl @@ -700,7 +506,6 @@ int main(int argc, char *argv[]){ << "No. of coarse time steps: " << ntCoarse << endl << "Tolerance: " << tol - //<< " Max. iterations: " << maxIter << " No. 
of cycles: " << nCycles << endl << "Np= " << Total_particles @@ -711,66 +516,9 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(particleCreation); msg << "particles created and initial conditions assigned " << endl; - - //Copy initial conditions as they are needed later - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R0.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pcoarse->P0.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - ////Get initial guess for ranks other than 0 by propagating the coarse solver - //IpplTimings::startTimer(coarsePropagator); - //if (Ippl::Comm->rank() > 0) { - // Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, Ippl::Comm->rank()*ntCoarse, dtCoarse, tStartMySlice, Bext); - //} - // - //Ippl::Comm->barrier(); - // - //IpplTimings::stopTimer(coarsePropagator); - - //msg << "First Boris PIC done " << endl; - - // - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pbegin->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pbegin->P.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - ////Run the coarse integrator to get the values at the end of the time slice - //IpplTimings::startTimer(coarsePropagator); - //Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext); - //IpplTimings::stopTimer(coarsePropagator); - //msg << "Second Boris PIC done " << endl; - - ////Kokkos::deep_copy(Pcoarse->EfieldPICprevIter_m.getView(), Pcoarse->EfieldPIC_m.getView()); - - ////The following might not be needed - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pend->R.getView(), Pcoarse->R.getView()); - //Kokkos::deep_copy(Pend->P.getView(), Pcoarse->P.getView()); - //IpplTimings::stopTimer(deepCopy); - - - //msg << "Starting parareal iterations ..." 
<< endl; - //bool isConverged = false; - //bool isPreviousDomainConverged; - //if(Ippl::Comm->rank() == 0) { - // isPreviousDomainConverged = true; - //} - //else { - // isPreviousDomainConverged = false; - //} - - int sign = 1; - //coarseTol = 1e-3; - //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - //Pcoarse->BorisPIF(Pcoarse->Rfine, Pcoarse->Pfine, (rankTime+1)*ntFine, dtFine, 0, 0, 0, - // Bext, rankTime, rankSpace, fine, spaceComm); for (unsigned int nc=0; nc < nCycles; nc++) { - double tStartMySlice; bool sendCriteria, recvCriteria; bool isConverged = false; @@ -796,13 +544,9 @@ int main(int argc, char *argv[]){ tStartMySlice = (nc * tEndCycle) + (((sizeTime - 1) - rankTime) * dtSlice); msg.setPrintNode(0); } - //Pcoarse->time_m = tStartMySlice; unsigned int it = 0; while (!isConverged) { - //while ((!isPreviousDomainConverged) || (!isConverged)) { - //for (unsigned int it=0; it < maxIter; it++) { - //Run fine integrator in parallel IpplTimings::startTimer(finePropagator); Pcoarse->BorisPIF(Pbegin->R, Pbegin->P, ntFine, dtFine, tStartMySlice, nc+1, it+1, @@ -814,10 +558,6 @@ int main(int argc, char *argv[]){ Pend->R = Pbegin->R - Pcoarse->R; Pend->P = Pbegin->P - Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gk"); - //Pcoarse->dumpParticleData(it+1, Pbegin->R, Pbegin->P, "Fk"); - - IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->RprevIter.getView(), Pcoarse->R.getView()); Kokkos::deep_copy(Pcoarse->PprevIter.getView(), Pcoarse->P.getView()); @@ -849,32 +589,27 @@ int main(int argc, char *argv[]){ IpplTimings::stopTimer(deepCopy); IpplTimings::startTimer(coarsePropagator); - //coarseTol = 1e-4;//(double)(std::pow(0.1,std::min((int)(it+2),4))); - //Pcoarse->initNUFFTs(FLPIF, coarseTol, fineTol); - Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - //Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->BorisPIC(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + } + else { + Pcoarse->BorisPIF(Pcoarse->R, Pcoarse->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + } IpplTimings::stopTimer(coarsePropagator); Pend->R = Pend->R + Pcoarse->R; Pend->P = Pend->P + Pcoarse->P; - //Pcoarse->dumpParticleData(it+1, Pcoarse->R, Pcoarse->P, "Gkp1"); - PL.applyBC(Pend->R, PL.getRegionLayout().getDomain()); IpplTimings::startTimer(computeErrors); - //double localRerror, localPerror; double Rerror = computeRL2Error(Pcoarse->R, Pcoarse->RprevIter, length, spaceComm); double Perror = computePL2Error(Pcoarse->P, Pcoarse->PprevIter, spaceComm); - //double Rerror = computeRL2Error(Pcoarse->Rfine, Pend->R, length, spaceComm); - //double Perror = computePL2Error(Pcoarse->Pfine, Pend->P, spaceComm); - IpplTimings::stopTimer(computeErrors); if((Rerror <= tol) && (Perror <= tol) && isPreviousDomainConverged) { isConverged = true; } - IpplTimings::startTimer(timeCommunication); if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); @@ -895,26 +630,16 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writeError(Rerror, Perror, it+1); Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); - //Pcoarse->dumpParticleData(it+1, Pend->R, Pend->P, "Parareal"); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); - + it += 1; - //if(isConverged && isPreviousDomainConverged) { - // break; - //} } 
- - //std::cout << "Before barrier in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; - //Ippl::Comm->barrier(); + MPI_Barrier(MPI_COMM_WORLD); - //msg << "Communication started in cycle: " << nc+1 << endl; - //std::cout << "Communication started in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; if((nCycles > 1) && (nc < (nCycles - 1))) { - IpplTimings::startTimer(timeCommunication); tag = 1000;//Ippl::Comm->next_tag(IPPL_PARAREAL_APP, IPPL_APP_CYCLE); //send, receive criteria and tStartMySlice are reversed at the end of the cycle @@ -936,12 +661,14 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pbegin->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); + IpplTimings::startTimer(timeCommunication); if(recvCriteria) { size_type bufSize = Pbegin->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_RECV, bufSize); Ippl::Comm->recv(rankTime+sign, tag, *Pbegin, *buf, bufSize, nloc, timeComm); buf->resetReadPos(); } + IpplTimings::stopTimer(timeCommunication); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pend->R.getView(), Pbegin->R.getView()); @@ -950,15 +677,21 @@ int main(int argc, char *argv[]){ Kokkos::deep_copy(Pcoarse->P0.getView(), Pbegin->P.getView()); IpplTimings::stopTimer(deepCopy); - Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); - //Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + IpplTimings::startTimer(coarsePropagator); + if(Pcoarse->coarsetype_m == "PIC") { + Pcoarse->BorisPIC(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, Bext, spaceComm); + } + else { + Pcoarse->BorisPIF(Pend->R, Pend->P, ntCoarse, dtCoarse, tStartMySlice, 0, 0, Bext, 0, 0, coarse, spaceComm); + } + IpplTimings::stopTimer(coarsePropagator); IpplTimings::startTimer(deepCopy); Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); IpplTimings::stopTimer(deepCopy); - + IpplTimings::startTimer(timeCommunication); if(sendCriteria) { size_type bufSize = Pend->packedSize(nloc); buffer_type buf = Ippl::Comm->getBuffer(IPPL_PARAREAL_SEND, bufSize); @@ -968,14 +701,6 @@ int main(int argc, char *argv[]){ MPI_Wait(&request, MPI_STATUS_IGNORE); } IpplTimings::stopTimer(timeCommunication); - //std::cout << "Communication finished in cycle: " << nc+1 << "for rank: " << Ippl::Comm->rank() << std::endl; - //Ippl::Comm->barrier(); - - //msg << "Communication finished in cycle: " << nc+1 << endl; - //IpplTimings::startTimer(deepCopy); - //Kokkos::deep_copy(Pcoarse->R.getView(), Pend->R.getView()); - //Kokkos::deep_copy(Pcoarse->P.getView(), Pend->P.getView()); - //IpplTimings::stopTimer(deepCopy); sign *= -1; } } From fff184da8275a176c9fd2276255cff8731938766 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Sat, 2 Mar 2024 20:24:26 +0100 Subject: [PATCH 112/117] Momentum difference seems to stem from different initial seed only --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 32 +++++++++++-------- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 4 +-- alpine/PinT/BumponTailInstabilityPinT.cpp | 5 +-- alpine/PinT/ChargedParticlesPinT.hpp | 12 ++++--- alpine/PinT/PenningTrapPinT.cpp | 5 +-- 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index bb762d408..7028cf094 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ 
b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -523,22 +523,26 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double charge = temp; - Vector_t totalMomentum = 0.0; - - Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, Vector_t& valL){ - valL += (-qView(i)) * Pview(i); - }, Kokkos::Sum>(totalMomentum)); - - Vector_t globalMom; + Vector_t totalMomentum = 0.0; + + for(size_t d = 0; d < Dim; ++d) { + double tempD = 0.0; + Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + valL += (-qView(i)) * Pview(i)[d]; + }, Kokkos::Sum(tempD)); + totalMomentum[d] = tempD; + } + + Vector_t globalMom; - double magMomentum = 0.0; - for(size_t d = 0; d < Dim; ++d) { - MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); - magMomentum += globalMom[d] * globalMom[d]; - } + double magMomentum = 0.0; + for(size_t d = 0; d < Dim; ++d) { + MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, Ippl::getComm()); + magMomentum += globalMom[d] * globalMom[d]; + } - magMomentum = std::sqrt(magMomentum); + magMomentum = std::sqrt(magMomentum); if (Ippl::Comm->rank() == 0) { std::stringstream fname; diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index 54984352e..fc01f8228 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -308,7 +308,7 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - //P->dumpEnergy(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); double alpha = -0.5 * dt; @@ -387,7 +387,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - //P->dumpEnergy(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 0e405720a..94fd4dd8a 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -45,8 +45,7 @@ // #include "ChargedParticlesPinT.hpp" -#include "StatesBeginSlice.hpp" -#include "StatesEndSlice.hpp" +#include "StatesSlice.hpp" #include #include #include @@ -397,6 +396,8 @@ int main(int argc, char *argv[]){ Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); + Pcoarse->coarsetype_m = argv[19]; + if(Pcoarse->coarsetype_m == "PIC") { Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 31835f9e4..b971bfa33 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -420,10 +420,14 @@ class ChargedParticlesPinT : public ippl::ParticleBase { Vector_t totalMomentum = 0.0; - Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(), - KOKKOS_LAMBDA(const int i, Vector_t& valL){ - valL += (-qView(i)) * Pview(i); - }, Kokkos::Sum>(totalMomentum)); + for(size_t d = 0; d < Dim; ++d) { + double tempD = 0.0; + Kokkos::parallel_reduce("Total Momentum", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL){ + valL += (-qView(i)) * Pview(i)[d]; + }, Kokkos::Sum(tempD)); + totalMomentum[d] = tempD; + } Vector_t globalMom; diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 
f5026949c..270fba8c2 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -44,8 +44,7 @@ // #include "ChargedParticlesPinT.hpp" -#include "StatesBeginSlice.hpp" -#include "StatesEndSlice.hpp" +#include "StatesSlice.hpp" #include #include #include @@ -352,6 +351,8 @@ int main(int argc, char *argv[]){ Pcoarse->rhoPIF_m.initialize(meshPIF, FLPIF); Pcoarse->Sk_m.initialize(meshPIF, FLPIF); + Pcoarse->coarsetype_m = argv[19]; + if(Pcoarse->coarsetype_m == "PIC") { Pcoarse->rhoPIC_m.initialize(meshPIC, FLPIC); Pcoarse->EfieldPIC_m.initialize(meshPIC, FLPIC); From c4104ed9a9bf678389ead4bf61f02ff12500ef9a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Thu, 28 Mar 2024 09:49:42 +0100 Subject: [PATCH 113/117] Uncommitted changes committed --- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 4 ++-- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 +- alpine/PinT/ChargedParticlesPinT.hpp | 22 +++++++++++----------- alpine/PinT/LandauDampingPinT.cpp | 2 +- alpine/PinT/PenningTrapPinT.cpp | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index fc01f8228..54984352e 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -308,7 +308,7 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - P->dumpEnergy(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); double alpha = -0.5 * dt; @@ -387,7 +387,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - P->dumpEnergy(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 94fd4dd8a..6147c543c 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -683,7 +683,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index b971bfa33..31feaeb28 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -697,8 +697,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, int rankTime, int rankSpace, + const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, + const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -729,8 +729,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - dumpFieldEnergy(nc, iter, rankTime, rankSpace); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpFieldEnergy(nc, iter, rankTime, rankSpace); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } for (unsigned 
int it=0; it { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - dumpFieldEnergy(nc, iter, rankTime, rankSpace); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpFieldEnergy(nc, iter, rankTime, rankSpace); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } @@ -782,9 +782,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, const double& Bext, - int rankTime, int rankSpace, + const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, + const unsigned int& /*iter*/, const double& Bext, + int /*rankTime*/, int /*rankSpace*/, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -813,7 +813,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -899,7 +899,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 947a4ed5d..f627ccc7f 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -630,7 +630,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 270fba8c2..a5e90b42f 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -631,7 +631,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); From 7a61baa7edde3a1ba942250a494b8cc79c9e0e38 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 23 Apr 2024 10:42:16 +0200 Subject: [PATCH 114/117] Bug in shape function corrected --- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 2 +- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 +- alpine/PinT/ChargedParticlesPinT.hpp | 24 +-- alpine/PinT/LandauDampingPinT.cpp | 4 +- alpine/PinT/PenningTrapPinT.cpp | 2 +- src/FFT/FFT.hpp | 4 +- test/FFT/TestNUFFT1.cpp | 183 +++++++++--------- test/FFT/TestNUFFT2.cpp | 20 +- 8 files changed, 120 insertions(+), 121 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 7028cf094..959fd119c 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -604,7 +604,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double Sk = 1.0; 
for(size_t d = 0; d < Dim; ++d) { kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); - double kh = kVec[d] * dx[d]; + double kh = kVec[d] * dx[d] / 2; bool isNotZero = (kh != 0.0); double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); double arg = isNotZero * (Kokkos::sin(kh) * factor) + diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 6147c543c..94fd4dd8a 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -683,7 +683,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 31feaeb28..d94309c60 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -524,7 +524,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); - double kh = kVec[d] * dx[d]; + double kh = kVec[d] * dx[d] / 2; bool isNotZero = (kh != 0.0); double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); double arg = isNotZero * (Kokkos::sin(kh) * factor) + @@ -697,8 +697,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, - const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, int rankTime, int rankSpace, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -729,8 +729,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - //dumpFieldEnergy(nc, iter, rankTime, rankSpace); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpFieldEnergy(nc, iter, rankTime, rankSpace); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - //dumpFieldEnergy(nc, iter, rankTime, rankSpace); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpFieldEnergy(nc, iter, rankTime, rankSpace); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } @@ -782,9 +782,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, - const unsigned int& /*iter*/, const double& Bext, - int /*rankTime*/, int /*rankSpace*/, + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, const double& Bext, + int rankTime, int rankSpace, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -813,7 +813,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - //dumpEnergy(nc, iter, 
Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -899,7 +899,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index f627ccc7f..99f625eb9 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -302,6 +302,7 @@ int main(int argc, char *argv[]){ // create mesh and layout objects for this problem domain Vector_t kw = {0.5, 0.5, 0.5}; + //Vector_t kw = {1.0, 1.0, 1.0}; Vector_t alpha = {0.05, 0.05, 0.05}; //Vector_t alpha = {0.5, 0.5, 0.5}; Vector_t rmin(0.0); @@ -436,6 +437,7 @@ int main(int argc, char *argv[]){ if(rankTime == 0) { Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100*rankSpace)); + //Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(79 + 100*rankSpace)); Kokkos::parallel_for(nloc, generate_random, Dim>( Pbegin->R.getView(), Pbegin->P.getView(), rand_pool64, alpha, kw, minU, maxU)); @@ -630,7 +632,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index a5e90b42f..270fba8c2 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -631,7 +631,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp index e33552322..07f36cee9 100644 --- a/src/FFT/FFT.hpp +++ b/src/FFT/FFT.hpp @@ -929,7 +929,9 @@ namespace ippl { KOKKOS_LAMBDA(const size_t i) { for(size_t d = 0; d < Dim; ++d) { - tempR[d](i) = (Rview(i)[d] - origin[d]) * (2.0 * pi / Len[d]); + //tempR[d](i) = (Rview(i)[d] - (twopiFactor * 2.0 * pi)) * (2.0 * pi / Len[d]); + tempR[d](i) = Rview(i)[d] * (2.0 * pi / Len[d]); + //tempR[d](i) = Rview(i)[d]; } tempQ(i).x = Qview(i); tempQ(i).y = 0.0; diff --git a/test/FFT/TestNUFFT1.cpp b/test/FFT/TestNUFFT1.cpp index 0e261b035..0f6a6ba3e 100644 --- a/test/FFT/TestNUFFT1.cpp +++ b/test/FFT/TestNUFFT1.cpp @@ -73,7 +73,7 @@ int main(int argc, char *argv[]) { typedef Bunch bunch_type; - ippl::Vector pt = {512, 512, 512}; + ippl::Vector pt = {16, 16, 16}; ippl::Index I(pt[0]); ippl::Index J(pt[1]); ippl::Index K(pt[2]); @@ -85,22 +85,21 @@ int main(int argc, char *argv[]) { ippl::FieldLayout layout(owned, decomp); + typedef ippl::Vector Vector_t; + Vector_t minU = {-pi, -pi, -pi}; + Vector_t maxU = {pi, pi, pi}; + //Vector_t minU = {0.0, 0.0, 0.0}; + //Vector_t maxU = {25.0, 25.0, 25.0}; + std::array dx = { - 2.0 * pi / double(pt[0]), - 2.0 * pi / double(pt[1]), - 2.0 * pi / double(pt[2]), + (maxU[0] - minU[0]) / double(pt[0]), + (maxU[1] - minU[1]) / double(pt[1]), + (maxU[2] - minU[2]) / double(pt[2]), }; - - //std::array dx = { - // 25.0 / double(pt[0]), - // 25.0 / 
double(pt[1]), - // 25.0 / double(pt[2]), - //}; - typedef ippl::Vector Vector_t; + Vector_t hx = {dx[0], dx[1], dx[2]}; - Vector_t origin = {-pi, -pi, -pi}; - //Vector_t origin = {0, 0, 0}; + Vector_t origin = {minU[0], minU[1], minU[2]}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); @@ -111,7 +110,7 @@ int main(int argc, char *argv[]) { using size_type = ippl::detail::size_type; - size_type Np = std::pow(512,3) * 5; + size_type Np = std::pow(16,3); typedef ippl::Field, dim> field_type; @@ -123,7 +122,7 @@ int main(int argc, char *argv[]) { fftParams.add("gpu_method", 1); fftParams.add("gpu_sort", 0); fftParams.add("gpu_kerevalmeth", 1); - fftParams.add("tolerance", 1e-6); + fftParams.add("tolerance", 1e-12); fftParams.add("use_cufinufft_defaults", false); @@ -134,12 +133,6 @@ int main(int argc, char *argv[]) { int type = 1; - Vector_t minU = {-pi, -pi, -pi}; - Vector_t maxU = {pi, pi, pi}; - //Vector_t minU = {0.0, 0.0, 0.0}; - //Vector_t maxU = {25.0, 25.0, 25.0}; - - size_type nloc = Np/Ippl::Comm->size(); bunch.create(nloc); @@ -175,82 +168,82 @@ int main(int argc, char *argv[]) { auto Qview = bunch.Q.getView(); Kokkos::complex imag = {0.0, 1.0}; - //size_t flatN = pt[0] * pt[1] * pt[2]; - //auto fview = field_dft.getView(); + size_t flatN = pt[0] * pt[1] * pt[2]; + auto fview = field_dft.getView(); - //typedef Kokkos::TeamPolicy<> team_policy; - //typedef Kokkos::TeamPolicy<>::member_type member_type; - - //Kokkos::parallel_for("NUDFT type 1", - // team_policy(flatN, Kokkos::AUTO), - // KOKKOS_LAMBDA(const member_type& teamMember) { - // const size_t flatIndex = teamMember.league_rank(); - // - // const int k = (int)(flatIndex / (pt[0] * pt[1])); - // const int flatIndex2D = flatIndex - (k * pt[0] * pt[1]); - // const int i = flatIndex2D % pt[0]; - // const int j = (int)(flatIndex2D / pt[0]); - // - // Kokkos::complex reducedValue = 0.0; - // ippl::Vector iVec = {i, j, k}; - // ippl::VectorkVec; - // for(size_t d = 0; d < 3; ++d) { - // kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - (pt[d] / 2)); - // } - // Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nloc), - // [=](const size_t idx, Kokkos::complex& innerReduce) - // { - // double arg = 0.0; - // for(size_t d = 0; d < 3; ++d) { - // arg += kVec[d]*Rview(idx)[d]; - // } - // const double& val = Qview(idx); - - // innerReduce += (Kokkos::cos(arg) - // - imag * Kokkos::sin(arg)) * val; - // }, Kokkos::Sum>(reducedValue)); - - // if(teamMember.team_rank() == 0) { - // fview(i+nghost,j+nghost,k+nghost) = reducedValue; - // } - - // }); - // - //typename field_type::HostMirror rhoNUDFT_host = field_dft.getHostMirror(); - //Kokkos::deep_copy(rhoNUDFT_host, field_dft.getView()); - //std::stringstream pname; - //pname << "data/FieldFFT_"; - //pname << Ippl::Comm->rank(); - //pname << ".csv"; - //Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); - //pcsvout.precision(10); - //pcsvout.setf(std::ios::scientific, std::ios::floatfield); - //pcsvout << "rho" << endl; - //for (int i = 0; i< pt[0]; i++) { - // for (int j = 0; j< pt[1]; j++) { - // for (int k = 0; k< pt[2]; k++) { - // pcsvout << field_result(i+nghost,j+nghost, k+nghost) << endl; - // } - // } - //} - //std::stringstream pname2; - //pname2 << "data/FieldDFT_"; - //pname2 << Ippl::Comm->rank(); - //pname2 << ".csv"; - //Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); - //pcsvout2.precision(10); - //pcsvout2.setf(std::ios::scientific, std::ios::floatfield); - 
//pcsvout2 << "rho" << endl; - //for (int i = 0; i< pt[0]; i++) { - // for (int j = 0; j< pt[1]; j++) { - // for (int k = 0; k< pt[2]; k++) { - // pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; - // } - // } - // } - // Ippl::Comm->barrier(); + typedef Kokkos::TeamPolicy<> team_policy; + typedef Kokkos::TeamPolicy<>::member_type member_type; + + Kokkos::parallel_for("NUDFT type 1", + team_policy(flatN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + const size_t flatIndex = teamMember.league_rank(); + + const int k = (int)(flatIndex / (pt[0] * pt[1])); + const int flatIndex2D = flatIndex - (k * pt[0] * pt[1]); + const int i = flatIndex2D % pt[0]; + const int j = (int)(flatIndex2D / pt[0]); + + Kokkos::complex reducedValue = 0.0; + ippl::Vector iVec = {i, j, k}; + ippl::VectorkVec; + for(size_t d = 0; d < 3; ++d) { + kVec[d] = (2.0 * pi / (maxU[d] - minU[d])) * (iVec[d] - (pt[d] / 2)); + } + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(teamMember, nloc), + [=](const size_t idx, Kokkos::complex& innerReduce) + { + double arg = 0.0; + for(size_t d = 0; d < 3; ++d) { + arg += kVec[d]*Rview(idx)[d]; + } + const double& val = Qview(idx); + + innerReduce += (Kokkos::cos(arg) + - imag * Kokkos::sin(arg)) * val; + }, Kokkos::Sum>(reducedValue)); + + if(teamMember.team_rank() == 0) { + fview(i+nghost,j+nghost,k+nghost) = reducedValue; + } + + }); + + typename field_type::HostMirror rhoNUDFT_host = field_dft.getHostMirror(); + Kokkos::deep_copy(rhoNUDFT_host, field_dft.getView()); + std::stringstream pname; + pname << "data/FieldFFT_"; + pname << Ippl::Comm->rank(); + pname << ".csv"; + Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout.precision(10); + pcsvout.setf(std::ios::scientific, std::ios::floatfield); + pcsvout << "rho" << endl; + for (int i = 0; i< pt[0]; i++) { + for (int j = 0; j< pt[1]; j++) { + for (int k = 0; k< pt[2]; k++) { + pcsvout << field_result(i+nghost,j+nghost, k+nghost) << endl; + } + } + } + std::stringstream pname2; + pname2 << "data/FieldDFT_"; + pname2 << Ippl::Comm->rank(); + pname2 << ".csv"; + Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, Ippl::Comm->rank()); + pcsvout2.precision(10); + pcsvout2.setf(std::ios::scientific, std::ios::floatfield); + pcsvout2 << "rho" << endl; + for (int i = 0; i< pt[0]; i++) { + for (int j = 0; j< pt[1]; j++) { + for (int k = 0; k< pt[2]; k++) { + pcsvout2 << rhoNUDFT_host(i+nghost,j+nghost, k+nghost) << endl; + } + } + } + Ippl::Comm->barrier(); @@ -259,7 +252,7 @@ int main(int argc, char *argv[]) { double arg = 0.0; for(size_t d = 0; d < dim; ++d) { - arg += kVec[d]*Rview(idx)[d]; + arg += (2 * pi / (hx[d] * pt[d])) * kVec[d] * Rview(idx)[d]; } valL += (Kokkos::cos(arg) diff --git a/test/FFT/TestNUFFT2.cpp b/test/FFT/TestNUFFT2.cpp index f55351db7..9d47a9607 100644 --- a/test/FFT/TestNUFFT2.cpp +++ b/test/FFT/TestNUFFT2.cpp @@ -106,17 +106,21 @@ int main(int argc, char *argv[]) { ippl::FieldLayout layout(owned, decomp); + typedef ippl::Vector Vector_t; + Vector_t minU = {-pi, -pi, -pi}; + Vector_t maxU = {pi, pi, pi}; + //Vector_t minU = {0.0, 0.0, 0.0}; + //Vector_t maxU = {25.0, 25.0, 25.0}; + std::array dx = { - 2.0 * pi / double(pt[0]), - 2.0 * pi / double(pt[1]), - 2.0 * pi / double(pt[2]), + (maxU[0] - minU[0]) / double(pt[0]), + (maxU[1] - minU[1]) / double(pt[1]), + (maxU[2] - minU[2]) / double(pt[2]), }; - - typedef ippl::Vector Vector_t; //typedef ippl::Vector, 3> CxVector_t; Vector_t hx = {dx[0], dx[1], dx[2]}; - Vector_t origin = 
{-pi, -pi, -pi}; + Vector_t origin = {minU[0], minU[1], minU[2]}; ippl::UniformCartesian mesh(owned, hx, origin); playout_type pl(layout, mesh); @@ -148,8 +152,6 @@ int main(int argc, char *argv[]) { int type = 2; - Vector_t minU = {-pi, -pi, -pi}; - Vector_t maxU = {pi, pi, pi}; size_type nloc = Np/Ippl::Comm->size(); @@ -203,7 +205,7 @@ int main(int argc, char *argv[]) { ippl::Vector iVec = {i, j, k}; double arg = 0.0; for(size_t d = 0; d < dim; ++d) { - arg += (iVec[d] - (pt[d]/2)) * Rview(idx)[d]; + arg += (2 * pi / (hx[d] * pt[d])) * (iVec[d] - (pt[d]/2)) * Rview(idx)[d]; } valL += (Kokkos::cos(arg) From 3a14e8a6c6d6f1e1f7c0f5c1f6efeec3eef7af06 Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Fri, 26 Apr 2024 11:10:37 +0200 Subject: [PATCH 115/117] Added a comment --- alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp | 2 ++ alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 959fd119c..486709163 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -480,6 +480,8 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Nhalf[d] = domainHalf[d].length(); } + //Heffte needs FFTshifted field whereas the field from cuFINUFFT + //is not shifted. Hence, here we do the shift. Kokkos::parallel_for("Transfer complex rho to half domain", mdrange_type({0, 0, 0}, {Nhalf[0], diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index 54984352e..fc01f8228 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -308,7 +308,7 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - //P->dumpEnergy(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); double alpha = -0.5 * dt; @@ -387,7 +387,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - //P->dumpEnergy(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } From 2bae52889a3f6982be242721ea282c5bc969b89b Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Tue, 30 Apr 2024 08:14:55 +0200 Subject: [PATCH 116/117] Variable name changed related to the bugfix and dumping commented for speedup benchmarks --- .../BumponTailInstabilityPIF.cpp | 8 ++--- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 8 ++--- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 8 ++--- alpine/ElectrostaticPIF/PenningTrapPIF.cpp | 4 +-- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 +- alpine/PinT/ChargedParticlesPinT.hpp | 30 +++++++++---------- alpine/PinT/LandauDampingPinT.cpp | 2 +- alpine/PinT/PenningTrapPinT.cpp | 2 +- 8 files changed, 32 insertions(+), 32 deletions(-) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index 3ef320c57..b716a4ab9 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -345,8 +345,8 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - P->dumpBumponTail(); - P->dumpEnergy(); + //P->dumpBumponTail(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); // begin main timestep loop @@ -386,8 +386,8 @@ int main(int argc, char *argv[]){ P->time_m 
+= dt; IpplTimings::startTimer(dumpDataTimer); - P->dumpBumponTail(); - P->dumpEnergy(); + //P->dumpBumponTail(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 486709163..39129695b 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -606,10 +606,10 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); - double kh = kVec[d] * dx[d] / 2; - bool isNotZero = (kh != 0.0); - double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); - double arg = isNotZero * (Kokkos::sin(kh) * factor) + + double khbytwo = kVec[d] * dx[d] / 2; + bool isNotZero = (khbytwo != 0.0); + double factor = (1.0 / (khbytwo + ((!isNotZero) * 1.0))); + double arg = isNotZero * (Kokkos::sin(khbytwo) * factor) + (!isNotZero) * 1.0; //Fourier transform of CIC Sk *= std::pow(arg, order); diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index fe5e8b68c..d360eff59 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -307,8 +307,8 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - P->dumpLandau(); - P->dumpEnergy(); + //P->dumpLandau(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); // begin main timestep loop @@ -348,8 +348,8 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - P->dumpLandau(); - P->dumpEnergy(); + //P->dumpLandau(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp index fc01f8228..54984352e 100644 --- a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -308,7 +308,7 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - P->dumpEnergy(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); double alpha = -0.5 * dt; @@ -387,7 +387,7 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - P->dumpEnergy(); + //P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 94fd4dd8a..6147c543c 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -683,7 +683,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index d94309c60..538e8f56d 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -524,10 +524,10 @@ class ChargedParticlesPinT : public ippl::ParticleBase { double Sk = 1.0; for(size_t d = 0; d < Dim; ++d) { kVec[d] = 2 * pi / Len[d] * 
(iVec[d] - (N[d] / 2)); - double kh = kVec[d] * dx[d] / 2; - bool isNotZero = (kh != 0.0); - double factor = (1.0 / (kh + ((!isNotZero) * 1.0))); - double arg = isNotZero * (Kokkos::sin(kh) * factor) + + double khbytwo = kVec[d] * dx[d] / 2; + bool isNotZero = (khbytwo != 0.0); + double factor = (1.0 / (khbytwo + ((!isNotZero) * 1.0))); + double arg = isNotZero * (Kokkos::sin(khbytwo) * factor) + (!isNotZero) * 1.0; //Fourier transform of CIC Sk *= std::pow(arg, order); @@ -697,8 +697,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, int rankTime, int rankSpace, + const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, + const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -729,8 +729,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - dumpFieldEnergy(nc, iter, rankTime, rankSpace); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpFieldEnergy(nc, iter, rankTime, rankSpace); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - dumpFieldEnergy(nc, iter, rankTime, rankSpace); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpFieldEnergy(nc, iter, rankTime, rankSpace); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } @@ -782,9 +782,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& nc, - const unsigned int& iter, const double& Bext, - int rankTime, int rankSpace, + const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, + const unsigned int& /*iter*/, const double& Bext, + int /*rankTime*/, int /*rankSpace*/, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -813,7 +813,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -899,7 +899,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 99f625eb9..6f7959ed9 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -632,7 +632,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); 
IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index 270fba8c2..a5e90b42f 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -631,7 +631,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); From bf5582b22801aa414c0cb5791e743b5ad898871a Mon Sep 17 00:00:00 2001 From: Sriramkrishnan Muralikrishnan Date: Wed, 26 Jun 2024 07:38:37 +0200 Subject: [PATCH 117/117] State prior to submission with file writings enabled --- .../BumponTailInstabilityPIF.cpp | 8 +++---- .../ElectrostaticPIF/ChargedParticlesPIF.hpp | 18 +++++++-------- alpine/ElectrostaticPIF/LandauDampingPIF.cpp | 8 +++---- alpine/PinT/BumponTailInstabilityPinT.cpp | 2 +- alpine/PinT/ChargedParticlesPinT.hpp | 22 +++++++++---------- alpine/PinT/LandauDampingPinT.cpp | 2 +- alpine/PinT/PenningTrapPinT.cpp | 2 +- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp index b716a4ab9..3ef320c57 100644 --- a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -345,8 +345,8 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - //P->dumpBumponTail(); - //P->dumpEnergy(); + P->dumpBumponTail(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); // begin main timestep loop @@ -386,8 +386,8 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - //P->dumpBumponTail(); - //P->dumpEnergy(); + P->dumpBumponTail(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp index 39129695b..ecc061798 100644 --- a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -303,7 +303,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { double fieldEnergy = 0.0; - double ExAmp = 0.0; + double EzAmp = 0.0; auto rhoview = rho_m.getView(); const int nghost = rho_m.getNghost(); @@ -324,7 +324,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Kokkos::complex imag = {0.0, 1.0}; double pi = std::acos(-1.0); - Kokkos::parallel_reduce("Ex energy and Max", + Kokkos::parallel_reduce("Ez energy and Max", mdrange_type({0, 0, 0}, {N[0], N[1], @@ -340,15 +340,15 @@ class ChargedParticlesPIF : public ippl::ParticleBase { Vector kVec; double Dr = 0.0; for(size_t d = 0; d < Dim; ++d) { - bool shift = (iVec[d] > (N[d]/2)); - kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); Dr += kVec[d] * kVec[d]; } Kokkos::complex Ek = {0.0, 0.0}; - if(Dr != 0.0) { - Ek = -(imag * kVec[2] * rhoview(i+nghost,j+nghost,k+nghost) / Dr); - } + auto rho = rhoview(i+nghost,j+nghost,k+nghost); + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[2] * rho * factor); double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); tlSum += myVal; @@ -357,7 +357,7 @@ class ChargedParticlesPIF : public 
ippl::ParticleBase { if(myValMax > tlMax) tlMax = myValMax; - }, Kokkos::Sum(fieldEnergy), Kokkos::Max(ExAmp)); + }, Kokkos::Sum(fieldEnergy), Kokkos::Max(EzAmp)); Kokkos::fence(); @@ -381,7 +381,7 @@ class ChargedParticlesPIF : public ippl::ParticleBase { csvout << time_m << " " << fieldEnergy << " " - << ExAmp << endl; + << EzAmp << endl; } diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp index d360eff59..e07e249fc 100644 --- a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -307,8 +307,8 @@ int main(int argc, char *argv[]){ P->gather(); IpplTimings::startTimer(dumpDataTimer); - //P->dumpLandau(); - //P->dumpEnergy(); + P->dumpBumponTail(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); // begin main timestep loop @@ -348,8 +348,8 @@ int main(int argc, char *argv[]){ P->time_m += dt; IpplTimings::startTimer(dumpDataTimer); - //P->dumpLandau(); - //P->dumpEnergy(); + P->dumpBumponTail(); + P->dumpEnergy(); IpplTimings::stopTimer(dumpDataTimer); msg << "Finished time step: " << it+1 << " time: " << P->time_m << endl; } diff --git a/alpine/PinT/BumponTailInstabilityPinT.cpp b/alpine/PinT/BumponTailInstabilityPinT.cpp index 6147c543c..94fd4dd8a 100644 --- a/alpine/PinT/BumponTailInstabilityPinT.cpp +++ b/alpine/PinT/BumponTailInstabilityPinT.cpp @@ -683,7 +683,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/ChargedParticlesPinT.hpp b/alpine/PinT/ChargedParticlesPinT.hpp index 538e8f56d..7790c8f26 100644 --- a/alpine/PinT/ChargedParticlesPinT.hpp +++ b/alpine/PinT/ChargedParticlesPinT.hpp @@ -697,8 +697,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void LeapFrogPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, - const unsigned int& /*iter*/, int /*rankTime*/, int /*rankSpace*/, + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, int rankTime, int rankSpace, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -729,8 +729,8 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - //dumpFieldEnergy(nc, iter, rankTime, rankSpace); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpFieldEnergy(nc, iter, rankTime, rankSpace); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } for (unsigned int it=0; it { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - //dumpFieldEnergy(nc, iter, rankTime, rankSpace); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpFieldEnergy(nc, iter, rankTime, rankSpace); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } @@ -782,9 +782,9 @@ class ChargedParticlesPinT : public ippl::ParticleBase { void BorisPIF(ParticleAttrib& Rtemp, ParticleAttrib& Ptemp, const unsigned int& nt, - const double& dt, const double& tStartMySlice, const unsigned& /*nc*/, - const unsigned int& /*iter*/, const double& Bext, - int /*rankTime*/, 
int /*rankSpace*/, + const double& dt, const double& tStartMySlice, const unsigned& nc, + const unsigned int& iter, const double& Bext, + int rankTime, int rankSpace, const std::string& propagator, MPI_Comm& spaceComm) { static IpplTimings::TimerRef dumpData = IpplTimings::getTimer("dumpData"); @@ -813,7 +813,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if((time_m == 0.0) && (propagator == "Fine")) { IpplTimings::startTimer(dumpData); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } double alpha = -0.5 * dt; @@ -899,7 +899,7 @@ class ChargedParticlesPinT : public ippl::ParticleBase { if(propagator == "Fine") { IpplTimings::startTimer(dumpData); - //dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); + dumpEnergy(nc, iter, Ptemp, rankTime, rankSpace, spaceComm); IpplTimings::stopTimer(dumpData); } } diff --git a/alpine/PinT/LandauDampingPinT.cpp b/alpine/PinT/LandauDampingPinT.cpp index 6f7959ed9..99f625eb9 100644 --- a/alpine/PinT/LandauDampingPinT.cpp +++ b/alpine/PinT/LandauDampingPinT.cpp @@ -632,7 +632,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm); diff --git a/alpine/PinT/PenningTrapPinT.cpp b/alpine/PinT/PenningTrapPinT.cpp index a5e90b42f..270fba8c2 100644 --- a/alpine/PinT/PenningTrapPinT.cpp +++ b/alpine/PinT/PenningTrapPinT.cpp @@ -631,7 +631,7 @@ int main(int argc, char *argv[]){ << endl; IpplTimings::startTimer(dumpData); - //Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); + Pcoarse->writelocalError(Rerror, Perror, nc+1, it+1, rankTime, rankSpace); IpplTimings::stopTimer(dumpData); MPI_Barrier(spaceComm);
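
The shape-function fix in PATCH 114 (variable renamed in PATCH 116) evaluates the Fourier-space CIC factor at k*h/2 instead of k*h: the Fourier transform of the cloud-in-cell (triangle) kernel per dimension is sinc(k*h/2)^order with order = 2, so the earlier code sampled the wrong argument. A minimal standalone sketch of the corrected factor is given below; the function name (shapeFactorCIC) and the free parameters (order, mesh spacing h) are illustrative assumptions for this sketch, not IPPL API.

    #include <array>
    #include <cmath>
    #include <cstddef>

    // Corrected CIC shape factor as in PATCH 114/116:
    //   S(k) = prod_d sinc(k_d * h_d / 2)^order, with sinc(0) = 1.
    // The "/ 2" in khbytwo is the bugfix; order = 2 corresponds to CIC.
    double shapeFactorCIC(const std::array<double, 3>& kVec,
                          const std::array<double, 3>& h,
                          int order) {
        double Sk = 1.0;
        for (std::size_t d = 0; d < 3; ++d) {
            const double khbytwo = kVec[d] * h[d] / 2.0;
            const double arg = (khbytwo != 0.0) ? std::sin(khbytwo) / khbytwo : 1.0;
            Sk *= std::pow(arg, order);
        }
        return Sk;
    }

This mirrors the branch-free form used in ChargedParticlesPIF.hpp and ChargedParticlesPinT.hpp, written here with an ordinary conditional for readability.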