From 6c5c1869ed7c1cdab60f901e72d860c4d53b4d41 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Thu, 22 Apr 2021 15:32:09 -0400 Subject: [PATCH 01/24] Adding set num threads --- time_integrators.f90 | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/time_integrators.f90 b/time_integrators.f90 index 3f1b908..8460bad 100644 --- a/time_integrators.f90 +++ b/time_integrators.f90 @@ -37,8 +37,11 @@ subroutine imex_rk(vtk_print, save_nusselt) integer :: nthreads, myid integer, EXTERNAL :: OMP_GET_THREAD_NUM, OMP_GET_NUM_THREADS real(dp), EXTERNAL :: OMP_GET_WTIME +EXTERNAL :: OMP_SET_NUM_THREADS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +call OMP_SET_NUM_THREADS(8) + if (present(vtk_print)) then wvtk = .true. nprint = vtk_print @@ -97,7 +100,7 @@ subroutine imex_rk(vtk_print, save_nusselt) finish = OMP_GET_WTIME() write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)" start = OMP_GET_WTIME() - !$OMP PARALLEL DO num_threads(16) private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) do it = 1,Nx ! kx loop ! Compute phi1 and T1 call calc_vari_mod(tmp_phi, tmp_T, acoeffs(1,1), 1,& @@ -141,7 +144,7 @@ subroutine imex_rk(vtk_print, save_nusselt) ! STAGE 2 :: !::::::::::: start = OMP_GET_WTIME() - !$OMP PARALLEL DO num_threads(16) private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) do it = 1,Nx ! kx loop ! Compute phi2 and T2 call calc_vari_mod(tmp_phi, tmp_T, acoeffs(2,2), 2,& @@ -185,7 +188,7 @@ subroutine imex_rk(vtk_print, save_nusselt) ! STAGE 3 :: !::::::::::: start = OMP_GET_WTIME() - !$OMP PARALLEL DO num_threads(16) private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) do it = 1,Nx ! kx loop ! Compute phi3 and T3 call calc_vari_mod(tmp_phi, tmp_T, acoeffs(3,3), 3,& @@ -240,7 +243,7 @@ subroutine imex_rk(vtk_print, save_nusselt) & b(3)*(K3_T(2:Ny-1,:) + K4hat_T(2:Ny-1,:))) ! Get ux and uy - !$OMP PARALLEL DO num_threads(16) private(tmp_uy, it) schedule(dynamic) + !$OMP PARALLEL DO private(tmp_uy, it) schedule(dynamic) do it = 1,Nx ! 
Solve for v call calc_vi_mod(tmp_uy, phi(:,it), kx(it)) @@ -607,25 +610,25 @@ subroutine calc_explicit(stage) start = OMP_GET_WTIME() select case(stage) case (1) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K1hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do !$OMP END PARALLEL DO case (2) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K2hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do !$OMP END PARALLEL DO case (3) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K3hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do !$OMP END PARALLEL DO case (4) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K4hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do @@ -635,7 +638,7 @@ subroutine calc_explicit(stage) write(*,*) " - - l1 timing: ", finish-start, "(s)" start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) schedule(dynamic) +!$OMP PARALLEL DO schedule(dynamic) do i=1,Nx ! Compute dx(T) in Fourier space nlT (:,i) = kx(i)*Ti(:,i) @@ -650,7 +653,7 @@ subroutine calc_explicit(stage) nlT = CI*nlT start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) private(tnlT, tnlphi, tT, tux, tuy, tphi) schedule(dynamic) +!$OMP PARALLEL DO private(tnlT, tnlphi, tT, tux, tuy, tphi) schedule(dynamic) do j = 1,Ny ! Bring everything to physical space tnlT = nlT(j,:) @@ -678,7 +681,7 @@ subroutine calc_explicit(stage) ! Calculate nonlinear term start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) private(tmp_T) schedule(dynamic) +!$OMP PARALLEL DO private(tmp_T) schedule(dynamic) do i = 1,Nx ! Temperature tmp_T = Ti(:,i) @@ -692,7 +695,7 @@ subroutine calc_explicit(stage) ! 
Bring nonlinear terms back to Fourier space start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) private(tnlT, tnlphi) schedule(dynamic) +!$OMP PARALLEL DO private(tnlT, tnlphi) schedule(dynamic) do j = 1,Ny tnlT = nlT(j,:) tnlphi = nlphi(j,:) @@ -718,7 +721,7 @@ subroutine calc_explicit(stage) start = OMP_GET_WTIME() select case (stage) case (1) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K1hat_phi(:,i) = K1hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K1hat_phi(:,i) = K1hat_phi(:,i) - CI*kx(i)*nlphi(:,i) @@ -726,7 +729,7 @@ subroutine calc_explicit(stage) !$OMP END PARALLEL DO K1hat_T = -nlT case (2) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K2hat_phi(:,i) = K2hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K2hat_phi(:,i) = K2hat_phi(:,i) - CI*kx(i)*nlphi(:,i) @@ -734,7 +737,7 @@ subroutine calc_explicit(stage) !$OMP END PARALLEL DO K2hat_T = -nlT case (3) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K3hat_phi(:,i) = K3hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K3hat_phi(:,i) = K3hat_phi(:,i) - CI*kx(i)*nlphi(:,i) @@ -742,7 +745,7 @@ subroutine calc_explicit(stage) !$OMP END PARALLEL DO K3hat_T = -nlT case (4) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K4hat_phi(:,i) = K4hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K4hat_phi(:,i) = K4hat_phi(:,i) - CI*kx(i)*nlphi(:,i) From dfe7fc791bebd3ce0b916cd69fa3637174dac7c0 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Thu, 22 Apr 2021 17:44:30 -0400 Subject: [PATCH 02/24] Starting MPI implementation --- Makefile | 9 +- input.data | 4 +- time_integrators.f90 | 2 +- time_loop_MPI.f90 | 197 +++++++++++++++++++++++++++++++++++++++++++ write_pack.f90 | 33 +++++--- 5 files changed, 225 insertions(+), 20 deletions(-) create mode 100644 time_loop_MPI.f90 diff --git a/Makefile b/Makefile index 684c2f9..d9d79e7 100755 --- a/Makefile +++ b/Makefile @@ -1,12 +1,13 @@ -FC=gfortran +FC=mpif90 FFLAGS= -c -O3 LIBFLAGS2 = -L/usr/local/fftw/lib LDFLAGS = -I/usr/local/fftw/include -#MAIN = Ra_loop -#MAIN = Ra_loop_no_opt -MAIN = time_loop +# MAIN = Ra_loop +# MAIN = Ra_loop_no_opt +# MAIN = time_loop +MAIN = time_loop_MPI OBJECTS = fftw.o global.o allocate_vars.o precmod.o stringmod.o write_pack.o interpolation_pack.o mesh_pack.o imod.o bc_setup.o statistics.o time_integrators.o jacobians.o gmres_pack.o nonlinear_solvers.o $(MAIN).o PROGRAMS = $(MAIN).exe diff --git a/input.data b/input.data index ca77d7c..893b815 100755 --- a/input.data +++ b/input.data @@ -2,6 +2,6 @@ 5.0e+03, 1, 1.1 5.0, 0.1 -1.0, 1.0 -1600, 1275, 1 -0, 0, 0 +256, 204, 1 0, 0, 0 +1, 0, 0 diff --git a/time_integrators.f90 b/time_integrators.f90 index 8460bad..846696f 100644 --- a/time_integrators.f90 +++ b/time_integrators.f90 @@ -40,7 +40,7 @@ subroutine imex_rk(vtk_print, save_nusselt) EXTERNAL :: OMP_SET_NUM_THREADS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -call OMP_SET_NUM_THREADS(8) +call OMP_SET_NUM_THREADS(1) if (present(vtk_print)) then wvtk = .true. diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90 new file mode 100644 index 0000000..1c828ca --- /dev/null +++ b/time_loop_MPI.f90 @@ -0,0 +1,197 @@ +program time_loop + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
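+!! Driver for the MPI port of the solver: every rank initializes MPI, reads
+!! input.data, takes ownership of a horizontal slab of the Ny grid rows, and
+!! builds its own FFT plans and mesh.
+!! Assumed launch command (the Makefile builds time_loop_MPI.exe with mpif90,
+!! and the row split below is currently hard-coded to four ranks, Ny = Ny / 4):
+!!     mpirun -np 4 ./time_loop_MPI.exe
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!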
+
+use global
+use strings
+use write_pack
+use interpolation_pack
+use allocate_vars
+use statistics
+use mesh_pack
+use time_integrators
+
+implicit none
+include 'mpif.h'
+
+integer :: ntokens, ios
+integer :: Nxc, Nyc, Nzc
+integer :: nRa, ii, jj
+integer :: refine_flag_x, refine_flag_y, refine_flag_z
+integer :: vtk_flag, rstrt_flag, opt_flag
+logical :: wvtk, save_restart, calc_opt
+logical :: refine_x, refine_y, refine_z, refine_xy
+logical :: fTexist, fuyexist
+logical :: ex_Tptrb
+real(dp) :: temp
+real(dp) :: dxc
+real(dp) :: Ra, dRa, Nu, dalpha
+real(dp), allocatable, dimension(:) :: xpc, ypc, zpc
+complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: uyc, Tc
+character(10) :: citer
+character(9) :: fiter
+character(80) :: line
+character(80) :: tokens(80)
+
+! MPI specific variables
+integer :: mpierror, num_procs
+integer :: proc_id
+character(4) :: proc_id_str
+integer status(MPI_STATUS_SIZE)
+
+
+! Initialize MPI.
+call MPI_Init(mpierror)
+call MPI_Comm_size(MPI_COMM_WORLD, num_procs, mpierror)
+call MPI_Comm_rank(MPI_COMM_WORLD, proc_id, mpierror)
+
+! Read file.
+open(2,file="input.data", status="old")
+do ii = 1,7
+read(2,'(A)') line
+call parse(line,' ,', tokens, ntokens)
+if (ntokens > 0) then
+   select case(ii)
+   case (1)
+      call value(tokens(1), Pr, ios)
+      call value(tokens(2), alpha, ios)
+      call value(tokens(3), dalpha, ios)
+   case (2)
+      call value(tokens(1), Ra, ios)
+      call value(tokens(2), nRa, ios)
+      call value(tokens(3), dRa, ios)
+   case (3)
+      call value(tokens(1), t_final, ios)
+      call value(tokens(2), dt, ios)
+      dt_init = dt
+   case (4)
+      call value(tokens(1), ybot, ios)
+      call value(tokens(2), ytop, ios)
+   case (5)
+      call value(tokens(1), Nx, ios)
+      call value(tokens(2), Ny, ios)
+      call value(tokens(3), Nz, ios)
+   case (6)
+      call value(tokens(1), refine_flag_x, ios)
+      call value(tokens(2), refine_flag_y, ios)
+      call value(tokens(3), refine_flag_z, ios)
+      if (refine_flag_x == 1) then
+         if (refine_flag_y == 1) then
+            refine_xy = .true.
+            refine_x = .false.
+            refine_y = .false.
+         else
+            refine_x = .true.
+            refine_y = .false.
+            refine_xy = .false.
+         end if
+      else if (refine_flag_y == 1) then
+         refine_x = .false.
+         refine_y = .true.
+         refine_xy = .false.
+      else if (refine_flag_z == 1) then
+         write(*,*) "Refinement in z-direction not available yet!"
+         stop
+      else
+         refine_x = .false.
+         refine_y = .false.
+         refine_z = .false.
+         refine_xy = .false.
+      end if
+   case (7)
+      call value(tokens(1), vtk_flag, ios)
+      call value(tokens(2), rstrt_flag, ios)
+      call value(tokens(3), opt_flag, ios)
+      if (vtk_flag == 1) then
+         wvtk = .true.
+      else
+         wvtk = .false.
+      end if
+      if (rstrt_flag == 1) then
+         save_restart = .true.
+      else
+         save_restart = .false.
+      end if
+      if (opt_flag == 1) then
+         calc_opt = .true.
+      else
+         calc_opt = .false.
+      end if
+   end select
+end if
+end do
+close(unit=2)
+
+! 
Create FFT plans +planuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_FORWARD,FFTW_ESTIMATE) +iplanuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_BACKWARD,FFTW_ESTIMATE) + +planux = fftw_plan_dft_1d(Nx,tux,tux, FFTW_FORWARD,FFTW_ESTIMATE) +iplanux = fftw_plan_dft_1d(Nx,tux,tux, FFTW_BACKWARD,FFTW_ESTIMATE) + +planphi = fftw_plan_dft_1d(Nx,tphi,tphi, FFTW_FORWARD,FFTW_ESTIMATE) +iplanphi = fftw_plan_dft_1d(Nx,tphi,tphi, FFTW_BACKWARD,FFTW_ESTIMATE) + +planT = fftw_plan_dft_1d(Nx,tT,tT, FFTW_FORWARD,FFTW_ESTIMATE) +iplanT = fftw_plan_dft_1d(Nx,tT,tT, FFTW_BACKWARD,FFTW_ESTIMATE) + +plannlT = fftw_plan_dft_1d(Nx,tnlT,tnlT, FFTW_FORWARD,FFTW_ESTIMATE) +iplannlT = fftw_plan_dft_1d(Nx,tnlT,tnlT, FFTW_BACKWARD,FFTW_ESTIMATE) + +plannlphi = fftw_plan_dft_1d(Nx,tnlphi,tnlphi, FFTW_FORWARD,FFTW_ESTIMATE) +iplannlphi = fftw_plan_dft_1d(Nx,tnlphi,tnlphi, FFTW_BACKWARD,FFTW_ESTIMATE) + +call global_params +call global_allocations + +xR = pi / alpha +xL = -pi / alpha +Lx = xR - xL + +if (Nx == 1) then + dx = 1.0_dp +else + dx = Lx / (real(Nx,kind=dp)) +end if + +if (proc_id == 0) then + write(*,*) "Ny = ", Ny, " divided among ", num_procs, " processors -> ", & + Ny / num_procs, " rows per processor." +end if + +Ny = Ny / 4 + +call cosine_mesh(xp,yp,zp, Nx,Ny,Nz) ! get coordinates +call dxdydz(dynu, xp,yp,zp) ! get mesh spacing for nonuniform grids +call y_mesh_params ! get metric coefficients for nonuniform grids +dymin = minval(dynu) +dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp) + +! Fourier modes +do ii = 1,Nx/2 + kx_modes(ii) = real(ii,kind=dp) - 1.0_dp +end do +do ii = Nx/2+1, Nx + kx_modes(ii) = real(ii-Nx,kind=dp) - 1.0_dp +end do +kx = alpha*kx_modes + +! Write initial field to vtk +if (wvtk) then + write(proc_id_str, "(I3.3)") proc_id + write(*,*) proc_id_str + call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space + end if + +write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows." + + + + + +call MPI_Finalize(mpierror) + +end program time_loop + + \ No newline at end of file diff --git a/write_pack.f90 b/write_pack.f90 index 08f53d4..7d7b857 100755 --- a/write_pack.f90 +++ b/write_pack.f90 @@ -88,7 +88,7 @@ end subroutine write_mat_cmplx !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -subroutine write_vtk_structured_grid(iter) +subroutine write_vtk_structured_grid(iter, process_id) use global @@ -99,13 +99,17 @@ subroutine write_vtk_structured_grid(iter) character(9) :: fiter integer :: n integer :: ii, jj, kk +character(3), intent(in), optional :: process_id write(citer, "(I10)") 1000000000 + iter fiter = citer(2:10) -n = (Nx+1)*Ny*Nz - -open(unit=2, file=vtkloc//vtkname//fiter//gtype, action="write", status="replace") +n = (Nx+1)*Ny*Nx +if (PRESENT(process_id)) then + open(unit=2, file=vtkloc//vtkname//"_P"//process_id//"_"//fiter//gtype, action="write", status="replace") +else + open(unit=2, file=vtkloc//vtkname//fiter//gtype, action="write", status="replace") +endif write(2, "(a)") "# vtk DataFile Version 3.1" write(2, "(a)") "Thermal Convection Data" write(2, "(a)") "ASCII" @@ -180,18 +184,22 @@ subroutine write_vtk_structured_grid(iter) end subroutine write_vtk_structured_grid !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
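+! MPI note: the optional process_id argument added below lets each rank write
+! its own output file, named vtkname_P<process_id>_<iter>, so that per-rank
+! output does not clobber the files written by the other ranks.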
-subroutine write_to_vtk(step, physical) +subroutine write_to_vtk(step, physical, process_id) -integer, intent(in) :: step -logical, intent(in) :: physical -integer :: j +integer, intent(in) :: step +logical, intent(in) :: physical +character(3), intent(in), optional :: process_id +integer :: j -if (physical) then - - call write_vtk_structured_grid(step) +if (physical) then + if (PRESENT(process_id)) then + write(*,*) "writing file with ", process_id + call write_vtk_structured_grid(step, process_id) + else + call write_vtk_structured_grid(step) + endif else - do j = 1,Ny ! Bring everything to physical space tT = T(j,:) @@ -220,7 +228,6 @@ subroutine write_to_vtk(step, physical) T = T / real(Nx, kind=dp) ux = ux / real(Nx, kind=dp) uy = uy / real(Nx, kind=dp) - end if end subroutine write_to_vtk From 769e8cbc5ab8d71f09f8a2349170e4548ab7e661 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Thu, 22 Apr 2021 21:33:02 -0400 Subject: [PATCH 03/24] Adding multi-node vtk output --- time_loop_MPI.f90 | 20 +++++++++++++------- write_pack.f90 | 1 - 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90 index 1c828ca..ee40286 100644 --- a/time_loop_MPI.f90 +++ b/time_loop_MPI.f90 @@ -38,6 +38,7 @@ program time_loop integer :: mpierror, num_procs integer :: proc_id character(4) :: proc_id_str +real(dp) :: mpi_spacing_y integer status(MPI_STATUS_SIZE) @@ -123,6 +124,18 @@ program time_loop end do close(unit=2) +! Print MPI division. +if (proc_id == 0) then + write(*,*) "Ny = ", Ny, " divided among ", num_procs, " processors -> ", & + Ny / num_procs, " rows per processor." +end if + +! Initialize MPI variables. +Ny = Ny / 4 +mpi_spacing_y = (ytop - ybot) / num_procs +ybot = ybot + proc_id * mpi_spacing_y +ytop = ybot + mpi_spacing_y + ! Create FFT plans planuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_FORWARD,FFTW_ESTIMATE) iplanuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_BACKWARD,FFTW_ESTIMATE) @@ -155,12 +168,6 @@ program time_loop dx = Lx / (real(Nx,kind=dp)) end if -if (proc_id == 0) then - write(*,*) "Ny = ", Ny, " divided among ", num_procs, " processors -> ", & - Ny / num_procs, " rows per processor." -end if - -Ny = Ny / 4 call cosine_mesh(xp,yp,zp, Nx,Ny,Nz) ! get coordinates call dxdydz(dynu, xp,yp,zp) ! get mesh spacing for nonuniform grids @@ -180,7 +187,6 @@ program time_loop ! Write initial field to vtk if (wvtk) then write(proc_id_str, "(I3.3)") proc_id - write(*,*) proc_id_str call write_to_vtk(int(Ra), .true., proc_id_str) ! 
true = already in physical space
     end if
 
diff --git a/write_pack.f90 b/write_pack.f90
index 7d7b857..a450353 100755
--- a/write_pack.f90
+++ b/write_pack.f90
@@ -194,7 +194,6 @@ subroutine write_to_vtk(step, physical, process_id)
 
 if (physical) then
     if (PRESENT(process_id)) then
-        write(*,*) "writing file with ", process_id
         call write_vtk_structured_grid(step, process_id)
     else
         call write_vtk_structured_grid(step)

From 08952774d27ab8ff876a9599c3f8145fe1a75ed1 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Thu, 22 Apr 2021 22:14:24 -0400
Subject: [PATCH 04/24] Wrap up initialization

---
 time_integrators.f90 |   4 +-
 time_loop_MPI.f90    | 147 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 139 insertions(+), 12 deletions(-)

diff --git a/time_integrators.f90 b/time_integrators.f90
index 846696f..a8365c7 100644
--- a/time_integrators.f90
+++ b/time_integrators.f90
@@ -100,7 +100,7 @@ subroutine imex_rk(vtk_print, save_nusselt)
     finish = OMP_GET_WTIME()
     write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)"
     start = OMP_GET_WTIME()
-    !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) 
+    !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic)
     do it = 1,Nx ! kx loop
         ! Compute phi1 and T1
         call calc_vari_mod(tmp_phi, tmp_T, acoeffs(1,1), 1,&
@@ -144,7 +144,7 @@ subroutine imex_rk(vtk_print, save_nusselt)
     ! STAGE 2 ::
     !:::::::::::
     start = OMP_GET_WTIME()
-    !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) 
+    !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic)
     do it = 1,Nx ! kx loop
         ! Compute phi2 and T2
         call calc_vari_mod(tmp_phi, tmp_T, acoeffs(2,2), 2,&
diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90
index ee40286..222a632 100644
--- a/time_loop_MPI.f90
+++ b/time_loop_MPI.f90
@@ -124,12 +124,6 @@ program time_loop
 end do
 close(unit=2)
 
-! Print MPI division.
-if (proc_id == 0) then
-    write(*,*) "Ny = ", Ny, " divided among ", num_procs, " processors -> ", &
-        Ny / num_procs, " rows per processor."
-end if
-
 ! Initialize MPI variables.
 Ny = Ny / 4
 mpi_spacing_y = (ytop - ybot) / num_procs
 ybot = ybot + proc_id * mpi_spacing_y
 ytop = ybot + mpi_spacing_y
@@ -168,7 +162,6 @@ program time_loop
     dx = Lx / (real(Nx,kind=dp))
 end if
 
-
 call cosine_mesh(xp,yp,zp, Nx,Ny,Nz) ! get coordinates
 call dxdydz(dynu, xp,yp,zp) ! get mesh spacing for nonuniform grids
 call y_mesh_params ! get metric coefficients for nonuniform grids
@@ -188,16 +181,150 @@ program time_loop
 ! Write initial field to vtk
 if (wvtk) then
     write(proc_id_str, "(I3.3)") proc_id
     call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space
-    end if
+end if
 
-write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows."
+! Initialize fields.
+call init_fields(ex_Tptrb, Ra)
+call init_to_fourier(ex_Tptrb)
 
+! 
Get nu0 and kappa0 +call global_params_Ra(Ra) +if (proc_id == 0) then + write(*,'(A70)') ' ' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') ' TWO-DIMENSIONAL THERMAL CONVECTION |' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,47X,A)') 'COMPUTATION SIZE:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(20X,A23,I5,21X,A)') 'Nx = ', Nx, '|' + write(*,'(20X,A23,I5,21X,A)') 'Ny = ', Ny, '|' + write(*,'(20X,A23,I5,21X,A)') 'Nz = ', Nz, '|' + write(*,'(20X,A23,I5,21X,A)') 'Number of time steps = ', nt, '|' + write(*,'(20X,A23,I5,21X,A)') 'Number of Fourier modes = ', Nf, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,45X,A)') 'PROBLEM PARAMETERS:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Prandtl number (Pr) = ', Pr, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Rayleigh number (Ra) = ', Ra, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Reynolds number (Re) = ', sqrt(Ra/(16.0_dp*Pr)), '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,42X,A)') 'PHYSICAL PROBLEM SIZE:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'alpha = ', alpha, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Left coordinate of box = ', xL, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Right coordinate of box = ', xR, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of bottom wall = ', ybot, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of top wall = ', ytop, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Box width = ', Lx, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Box height = ', Ly, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Aspect Ratio = ', Lx/Ly, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Integration time (T) = ', t_final, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Time step size (Delta t) = ', dt, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') ' ' + + flush(6) + ! Print MPI division. + write(*,*) "Ny = ", Ny*num_procs, " divided among ", num_procs, " processors -> ", & + Ny, " rows per processor." +end if +call MPI_BARRIER(MPI_COMM_WORLD, mpierror) +write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows." call MPI_Finalize(mpierror) end program time_loop - \ No newline at end of file +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine init_fields(ex_Tptrb,Ra) + +use global + +implicit none + +logical, intent(out) :: ex_Tptrb +real(dp), intent(in) :: Ra +integer :: ii + +! Initialize fields. +if (Ra < 1710.0_dp) then + ex_Tptrb = .true. + do ii = 1,Nx + Tptrb(:,ii) = 0.5_dp*2.0_dp*cos(alpha*xp(ii))*cos(pi*yp/2.0_dp) + T(:,ii) = -yp + Tptrb(:,ii) + end do +else + ex_Tptrb = .false. + do ii = 1,Nx + T(:,ii) = -yp + 0.5_dp*2.0_dp*cos(alpha*xp(ii))*cos(pi*yp/2.0_dp) + end do +end if + +end subroutine init_fields + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine init_to_fourier(ex_Tptrb) + +use global + +implicit none + +logical, intent(in) :: ex_Tptrb +integer :: ii, jj + +! Bring temperature and velocity to Fourier space. 
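+! Each row is transformed with the 1-D FFTW plans; Fourier modes with
+! abs(kx)/alpha >= Nf/2 are zeroed out (truncation of unresolved modes), and
+! the fields are rescaled by 1/Nx afterwards, since FFTW transforms are
+! unnormalized.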
+do jj = 1,Ny + tT = T(jj,:) + tuy = uy(jj,:) + call fftw_execute_dft(planT, tT, tT) + call fftw_execute_dft(planuy, tuy, tuy) + ! Truncate modes + do ii = 1,Nx + if (abs(kx(ii))/alpha >= Nf/2) then + tT(ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + tuy(ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + end if + end do + T(jj,:) = tT + uy(jj,:) = tuy + ! If temperature perturbation needed. + if (ex_Tptrb) then + tT = Tptrb(jj,:) + call fftw_execute_dft(planT, tT, tT) + ! Truncate modes + do ii = 1,Nx + if (abs(kx(ii))/alpha >= Nf/2) then + tT(ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + end if + end do + Tptrb(jj,:) = tT + end if +end do +T = T / real(Nx, kind=dp) +uy = uy / real(Nx, kind=dp) + +if (ex_Tptrb) then + Tptrb = Tptrb / real(Nx, kind=dp) +end if + +! Calculate phi and ux from uy +do ii = 1,Nx + if (kx(ii) /= 0.0_dp) then + tmp_uy = uy(:,ii) + !ux(:,ii) = -CI*d1y(tmp_uy)/kx(ii) + ux(:,ii) = CI*d1y(tmp_uy)/kx(ii) + else if (kx(ii) == 0.0_dp) then + ux(:,ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + phi(:,ii) = -kx(ii)**2.0_dp*uy(:,ii) + d2y(uy(:,ii)) +end do + +end subroutine init_to_fourier \ No newline at end of file From 62886a80a653b40a8d61f7954ed6de5ebd700236 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Fri, 23 Apr 2021 11:06:33 -0400 Subject: [PATCH 05/24] Vtk experiments --- Makefile | 4 ++-- time_integrators.f90 | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index d9d79e7..0861c74 100755 --- a/Makefile +++ b/Makefile @@ -6,8 +6,8 @@ LDFLAGS = -I/usr/local/fftw/include # MAIN = Ra_loop # MAIN = Ra_loop_no_opt -# MAIN = time_loop -MAIN = time_loop_MPI +MAIN = time_loop +#MAIN = time_loop_MPI OBJECTS = fftw.o global.o allocate_vars.o precmod.o stringmod.o write_pack.o interpolation_pack.o mesh_pack.o imod.o bc_setup.o statistics.o time_integrators.o jacobians.o gmres_pack.o nonlinear_solvers.o $(MAIN).o PROGRAMS = $(MAIN).exe diff --git a/time_integrators.f90 b/time_integrators.f90 index a8365c7..6c49184 100644 --- a/time_integrators.f90 +++ b/time_integrators.f90 @@ -266,8 +266,6 @@ subroutine imex_rk(vtk_print, save_nusselt) end if !call update_dt - ! Don't write vtk for now. - wvtk = .false. if (wvtk) then if (mod(nti,vtk_print) == 0) then call write_to_vtk(nti, .false.) ! 
false = Fourier space From 91c30a06ccee937df14bf1b8970cf0a3c6dd43ce Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Fri, 23 Apr 2021 11:13:19 -0400 Subject: [PATCH 06/24] Vtk experiments --- input.data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input.data b/input.data index 893b815..894b925 100755 --- a/input.data +++ b/input.data @@ -2,6 +2,6 @@ 5.0e+03, 1, 1.1 5.0, 0.1 -1.0, 1.0 -256, 204, 1 +64, 51, 1 0, 0, 0 1, 0, 0 From 5527a75f088cc8fb2e134c7ee1a9e73723e323b8 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Sat, 24 Apr 2021 10:55:56 -0400 Subject: [PATCH 07/24] Initialization complete --- Makefile | 9 +- input.data | 2 +- mesh_pack.f90 | 45 ++++++++ time_integrators.f90 | 230 --------------------------------------- time_integrators_MPI.f90 | 64 +++++++++++ time_loop_MPI.f90 | 9 +- write_pack.f90 | 8 +- 7 files changed, 128 insertions(+), 239 deletions(-) create mode 100644 time_integrators_MPI.f90 diff --git a/Makefile b/Makefile index 0861c74..6db4fb2 100755 --- a/Makefile +++ b/Makefile @@ -6,10 +6,10 @@ LDFLAGS = -I/usr/local/fftw/include # MAIN = Ra_loop # MAIN = Ra_loop_no_opt -MAIN = time_loop -#MAIN = time_loop_MPI +# MAIN = time_loop +MAIN = time_loop_MPI -OBJECTS = fftw.o global.o allocate_vars.o precmod.o stringmod.o write_pack.o interpolation_pack.o mesh_pack.o imod.o bc_setup.o statistics.o time_integrators.o jacobians.o gmres_pack.o nonlinear_solvers.o $(MAIN).o +OBJECTS = fftw.o global.o allocate_vars.o precmod.o stringmod.o write_pack.o interpolation_pack.o mesh_pack.o imod.o bc_setup.o statistics.o time_integrators.o time_integrators_MPI.o jacobians.o gmres_pack.o nonlinear_solvers.o $(MAIN).o PROGRAMS = $(MAIN).exe all: $(PROGRAMS) @@ -53,6 +53,9 @@ statistics.o : statistics.f90 time_integrators.o : time_integrators.f90 $(FC) -fopenmp $(FFLAGS) time_integrators.f90 +time_integrators_MPI.o : time_integrators_MPI.f90 + $(FC) -fopenmp $(FFLAGS) time_integrators_MPI.f90 + jacobians.o : jacobians.f90 $(FC) $(FFLAGS) jacobians.f90 diff --git a/input.data b/input.data index 894b925..cc633a4 100755 --- a/input.data +++ b/input.data @@ -2,6 +2,6 @@ 5.0e+03, 1, 1.1 5.0, 0.1 -1.0, 1.0 -64, 51, 1 +64, 48, 1 0, 0, 0 1, 0, 0 diff --git a/mesh_pack.f90 b/mesh_pack.f90 index b7239b2..034c0cf 100755 --- a/mesh_pack.f90 +++ b/mesh_pack.f90 @@ -42,6 +42,51 @@ subroutine cosine_mesh(xcoor,ycoor,zcoor, numx,numy,numz,dxin) end subroutine cosine_mesh ! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +subroutine cosine_mesh_MPI(xcoor,ycoor,zcoor, numx,numy,numz, proc_id,num_procs,dxin) + +implicit none + +integer, intent(in) :: numx, numy, numz +integer, intent(in) :: proc_id, num_procs +integer :: mpierror +integer :: ii, jj, kk +real(dp) :: c +real(dp), optional, intent(in) :: dxin +real(dp) :: dxj +real(dp), allocatable, dimension(:), intent(out) :: xcoor, ycoor, zcoor +integer :: total_ny, start_ny + +if (present(dxin)) then + dxj = dxin +else + dxj = dx +end if + +allocate(xcoor(numx), stat=alloc_err) +allocate(ycoor(numy), stat=alloc_err) +allocate(zcoor(numz), stat=alloc_err) +call check_alloc_err(alloc_err) + +! Create the grid. +xcoor(1) = xL +zcoor(1) = 0.0_dp +total_ny = numy * num_procs +start_ny = proc_id * numy +do jj = 1,numy + c = (real(jj+start_ny,kind=dp) - 1.0_dp) / (real(total_ny,kind=dp) - 1.0_dp) + ycoor(jj) = -cos(c*pi) +end do +do ii = 2,numx + xcoor(ii) = xcoor(ii-1) + dxj +end do +do kk = 2,numz + zcoor(kk) = 0.0_dp +end do + +end subroutine cosine_mesh_MPI +! 
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + subroutine dxdydz(dyj, xcoor,ycoor,zcoor) implicit none diff --git a/time_integrators.f90 b/time_integrators.f90 index 6c49184..6536e92 100644 --- a/time_integrators.f90 +++ b/time_integrators.f90 @@ -54,236 +54,6 @@ subroutine imex_rk(vtk_print, save_nusselt) call write_to_vtk(0, .false.) ! false = Fourier space end if -dt = dt_init - -call init_bc(acoeffs(1,1)) - -time = 0.0_dp - -dtmax = 0.5_dp -dtmin = 1.0e-4_dp - -dt_ramp = 1.1_dp - -dt_old = dt - -nti = 0 - -! Format for writing out single values. -1000 format(E25.16E3) - -! Time integration -do ! while (time < t_final) - start_overall = OMP_GET_WTIME() - - dt_final = t_final - time - - if (dt_final <= dt) then - time = t_final - else - time = time + dt - end if - - write(*,*) "time = ", time, "dt = ", dt - - nti = nti + 1 - - !::::::::::: - ! STAGE 1 :: - !::::::::::: - phii = phi - Ti = T - uxi = ux - uyi = uy - start = OMP_GET_WTIME() - call calc_explicit(1) - finish = OMP_GET_WTIME() - write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)" - start = OMP_GET_WTIME() - !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) - do it = 1,Nx ! kx loop - ! Compute phi1 and T1 - call calc_vari_mod(tmp_phi, tmp_T, acoeffs(1,1), 1,& - kx(it), phi(2:Ny-1,it),& - K1hat_phi(2:Ny-1,it),K2hat_phi(2:Ny-1,it),K3hat_phi(2:Ny-1,it),& - K1hat_T(2:Ny-1,it),K2hat_T(2:Ny-1,it),K3hat_T(2:Ny-1,it),& - K1_phi(2:Ny-1,it), K2_phi(2:Ny-1,it), K1_T(2:Ny-1,it), K2_T(2:Ny-1,it),& - T(:,it)) - ! Compute v1 from phi1 - call calc_vi_mod(tmp_uy, tmp_phi, kx(it)) - ! BOUNDAY CONDITIONS! - call update_bcs_mod(tmp_phi1,tmp_uy1, tmp_phi,tmp_uy,dyv1_T(it),dyv2_T(it),& - dyv1_B(it),dyv2_B(it),& - V1(:,it),V2(:,it),phi1(:,it),phi2(:,it)) - tmp_phi = tmp_phi1 - tmp_uy = tmp_uy1 - call calc_implicit_mod(tmp_K_phi,tmp_K_T, tmp_phi,tmp_T, kx(it)) - K1_phi(:,it) = tmp_K_phi - K1_T(:,it) = tmp_K_T - ! Compute u1 from v1 - if (kx(it) /= 0.0_dp) then - !uxi(:,it) = -CI*d1y(tmp_uy)/kx(it) - uxi(:,it) = CI*d1y(tmp_uy)/kx(it) - else if (kx(it) == 0.0_dp) then - uxi(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! - end if - phii(:,it) = tmp_phi - Ti (:,it) = tmp_T - uyi (:,it) = tmp_uy - end do - !$OMP END PARALLEL DO - finish = OMP_GET_WTIME() - write(*,*) " - stage 1 mid timing: ", finish-start, "(s)" - ! Compute K2hat - start = OMP_GET_WTIME() - call calc_explicit(2) - finish = OMP_GET_WTIME() - write(*,*) " - calc_explicit(2) timing: ", finish-start, "(s)" - - !::::::::::: - ! STAGE 2 :: - !::::::::::: - start = OMP_GET_WTIME() - !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) - do it = 1,Nx ! kx loop - ! Compute phi2 and T2 - call calc_vari_mod(tmp_phi, tmp_T, acoeffs(2,2), 2,& - kx(it), phi(2:Ny-1,it),& - K1hat_phi(2:Ny-1,it),K2hat_phi(2:Ny-1,it),K3hat_phi(2:Ny-1,it),& - K1hat_T(2:Ny-1,it),K2hat_T(2:Ny-1,it),K3hat_T(2:Ny-1,it),& - K1_phi(2:Ny-1,it), K2_phi(2:Ny-1,it), K1_T(2:Ny-1,it), K2_T(2:Ny-1,it),& - T(:,it)) - ! Compute v1 from phi1 - call calc_vi_mod(tmp_uy, tmp_phi, kx(it)) - ! BOUNDAY CONDITIONS! - call update_bcs_mod(tmp_phi1,tmp_uy1, tmp_phi,tmp_uy,dyv1_T(it),dyv2_T(it),& - dyv1_B(it),dyv2_B(it),& - V1(:,it),V2(:,it),phi1(:,it),phi2(:,it)) - tmp_phi = tmp_phi1 - tmp_uy = tmp_uy1 - ! Compute K2_T and K2_phi - call calc_implicit_mod(tmp_K_phi,tmp_K_T, tmp_phi,tmp_T, kx(it)) - K2_phi(:,it) = tmp_K_phi - K2_T(:,it) = tmp_K_T - ! 
Compute u1 from v1 - if (kx(it) /= 0.0_dp) then - !uxi(:,it) = -CI*d1y(tmp_uy)/kx(it) - uxi(:,it) = CI*d1y(tmp_uy)/kx(it) - else if (kx(it) == 0.0_dp) then - uxi(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! - end if - phii(:,it) = tmp_phi - Ti (:,it) = tmp_T - uyi (:,it) = tmp_uy - end do - finish = OMP_GET_WTIME() - write(*,*) " - stage 2 mid timing: ", finish-start, "(s)" - ! Compute K3hat - start = OMP_GET_WTIME() - call calc_explicit(3) - finish = OMP_GET_WTIME() - write(*,*) " - calc_explicit(3) timing: ", finish-start, "(s)" - - !::::::::::: - ! STAGE 3 :: - !::::::::::: - start = OMP_GET_WTIME() - !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) - do it = 1,Nx ! kx loop - ! Compute phi3 and T3 - call calc_vari_mod(tmp_phi, tmp_T, acoeffs(3,3), 3,& - kx(it), phi(2:Ny-1,it),& - K1hat_phi(2:Ny-1,it),K2hat_phi(2:Ny-1,it),K3hat_phi(2:Ny-1,it),& - K1hat_T(2:Ny-1,it),K2hat_T(2:Ny-1,it),K3hat_T(2:Ny-1,it),& - K1_phi(2:Ny-1,it), K2_phi(2:Ny-1,it), K1_T(2:Ny-1,it), K2_T(2:Ny-1,it),& - T(:,it)) - ! Compute v1 from phi1 - call calc_vi_mod(tmp_uy, tmp_phi, kx(it)) - ! BOUNDAY CONDITIONS! - call update_bcs_mod(tmp_phi1,tmp_uy1, tmp_phi,tmp_uy,dyv1_T(it),dyv2_T(it),& - dyv1_B(it),dyv2_B(it),& - V1(:,it),V2(:,it),phi1(:,it),phi2(:,it)) - tmp_phi = tmp_phi1 - tmp_uy = tmp_uy1 - ! Compute K3_T and K3_phi - call calc_implicit_mod(tmp_K_phi,tmp_K_T, tmp_phi,tmp_T, kx(it)) - K3_phi(:,it) = tmp_K_phi - K3_T(:,it) = tmp_K_T - ! Compute u1 from v1 - if (kx(it) /= 0.0_dp) then - !uxi(:,it) = -CI*d1y(tmp_uy)/kx(it) - uxi(:,it) = CI*d1y(tmp_uy)/kx(it) - else if (kx(it) == 0.0_dp) then - uxi(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! - end if - phii(:,it) = tmp_phi - Ti (:,it) = tmp_T - uyi (:,it) = tmp_uy - end do - !$OMP END PARALLEL DO - finish = OMP_GET_WTIME() - write(*,*) " - stage 3 mid timing: ", finish-start, "(s)" - ! Compute K4hat - start = OMP_GET_WTIME() - call calc_explicit(4) - finish = OMP_GET_WTIME() - write(*,*) " - calc_explicit(4) timing: ", finish-start, "(s)" - - ! UPDATE SOLUTIONS - - start = OMP_GET_WTIME() - ! Get phi - phi(2:Ny-1,:) = phi(2:Ny-1,:) + dt*(b(1)*(K1_phi(2:Ny-1,:) + K2hat_phi(2:Ny-1,:)) + & - & b(2)*(K2_phi(2:Ny-1,:) + K3hat_phi(2:Ny-1,:)) + & - & b(3)*(K3_phi(2:Ny-1,:) + K4hat_phi(2:Ny-1,:))) - - ! Get temperature - T(2:Ny-1,:) = T(2:Ny-1,:) + dt*(b(1)*(K1_T(2:Ny-1,:) + K2hat_T(2:Ny-1,:)) + & - & b(2)*(K2_T(2:Ny-1,:) + K3hat_T(2:Ny-1,:)) + & - & b(3)*(K3_T(2:Ny-1,:) + K4hat_T(2:Ny-1,:))) - - ! Get ux and uy - !$OMP PARALLEL DO private(tmp_uy, it) schedule(dynamic) - do it = 1,Nx - ! Solve for v - call calc_vi_mod(tmp_uy, phi(:,it), kx(it)) - uy(:,it) = tmp_uy - ! Solve for u - if (kx(it) /= 0.0_dp) then - !ux(:,it) = -CI*d1y(tmp_uy)/kx(it) - ux(:,it) = CI*d1y(tmp_uy)/kx(it) - else if (kx(it) == 0.0_dp) then - ux(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! - end if - end do - !$OMP END PARALLEL DO - finish = OMP_GET_WTIME() - write(*,*) " - update sols timing: ", finish-start, "(s)" - - - if (time == t_final) then - exit - end if - - !call update_dt - if (wvtk) then - if (mod(nti,vtk_print) == 0) then - call write_to_vtk(nti, .false.) ! false = Fourier space - end if - end if - - ! Calculate nusselt number. - if (save_nusselt) then - call nusselt(nusselt_num, .true.) ! 
true = Fourier space
-        write(8000, fmt=1000) nusselt_num
-        flush(8000)
-    end if
-
-    finish_overall = OMP_GET_WTIME()
-    write(*,*) "overall timing: ", finish_overall-start_overall, "(s)"
-
-end do ! time loop
-
 end subroutine imex_rk
 
 !:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
new file mode 100644
index 0000000..8cf7fa8
--- /dev/null
+++ b/time_integrators_MPI.f90
@@ -0,0 +1,64 @@
+module time_integrators_MPI
+
+use fftw
+use global
+use write_pack
+use allocate_vars
+use bc_setup
+use statistics
+use omp_lib
+
+integer :: it, jt, kkt
+integer :: info
+real(dp) :: time, dtmax, dtmin, dt_old, dt_ramp, dt_final
+
+contains
+
+subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt)
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!! This program solves the equations of thermal convection using a Fourier
+!! spectral method in the x-direction and a 2nd order finite difference scheme in
+!! the y-direction.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+implicit none
+
+character(3), intent(in) :: proc_id_str
+integer, optional, intent(in) :: vtk_print
+logical, optional, intent(in) :: save_nusselt
+
+integer :: nti
+integer :: nprint
+logical :: wvtk
+real(dp) :: nusselt_num
+real(dp) :: start, finish
+real(dp) :: start_overall, finish_overall
+integer :: nthreads, myid
+integer, EXTERNAL :: OMP_GET_THREAD_NUM, OMP_GET_NUM_THREADS
+real(dp), EXTERNAL :: OMP_GET_WTIME
+EXTERNAL :: OMP_SET_NUM_THREADS
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+call OMP_SET_NUM_THREADS(1)
+
+if (present(vtk_print)) then
+    wvtk = .true.
+    nprint = vtk_print
+else
+    wvtk = .false.
+    nprint = 100
+end if
+write(*,*) "imex_rk_MPI from proc ", proc_id_str
+
+if (wvtk) then
+    call write_to_vtk(0, .false., proc_id_str) ! false = Fourier space
+end if
+
+
+end subroutine imex_rk_MPI
+
+
+end module time_integrators_MPI
+ 
\ No newline at end of file
diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90
index 222a632..380d93c 100644
--- a/time_loop_MPI.f90
+++ b/time_loop_MPI.f90
@@ -10,7 +10,7 @@ program time_loop
 use allocate_vars
 use statistics
 use mesh_pack
-use time_integrators
+use time_integrators_MPI
 
 implicit none
 
@@ -38,7 +38,7 @@ program time_loop
 integer :: mpierror, num_procs
 integer :: proc_id
 character(4) :: proc_id_str
-real(dp) :: mpi_spacing_y 
+real(dp) :: mpi_spacing_y
 integer status(MPI_STATUS_SIZE)
 
@@ -239,6 +239,9 @@ program time_loop
 
 write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows."
 
+! Get solution with time integration
+call imex_rk_MPI(proc_id_str, 1, .true.) ! true causes writing of nusselt number. 
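+! NOTE: imex_rk_MPI is, at this point in the series, a stub that only writes
+! the initial field for each rank; the IMEX-RK stepping itself has not yet
+! been ported over from time_integrators.f90.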
+ call MPI_Finalize(mpierror) end program time_loop diff --git a/write_pack.f90 b/write_pack.f90 index a450353..dd1311d 100755 --- a/write_pack.f90 +++ b/write_pack.f90 @@ -104,7 +104,7 @@ subroutine write_vtk_structured_grid(iter, process_id) write(citer, "(I10)") 1000000000 + iter fiter = citer(2:10) -n = (Nx+1)*Ny*Nx +n = (Nx+1)*Ny*Nz if (PRESENT(process_id)) then open(unit=2, file=vtkloc//vtkname//"_P"//process_id//"_"//fiter//gtype, action="write", status="replace") else @@ -211,7 +211,11 @@ subroutine write_to_vtk(step, physical, process_id) ux(j,:) = tux uy(j,:) = tuy end do - call write_vtk_structured_grid(step) + if (PRESENT(process_id)) then + call write_vtk_structured_grid(step, process_id) + else + call write_vtk_structured_grid(step) + endif do j = 1,Ny ! Bring everything to Fourier space tT = T(j,:) From 86844b7262b6b1668ad9560d39d233ee22f8b524 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Sat, 24 Apr 2021 10:58:24 -0400 Subject: [PATCH 08/24] reset time_integrators --- time_integrators.f90 | 233 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) diff --git a/time_integrators.f90 b/time_integrators.f90 index 6536e92..7cd60c9 100644 --- a/time_integrators.f90 +++ b/time_integrators.f90 @@ -54,6 +54,239 @@ subroutine imex_rk(vtk_print, save_nusselt) call write_to_vtk(0, .false.) ! false = Fourier space end if +dt = dt_init + +call init_bc(acoeffs(1,1)) + +time = 0.0_dp + +dtmax = 0.5_dp +dtmin = 1.0e-4_dp + +dt_ramp = 1.1_dp + +dt_old = dt + +nti = 0 + +! Format for writing out single values. +1000 format(E25.16E3) + +! Time integration +do ! while (time < t_final) + start_overall = OMP_GET_WTIME() + + dt_final = t_final - time + + if (dt_final <= dt) then + time = t_final + else + time = time + dt + end if + + write(*,*) "time = ", time, "dt = ", dt + + nti = nti + 1 + + !::::::::::: + ! STAGE 1 :: + !::::::::::: + phii = phi + Ti = T + uxi = ux + uyi = uy + start = OMP_GET_WTIME() + call calc_explicit(1) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)" + start = OMP_GET_WTIME() + !$OMP PARALLEL DO num_threads(16) private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + do it = 1,Nx ! kx loop + ! Compute phi1 and T1 + call calc_vari_mod(tmp_phi, tmp_T, acoeffs(1,1), 1,& + kx(it), phi(2:Ny-1,it),& + K1hat_phi(2:Ny-1,it),K2hat_phi(2:Ny-1,it),K3hat_phi(2:Ny-1,it),& + K1hat_T(2:Ny-1,it),K2hat_T(2:Ny-1,it),K3hat_T(2:Ny-1,it),& + K1_phi(2:Ny-1,it), K2_phi(2:Ny-1,it), K1_T(2:Ny-1,it), K2_T(2:Ny-1,it),& + T(:,it)) + ! Compute v1 from phi1 + call calc_vi_mod(tmp_uy, tmp_phi, kx(it)) + ! BOUNDAY CONDITIONS! + call update_bcs_mod(tmp_phi1,tmp_uy1, tmp_phi,tmp_uy,dyv1_T(it),dyv2_T(it),& + dyv1_B(it),dyv2_B(it),& + V1(:,it),V2(:,it),phi1(:,it),phi2(:,it)) + tmp_phi = tmp_phi1 + tmp_uy = tmp_uy1 + call calc_implicit_mod(tmp_K_phi,tmp_K_T, tmp_phi,tmp_T, kx(it)) + K1_phi(:,it) = tmp_K_phi + K1_T(:,it) = tmp_K_T + ! Compute u1 from v1 + if (kx(it) /= 0.0_dp) then + !uxi(:,it) = -CI*d1y(tmp_uy)/kx(it) + uxi(:,it) = CI*d1y(tmp_uy)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + phii(:,it) = tmp_phi + Ti (:,it) = tmp_T + uyi (:,it) = tmp_uy + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 1 mid timing: ", finish-start, "(s)" + ! 
Compute K2hat + start = OMP_GET_WTIME() + call calc_explicit(2) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(2) timing: ", finish-start, "(s)" + + !::::::::::: + ! STAGE 2 :: + !::::::::::: + start = OMP_GET_WTIME() + !$OMP PARALLEL DO num_threads(16) private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + do it = 1,Nx ! kx loop + ! Compute phi2 and T2 + call calc_vari_mod(tmp_phi, tmp_T, acoeffs(2,2), 2,& + kx(it), phi(2:Ny-1,it),& + K1hat_phi(2:Ny-1,it),K2hat_phi(2:Ny-1,it),K3hat_phi(2:Ny-1,it),& + K1hat_T(2:Ny-1,it),K2hat_T(2:Ny-1,it),K3hat_T(2:Ny-1,it),& + K1_phi(2:Ny-1,it), K2_phi(2:Ny-1,it), K1_T(2:Ny-1,it), K2_T(2:Ny-1,it),& + T(:,it)) + ! Compute v1 from phi1 + call calc_vi_mod(tmp_uy, tmp_phi, kx(it)) + ! BOUNDAY CONDITIONS! + call update_bcs_mod(tmp_phi1,tmp_uy1, tmp_phi,tmp_uy,dyv1_T(it),dyv2_T(it),& + dyv1_B(it),dyv2_B(it),& + V1(:,it),V2(:,it),phi1(:,it),phi2(:,it)) + tmp_phi = tmp_phi1 + tmp_uy = tmp_uy1 + ! Compute K2_T and K2_phi + call calc_implicit_mod(tmp_K_phi,tmp_K_T, tmp_phi,tmp_T, kx(it)) + K2_phi(:,it) = tmp_K_phi + K2_T(:,it) = tmp_K_T + ! Compute u1 from v1 + if (kx(it) /= 0.0_dp) then + !uxi(:,it) = -CI*d1y(tmp_uy)/kx(it) + uxi(:,it) = CI*d1y(tmp_uy)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + phii(:,it) = tmp_phi + Ti (:,it) = tmp_T + uyi (:,it) = tmp_uy + end do + finish = OMP_GET_WTIME() + write(*,*) " - stage 2 mid timing: ", finish-start, "(s)" + ! Compute K3hat + start = OMP_GET_WTIME() + call calc_explicit(3) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(3) timing: ", finish-start, "(s)" + + !::::::::::: + ! STAGE 3 :: + !::::::::::: + start = OMP_GET_WTIME() + !$OMP PARALLEL DO num_threads(16) private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + do it = 1,Nx ! kx loop + ! Compute phi3 and T3 + call calc_vari_mod(tmp_phi, tmp_T, acoeffs(3,3), 3,& + kx(it), phi(2:Ny-1,it),& + K1hat_phi(2:Ny-1,it),K2hat_phi(2:Ny-1,it),K3hat_phi(2:Ny-1,it),& + K1hat_T(2:Ny-1,it),K2hat_T(2:Ny-1,it),K3hat_T(2:Ny-1,it),& + K1_phi(2:Ny-1,it), K2_phi(2:Ny-1,it), K1_T(2:Ny-1,it), K2_T(2:Ny-1,it),& + T(:,it)) + ! Compute v1 from phi1 + call calc_vi_mod(tmp_uy, tmp_phi, kx(it)) + ! BOUNDAY CONDITIONS! + call update_bcs_mod(tmp_phi1,tmp_uy1, tmp_phi,tmp_uy,dyv1_T(it),dyv2_T(it),& + dyv1_B(it),dyv2_B(it),& + V1(:,it),V2(:,it),phi1(:,it),phi2(:,it)) + tmp_phi = tmp_phi1 + tmp_uy = tmp_uy1 + ! Compute K3_T and K3_phi + call calc_implicit_mod(tmp_K_phi,tmp_K_T, tmp_phi,tmp_T, kx(it)) + K3_phi(:,it) = tmp_K_phi + K3_T(:,it) = tmp_K_T + ! Compute u1 from v1 + if (kx(it) /= 0.0_dp) then + !uxi(:,it) = -CI*d1y(tmp_uy)/kx(it) + uxi(:,it) = CI*d1y(tmp_uy)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + phii(:,it) = tmp_phi + Ti (:,it) = tmp_T + uyi (:,it) = tmp_uy + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 3 mid timing: ", finish-start, "(s)" + ! Compute K4hat + start = OMP_GET_WTIME() + call calc_explicit(4) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(4) timing: ", finish-start, "(s)" + + ! UPDATE SOLUTIONS + + start = OMP_GET_WTIME() + ! Get phi + phi(2:Ny-1,:) = phi(2:Ny-1,:) + dt*(b(1)*(K1_phi(2:Ny-1,:) + K2hat_phi(2:Ny-1,:)) + & + & b(2)*(K2_phi(2:Ny-1,:) + K3hat_phi(2:Ny-1,:)) + & + & b(3)*(K3_phi(2:Ny-1,:) + K4hat_phi(2:Ny-1,:))) + + ! 
Get temperature
+    T(2:Ny-1,:) = T(2:Ny-1,:) + dt*(b(1)*(K1_T(2:Ny-1,:) + K2hat_T(2:Ny-1,:)) + &
+                                  & b(2)*(K2_T(2:Ny-1,:) + K3hat_T(2:Ny-1,:)) + &
+                                  & b(3)*(K3_T(2:Ny-1,:) + K4hat_T(2:Ny-1,:)))
+
+    ! Get ux and uy
+    !$OMP PARALLEL DO num_threads(16) private(tmp_uy, it) schedule(dynamic)
+    do it = 1,Nx
+        ! Solve for v
+        call calc_vi_mod(tmp_uy, phi(:,it), kx(it))
+        uy(:,it) = tmp_uy
+        ! Solve for u
+        if (kx(it) /= 0.0_dp) then
+            !ux(:,it) = -CI*d1y(tmp_uy)/kx(it)
+            ux(:,it) = CI*d1y(tmp_uy)/kx(it)
+        else if (kx(it) == 0.0_dp) then
+            ux(:,it) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow!
+        end if
+    end do
+    !$OMP END PARALLEL DO
+    finish = OMP_GET_WTIME()
+    write(*,*) " - update sols timing: ", finish-start, "(s)"
+
+
+    if (time == t_final) then
+        exit
+    end if
+
+    !call update_dt
+    ! Don't write vtk for now.
+    wvtk = .false.
+    if (wvtk) then
+        if (mod(nti,vtk_print) == 0) then
+            call write_to_vtk(nti, .false.) ! false = Fourier space
+        end if
+    end if
+
+    ! Calculate nusselt number.
+    if (save_nusselt) then
+        call nusselt(nusselt_num, .true.) ! true = Fourier space
+        write(8000, fmt=1000) nusselt_num
+        flush(8000)
+    end if
+
+    finish_overall = OMP_GET_WTIME()
+    write(*,*) "overall timing: ", finish_overall-start_overall, "(s)"
+
+end do ! time loop
+
+
 end subroutine imex_rk
 
 !:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!

From 9f0e205494576dc19f5f23c7efcd1452fec25254 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Mon, 26 Apr 2021 14:29:35 -0400
Subject: [PATCH 09/24] fixing spacing

---
 mesh_pack.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mesh_pack.f90 b/mesh_pack.f90
index 034c0cf..8d80ef2 100755
--- a/mesh_pack.f90
+++ b/mesh_pack.f90
@@ -49,7 +49,7 @@ subroutine cosine_mesh_MPI(xcoor,ycoor,zcoor, numx,numy,numz, proc_id,num_procs,
 
 integer, intent(in) :: numx, numy, numz
 integer, intent(in) :: proc_id, num_procs
-integer :: mpierror 
+integer :: mpierror
 integer :: ii, jj, kk
 real(dp) :: c

From 7afdcc56253eb44a158f8d6be9a4e6e01f3b19 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Mon, 26 Apr 2021 18:57:22 -0400
Subject: [PATCH 10/24] supporting MPI dynu calculations

---
 mesh_pack.f90     | 52 ++++++++++++++++++++++++++++++++++++++++++++++
 time_loop_MPI.f90 |  7 +++----
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/mesh_pack.f90 b/mesh_pack.f90
index 8d80ef2..8d937f5 100755
--- a/mesh_pack.f90
+++ b/mesh_pack.f90
@@ -1,6 +1,7 @@
 module mesh_pack
 
 use global
+include 'mpif.h'
 
 contains
@@ -107,6 +108,57 @@ subroutine dxdydz(dyj, xcoor,ycoor,zcoor)
 end do
 
 end subroutine dxdydz
+
+! :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+
+subroutine dxdydz_MPI(dyj, xcoor,ycoor,zcoor, proc_id, num_procs)
+
+implicit none
+
+real(dp), dimension(:), intent(in) :: xcoor, ycoor, zcoor
+integer, intent(in) :: proc_id, num_procs
+real(dp), allocatable, dimension(:), intent(out) :: dyj
+integer :: alloc_err, mpierror
+integer :: numy, jj
+real(dp) :: prev_element
+
+numy = size(ycoor)
+
+if (proc_id == 0) then
+    allocate(dyj(numy-1), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    ! Calculate grid spacing
+    do jj = 2,numy
+        dyj(jj-1) = ycoor(jj) - ycoor(jj-1)
+    end do
+    ! Send last element to next process to the right.
+    call MPI_SEND(ycoor(numy), 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, mpierror)
+else if (proc_id == num_procs - 1) then
+    ! Receive first element from next process to the left. 
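+    ! (Halo exchange: every rank except the last sends its final ycoor value to
+    ! the right, and every rank except rank 0 receives one ghost value from the
+    ! left, so that the spacing dyj(1) across the rank boundary can be formed.)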
+    call MPI_RECV(prev_element, 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+    allocate(dyj(numy), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    ! Calculate grid spacing
+    dyj(1) = ycoor(1) - prev_element
+    do jj = 2,numy
+        dyj(jj) = ycoor(jj) - ycoor(jj-1)
+    end do
+else
+    ! Receive first element from next process to the left.
+    call MPI_RECV(prev_element, 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+    ! Send last element to next process to the right.
+    call MPI_SEND(ycoor(numy), 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, mpierror)
+    allocate(dyj(numy), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    ! Calculate grid spacing
+    dyj(1) = ycoor(1) - prev_element
+    do jj = 2,numy
+        dyj(jj) = ycoor(jj) - ycoor(jj-1)
+    end do
+end if
+
+
+end subroutine dxdydz_MPI
 ! :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
 subroutine y_mesh_params
diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90
index 380d93c..c2fffd9 100644
--- a/time_loop_MPI.f90
+++ b/time_loop_MPI.f90
@@ -13,9 +13,8 @@ program time_loop
 use time_integrators_MPI
 
 implicit none
-include 'mpif.h'
 
-integer :: ntokens, ios
+integer :: ntokens, ios, i
 integer :: Nxc, Nyc, Nzc
 integer :: nRa, ii, jj
@@ -129,6 +128,7 @@ program time_loop
 mpi_spacing_y = (ytop - ybot) / num_procs
 ybot = ybot + proc_id * mpi_spacing_y
 ytop = ybot + mpi_spacing_y
+write(proc_id_str, "(I3.3)") proc_id
 
 ! Create FFT plans
 planuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_FORWARD,FFTW_ESTIMATE)
@@ -163,7 +163,7 @@ program time_loop
 
 call cosine_mesh_MPI(xp,yp,zp, Nx,Ny,Nz, proc_id, num_procs) ! get coordinates
-call dxdydz(dynu, xp,yp,zp) ! get mesh spacing for nonuniform grids
+call dxdydz_MPI(dynu, xp,yp,zp, proc_id, num_procs) ! get mesh spacing for nonuniform grids
 call y_mesh_params ! get metric coefficients for nonuniform grids
 dymin = minval(dynu)
 dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp)
@@ -179,7 +179,6 @@ program time_loop
 
 ! Write initial field to vtk
 if (wvtk) then
-    write(proc_id_str, "(I3.3)") proc_id
     call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space
 end if

From fdc01beb3bc6c81573d3b521e131b9c6d1f612ae Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Mon, 26 Apr 2021 22:14:42 -0400
Subject: [PATCH 11/24] supporting MPI construction of g1,g2,g3,h1,h2,h3

---
 mesh_pack.f90     |  89 +++++++++++++++++++++
 time_loop_MPI.f90 | 164 +++++++++++++++++++++++++++------------------
 2 files changed, 184 insertions(+), 69 deletions(-)

diff --git a/mesh_pack.f90 b/mesh_pack.f90
index 8d937f5..cec5b2d 100755
--- a/mesh_pack.f90
+++ b/mesh_pack.f90
@@ -192,6 +192,95 @@ subroutine y_mesh_params
 
 end subroutine y_mesh_params
 ! :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
+subroutine y_mesh_params_MPI (proc_id, num_procs)
+
+implicit none
+
+integer, intent(in) :: proc_id, num_procs
+real(dp) :: recv_val
+integer :: mpierror, jj
+
+! Calculate metric coefficients
+! g is for second derivatives
+! h is for first derivatives
+
+! First element
+if (proc_id == 0) then
+    call MPI_RECV(recv_val, 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+    ! 
First elems + g1(1) = 2.0_dp / (dynu(1)*(dynu(1)+dynu(2))) + g2(1) = -2.0_dp / (dynu(1)*dynu(2)) + g3(1) = 2.0_dp / (dynu(2)*(dynu(1)+dynu(2))) + + h1(1) = -(2.0_dp*dynu(1)+dynu(2)) / (dynu(1)*(dynu(1)+dynu(2))) + h2(1) = (dynu(1) + dynu(2)) / (dynu(1)*dynu(2)) + h3(1) = -dynu(1) / (dynu(2)*(dynu(1)+dynu(2))) + + ! Middle Ny-2 elements. + do jj = 2,Ny-1 + g1(jj) = 2.0_dp / (dynu(jj-1)*(dynu(jj)+dynu(jj-1))) + g2(jj) = -2.0_dp / (dynu(jj-1)*dynu(jj)) + g3(jj) = 2.0_dp / (dynu( jj )*(dynu(jj)+dynu(jj-1))) + + h1(jj) = -dynu(jj) / (dynu(jj-1)*(dynu(jj-1) + dynu(jj))) + h2(jj) = (dynu(jj) - dynu(jj-1)) / (dynu(jj-1)*dynu(jj)) + h3(jj) = dynu(jj-1) / (dynu(jj)*(dynu(jj-1) + dynu(jj))) + end do + + ! Last elems + g1(Ny) = 2.0_dp / (dynu(Ny-1)*(recv_val+dynu(Ny-1))) + g2(Ny) = -2.0_dp / (dynu(Ny-1)*recv_val) + g3(Ny) = 2.0_dp / (recv_val*(recv_val+dynu(Ny-1))) + + h1(Ny) = -recv_val / (dynu(Ny-1)*(dynu(Ny-1) + recv_val)) + h2(Ny) = (recv_val - dynu(Ny-1)) / (dynu(Ny-1)*recv_val) + h3(Ny) = dynu(Ny-1) / (recv_val*(dynu(Ny-1) + recv_val)) +else if (proc_id == num_procs - 1) then + call MPI_SEND(dynu(1), 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, mpierror) + do jj = 1,Ny-1 + g1(jj) = 2.0_dp / (dynu(jj)*(dynu(jj+1)+dynu(jj))) + g2(jj) = -2.0_dp / (dynu(jj)*dynu(jj+1)) + g3(jj) = 2.0_dp / (dynu(jj+1)*(dynu(jj+1)+dynu(jj))) + + h1(jj) = -dynu(jj+1) / (dynu(jj)*(dynu(jj) + dynu(jj+1))) + h2(jj) = (dynu(jj+1) - dynu(jj)) / (dynu(jj)*dynu(jj+1)) + h3(jj) = dynu(jj) / (dynu(jj+1)*(dynu(jj) + dynu(jj+1))) + end do + h1(Ny) = dynu(Ny) / (dynu(Ny-1)*(dynu(Ny-1)+dynu(Ny))) + h2(Ny) = -(dynu(Ny-1) + dynu(Ny)) / (dynu(Ny-1)*dynu(Ny)) + h3(Ny) = (2.0_dp*dynu(Ny) + dynu(Ny-1)) / (dynu(Ny)*(dynu(Ny-1) + dynu(Ny))) + + g1(Ny) = 2.0_dp / (dynu(Ny-1)*(dynu(Ny-1) + dynu(Ny))) + g2(Ny) = -2.0_dp / (dynu(Ny)*dynu(Ny-1)) + g3(Ny) = 2.0_dp / (dynu(Ny)*(dynu(Ny) + dynu(Ny-1))) + +else + call MPI_SEND(dynu(1), 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, mpierror) + call MPI_RECV(recv_val, 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! All but last elems. + do jj = 1,Ny-1 + g1(jj) = 2.0_dp / (dynu(jj)*(dynu(jj+1)+dynu(jj))) + g2(jj) = -2.0_dp / (dynu(jj)*dynu(jj+1)) + g3(jj) = 2.0_dp / (dynu(jj+1)*(dynu(jj+1)+dynu(jj))) + + h1(jj) = -dynu(jj+1) / (dynu(jj)*(dynu(jj) + dynu(jj+1))) + h2(jj) = (dynu(jj+1) - dynu(jj)) / (dynu(jj)*dynu(jj+1)) + h3(jj) = dynu(jj) / (dynu(jj+1)*(dynu(jj) + dynu(jj+1))) + end do + ! Last elems + g1(Ny) = 2.0_dp / (dynu(Ny)*(recv_val+dynu(Ny))) + g2(Ny) = -2.0_dp / (dynu(Ny)*recv_val) + g3(Ny) = 2.0_dp / (recv_val*(recv_val+dynu(Ny))) + + h1(Ny) = -recv_val / (dynu(Ny)*(dynu(Ny) + recv_val)) + h2(Ny) = (recv_val - dynu(Ny)) / (dynu(Ny)*recv_val) + h3(Ny) = dynu(Ny) / (recv_val*(dynu(Ny) + recv_val)) +end if + +end subroutine y_mesh_params_MPI +! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + subroutine uniform_mesh(xcoor,ycoor,zcoor, numx,numy,numz) implicit none diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90 index c2fffd9..e01673b 100644 --- a/time_loop_MPI.f90 +++ b/time_loop_MPI.f90 @@ -164,82 +164,108 @@ program time_loop call cosine_mesh_MPI(xp,yp,zp, Nx,Ny,Nz, proc_id, num_procs) ! get coordinates call dxdydz_MPI(dynu, xp,yp,zp, proc_id, num_procs) ! get mesh spacing for nonuniform grids -call y_mesh_params ! get metric coefficients for nonuniform grids -dymin = minval(dynu) -dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp) +call y_mesh_params_MPI(proc_id, num_procs) ! get metric coefficients for nonuniform grids -! 
Fourier modes -do ii = 1,Nx/2 - kx_modes(ii) = real(ii,kind=dp) - 1.0_dp -end do -do ii = Nx/2+1, Nx - kx_modes(ii) = real(ii-Nx,kind=dp) - 1.0_dp -end do -kx = alpha*kx_modes +open(unit=9000, file=proc_id_str//"g1.txt", action="write", status="unknown") +open(unit=9001, file=proc_id_str//"g2.txt", action="write", status="unknown") +open(unit=9002, file=proc_id_str//"g3.txt", action="write", status="unknown") +open(unit=9003, file=proc_id_str//"h1.txt", action="write", status="unknown") +open(unit=9004, file=proc_id_str//"h2.txt", action="write", status="unknown") +open(unit=9005, file=proc_id_str//"h3.txt", action="write", status="unknown") -! Write initial field to vtk -if (wvtk) then - call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space -end if +call MPI_BARRIER(MPI_COMM_WORLD, mpierror) -! Initialize fields. -call init_fields(ex_Tptrb, Ra) -call init_to_fourier(ex_Tptrb) - -! Get nu0 and kappa0 -call global_params_Ra(Ra) - -if (proc_id == 0) then - write(*,'(A70)') ' ' - write(*,'(A70)') '*********************************************************************|' - write(*,'(A70)') ' TWO-DIMENSIONAL THERMAL CONVECTION |' - write(*,'(A70)') '*********************************************************************|' - write(*,'(5X,A,47X,A)') 'COMPUTATION SIZE:', '|' - write(*,'(A69,A)') ' ','|' - write(*,'(20X,A23,I5,21X,A)') 'Nx = ', Nx, '|' - write(*,'(20X,A23,I5,21X,A)') 'Ny = ', Ny, '|' - write(*,'(20X,A23,I5,21X,A)') 'Nz = ', Nz, '|' - write(*,'(20X,A23,I5,21X,A)') 'Number of time steps = ', nt, '|' - write(*,'(20X,A23,I5,21X,A)') 'Number of Fourier modes = ', Nf, '|' - write(*,'(A69,A)') ' ','|' - write(*,'(A70)') '*********************************************************************|' - write(*,'(5X,A,45X,A)') 'PROBLEM PARAMETERS:', '|' - write(*,'(A69,A)') ' ','|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Prandtl number (Pr) = ', Pr, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Rayleigh number (Ra) = ', Ra, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Reynolds number (Re) = ', sqrt(Ra/(16.0_dp*Pr)), '|' - write(*,'(A69,A)') ' ','|' - write(*,'(A70)') '*********************************************************************|' - write(*,'(5X,A,42X,A)') 'PHYSICAL PROBLEM SIZE:', '|' - write(*,'(A69,A)') ' ','|' - write(*,'(10X,A32,ES16.8,11X,A)') 'alpha = ', alpha, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Left coordinate of box = ', xL, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Right coordinate of box = ', xR, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of bottom wall = ', ybot, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of top wall = ', ytop, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Box width = ', Lx, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Box height = ', Ly, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Aspect Ratio = ', Lx/Ly, '|' - write(*,'(A69,A)') ' ','|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Integration time (T) = ', t_final, '|' - write(*,'(10X,A32,ES16.8,11X,A)') 'Time step size (Delta t) = ', dt, '|' - write(*,'(A69,A)') ' ','|' - write(*,'(A70)') '*********************************************************************|' - write(*,'(A70)') '*********************************************************************|' - write(*,'(A70)') ' ' +do i=1,Ny + write (9000,*) g1(i) + write (9001,*) g2(i) + write (9002,*) g3(i) + write (9003,*) h1(i) + write (9004,*) h2(i) + write (9005,*) h3(i) +end do + +close(unit=9000) +close(unit=9001) +close(unit=9002) +close(unit=9003) +close(unit=9004) +close(unit=9005) + +! dymin = minval(dynu) +! 
dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp) + +! ! Fourier modes +! do ii = 1,Nx/2 +! kx_modes(ii) = real(ii,kind=dp) - 1.0_dp +! end do +! do ii = Nx/2+1, Nx +! kx_modes(ii) = real(ii-Nx,kind=dp) - 1.0_dp +! end do +! kx = alpha*kx_modes + +! ! Write initial field to vtk +! if (wvtk) then +! call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space +! end if + +! ! Initialize fields. +! call init_fields(ex_Tptrb, Ra) +! call init_to_fourier(ex_Tptrb) + +! ! Get nu0 and kappa0 +! call global_params_Ra(Ra) + +! if (proc_id == 0) then +! write(*,'(A70)') ' ' +! write(*,'(A70)') '*********************************************************************|' +! write(*,'(A70)') ' TWO-DIMENSIONAL THERMAL CONVECTION |' +! write(*,'(A70)') '*********************************************************************|' +! write(*,'(5X,A,47X,A)') 'COMPUTATION SIZE:', '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(20X,A23,I5,21X,A)') 'Nx = ', Nx, '|' +! write(*,'(20X,A23,I5,21X,A)') 'Ny = ', Ny, '|' +! write(*,'(20X,A23,I5,21X,A)') 'Nz = ', Nz, '|' +! write(*,'(20X,A23,I5,21X,A)') 'Number of time steps = ', nt, '|' +! write(*,'(20X,A23,I5,21X,A)') 'Number of Fourier modes = ', Nf, '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(A70)') '*********************************************************************|' +! write(*,'(5X,A,45X,A)') 'PROBLEM PARAMETERS:', '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Prandtl number (Pr) = ', Pr, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Rayleigh number (Ra) = ', Ra, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Reynolds number (Re) = ', sqrt(Ra/(16.0_dp*Pr)), '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(A70)') '*********************************************************************|' +! write(*,'(5X,A,42X,A)') 'PHYSICAL PROBLEM SIZE:', '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'alpha = ', alpha, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Left coordinate of box = ', xL, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Right coordinate of box = ', xR, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of bottom wall = ', ybot, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of top wall = ', ytop, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Box width = ', Lx, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Box height = ', Ly, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Aspect Ratio = ', Lx/Ly, '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Integration time (T) = ', t_final, '|' +! write(*,'(10X,A32,ES16.8,11X,A)') 'Time step size (Delta t) = ', dt, '|' +! write(*,'(A69,A)') ' ','|' +! write(*,'(A70)') '*********************************************************************|' +! write(*,'(A70)') '*********************************************************************|' +! write(*,'(A70)') ' ' - flush(6) - ! Print MPI division. - write(*,*) "Ny = ", Ny*num_procs, " divided among ", num_procs, " processors -> ", & - Ny, " rows per processor." -end if +! flush(6) +! ! Print MPI division. +! write(*,*) "Ny = ", Ny*num_procs, " divided among ", num_procs, " processors -> ", & +! Ny, " rows per processor." +! end if -call MPI_BARRIER(MPI_COMM_WORLD, mpierror) +! call MPI_BARRIER(MPI_COMM_WORLD, mpierror) -write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows." +! write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows." -! Get solution with time integration -call imex_rk_MPI(proc_id_str, 1, .true.) ! true causes writing of nusselt number. +! ! 
Get solution with time integration
! call imex_rk_MPI(proc_id_str, 1, .true.) ! true causes writing of nusselt number.

call MPI_Finalize(mpierror)


From b2864f5ec43e189f3ec4f7b04dc1687de4641733 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Tue, 27 Apr 2021 12:16:43 -0400
Subject: [PATCH 12/24] support boundary condition initialization in MPI

---
 bc_setup.f90             | 165 +++++++++++++++++++++++++++++++++++++++
 time_integrators_MPI.f90 |  22 +++++-
 time_loop_MPI.f90        | 161 ++++++++++++++++----------------------
 3 files changed, 253 insertions(+), 95 deletions(-)

diff --git a/bc_setup.f90 b/bc_setup.f90
index 10e412d..893aca1 100755
--- a/bc_setup.f90
+++ b/bc_setup.f90
@@ -4,6 +4,7 @@ module bc_setup
 use write_pack
 
 implicit none
+include 'mpif.h'
 
 contains
 
@@ -138,4 +139,168 @@ subroutine init_bc(aii)
 
 end subroutine init_bc
 
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str)
+
+implicit none
+
+real(dp), intent(in) :: aii
+integer, intent(in) :: proc_id, num_procs
+character(3), optional, intent(in) :: proc_id_str
+real(dp) :: pnu, qnu
+real(dp) :: wavex
+
+real(dp), allocatable, dimension(:) :: d, dl, du
+real(dp), allocatable, dimension(:) :: phi1_b, phi1_t
+real(dp), allocatable, dimension(:) :: g1_total, g2_total, g3_total
+real(dp), allocatable, dimension(:) :: recv_col, recv_col2, last_row_recv
+real(dp), allocatable, dimension(:) :: phi2_b, phi2_t
+real(dp), allocatable, dimension(:,:) :: Fphi12, FV12
+
+
+integer :: ii, jj, i, total_ny, otherproc, start
+integer :: info, mpierror
+
+! Only do allocation on main node.
+if(proc_id == 0) then
+    total_ny = Ny * num_procs
+
+    ! Allocate local variables
+    allocate(Fphi12(total_ny-2,2), FV12(total_ny-2,2), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(d(total_ny-2), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(dl(total_ny-3), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(du(total_ny-3), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(phi1_b(Nx), phi1_t(Nx), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(phi2_b(Nx), phi2_t(Nx), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+
+    ! Need to pull all of g1, g2, g3 onto main node for calculations.
+    allocate(g1_total(total_ny), g2_total(total_ny), g3_total(total_ny), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+
+    ! Receive g1,g2,g3 from all other nodes.
+    do otherproc = 1,num_procs-1
+        start = otherproc * Ny + 1
+        call MPI_RECV(g1_total(start), Ny, MPI_DOUBLE, otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+        call MPI_RECV(g2_total(start), Ny, MPI_DOUBLE, otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+        call MPI_RECV(g3_total(start), Ny, MPI_DOUBLE, otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+    end do
+    g1_total(1:Ny) = g1
+    g2_total(1:Ny) = g2
+    g3_total(1:Ny) = g3
+
+    Fphi12 = 0.0_dp
+    FV12 = 0.0_dp
+    d = 0.0_dp
+    dl = 0.0_dp
+    du = 0.0_dp
+    phi1_b = 1.0_dp
+    phi1_t = 0.0_dp
+    phi2_b = 0.0_dp
+    phi2_t = 1.0_dp
+
+    phi1_b(1) = 0.0_dp
+    phi2_t(1) = 0.0_dp
+
+    ! Compute phi1 and phi2 from D2 phi_i = 0 where
+    ! D2 = -(alpha*kx)^2 + dy^2 and phi1 satisfies the BCs
+    ! phi1 = 1 at the bottom wall and phi1 = 0 at the top wall.
+    ! phi2 satisfies phi2 = 0 at the bottom wall and phi2 = 1
+    ! at the top wall.
+
+    ! A comment on how these arrays are formed. Since we are only
+    ! dealing with Dirichlet conditions at the walls, we only need arrays of size
+    ! Ny-2, i.e.
we don't need the values at the walls. Thus, in the calculations + ! below, F(1) would correspond to point number 2 and F(Ny-2) would correspond + ! to point Ny-1. + + ! Some parameters + pnu = nu0*dt*aii + + do ii = 1,Nx + + Fphi12 = 0.0_dp + + if (abs(kx(ii)/alpha) > Nf/2) then + phi1_b(ii) = 0.0_dp + phi2_t(ii) = 0.0_dp + end if + + wavex = kx(ii) + + qnu = 1.0_dp + pnu*wavex**2.0_dp + + do jj = 2,total_ny-1 + d(jj-1) = qnu - pnu*g2_total(jj) + end do + + do jj = 2,total_ny-2 + du(jj-1) = -pnu*g3_total(jj) + end do + + do jj = 3,total_ny-1 + dl(jj-2) = -pnu*g1_total(jj) + end do + + Fphi12(1,1) = pnu*g1_total(2)*phi1_b(ii) + Fphi12(total_ny-2,2) = pnu*g3_total(total_ny-1)*phi2_t(ii) + + ! Solve the system Aphi phi = Fphi + call dgtsv(total_ny-2, 2, dl, d, du, Fphi12, total_ny-2, info) + + ! Put phi1 and phi2 together in main node. + phi1(1,ii) = phi1_b(ii) + phi1(2:Ny,ii) = Fphi12(1:Ny-1,1) + phi2(1,ii) = 0.0_dp + phi2(2:Ny,ii) = Fphi12(1:Ny-1,2) + + ! Send phi1 and phi2 columns to each node. + do otherproc = 1,num_procs-1 + start = otherproc * Ny + ! write(*,*) start, Fphi12 + call MPI_SEND(Fphi12(start,1), Ny, MPI_DOUBLE, otherproc, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(Fphi12(start,2), Ny, MPI_DOUBLE, otherproc, 46, MPI_COMM_WORLD, mpierror) + end do + end do + + ! Send last row to final proc. + call MPI_SEND(phi2_t(1), Nx, MPI_DOUBLE, num_procs-1, 47, MPI_COMM_WORLD, mpierror) + +else + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), 12, MPI_DOUBLE, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), 12, MPI_DOUBLE, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), 12, MPI_DOUBLE, 0, 44, MPI_COMM_WORLD, mpierror) + + allocate(recv_col(Ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(recv_col2(Ny), stat=alloc_err) + call check_alloc_err(alloc_err) + + ! Receive rows of phi1 and phi2. + do ii = 1,Nx + call MPI_RECV(recv_col, Ny, MPI_DOUBLE, 0, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(recv_col2, Ny, MPI_DOUBLE, 0, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + phi1(1:Ny, ii) = recv_col + phi2(1:Ny, ii) = recv_col2 + end do + + ! Handle boundary of last processor. + if (proc_id == num_procs - 1) then + allocate(last_row_recv(Nx), stat=alloc_err) + call check_alloc_err(alloc_err) + call MPI_RECV(last_row_recv, Nx, MPI_DOUBLE, 0, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + phi1(Ny,1:Nx) = 0.0_dp + phi2(Ny,1:Nx) = last_row_recv + end if +end if + +end subroutine init_bc_MPI + end module bc_setup diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90 index 8cf7fa8..0e27029 100644 --- a/time_integrators_MPI.f90 +++ b/time_integrators_MPI.f90 @@ -14,7 +14,7 @@ module time_integrators_MPI contains -subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt) +subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !! This progam solves the equations of thermal convection using a Fourier @@ -28,8 +28,9 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt) character(3), intent(in) :: proc_id_str integer, optional, intent(in) :: vtk_print logical, optional, intent(in) :: save_nusselt +integer, intent(in) :: proc_id, num_procs -integer :: nti +integer :: nti, i, j integer :: nprint logical :: wvtk real(dp) :: nusselt_num @@ -50,12 +51,29 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt) wvtk = .false. 
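!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! On the dgtsv calls in init_bc_MPI above: LAPACK's dgtsv solves a general
! tridiagonal system A*x = b and overwrites both the three diagonals and the
! right-hand side (the solution is returned in b), which is why dl, d, du are
! rebuilt before every solve. A minimal usage sketch (assumes LAPACK is
! linked; all names are local to the sketch):
!
!   program dgtsv_demo
!     implicit none
!     integer, parameter :: rk = kind(1.0d0)
!     real(rk) :: dl(2), d(3), du(2), b(3,1)
!     integer  :: info
!     d = 2.0_rk; dl = 1.0_rk; du = 1.0_rk     ! A = tridiag(1, 2, 1)
!     b(:,1) = [3.0_rk, 4.0_rk, 3.0_rk]        ! = A*[1,1,1]
!     call dgtsv(3, 1, dl, d, du, b, 3, info)
!     write(*,*) info, b(:,1)                  ! expect info = 0, x = 1 1 1
!   end program dgtsv_demo
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!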
nprint = 100 end if + write(*,*) "imex_rk_MPI from proc ", proc_id_str if (wvtk) then call write_to_vtk(0, .false., proc_id_str) ! false = Fourier space end if +dt = dt_init + +call init_bc_MPI(acoeffs(1,1), proc_id, num_procs, proc_id_str) + +open(unit=9000, file="P"//proc_id_str//"phi1.txt", action="write", status="unknown") +open(unit=9001, file="P"//proc_id_str//"phi2.txt", action="write", status="unknown") + +do i=1,Ny + do j=1,Nx + write (9000,*) phi1(i,j) + write (9001,*) phi2(i,j) + end do +end do + +close(unit=9000) +close(unit=9001) end subroutine imex_rk_MPI diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90 index e01673b..0e2956b 100644 --- a/time_loop_MPI.f90 +++ b/time_loop_MPI.f90 @@ -166,106 +166,81 @@ program time_loop call dxdydz_MPI(dynu, xp,yp,zp, proc_id, num_procs) ! get mesh spacing for nonuniform grids call y_mesh_params_MPI(proc_id, num_procs) ! get metric coefficients for nonuniform grids -open(unit=9000, file=proc_id_str//"g1.txt", action="write", status="unknown") -open(unit=9001, file=proc_id_str//"g2.txt", action="write", status="unknown") -open(unit=9002, file=proc_id_str//"g3.txt", action="write", status="unknown") -open(unit=9003, file=proc_id_str//"h1.txt", action="write", status="unknown") -open(unit=9004, file=proc_id_str//"h2.txt", action="write", status="unknown") -open(unit=9005, file=proc_id_str//"h3.txt", action="write", status="unknown") +dymin = minval(dynu) +dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp) -call MPI_BARRIER(MPI_COMM_WORLD, mpierror) - -do i=1,Ny - write (9000,*) g1(i) - write (9001,*) g2(i) - write (9002,*) g3(i) - write (9003,*) h1(i) - write (9004,*) h2(i) - write (9005,*) h3(i) +! Fourier modes +do ii = 1,Nx/2 + kx_modes(ii) = real(ii,kind=dp) - 1.0_dp end do +do ii = Nx/2+1, Nx + kx_modes(ii) = real(ii-Nx,kind=dp) - 1.0_dp +end do +kx = alpha*kx_modes + +! Write initial field to vtk +if (wvtk) then + call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space +end if -close(unit=9000) -close(unit=9001) -close(unit=9002) -close(unit=9003) -close(unit=9004) -close(unit=9005) - -! dymin = minval(dynu) -! dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp) - -! ! Fourier modes -! do ii = 1,Nx/2 -! kx_modes(ii) = real(ii,kind=dp) - 1.0_dp -! end do -! do ii = Nx/2+1, Nx -! kx_modes(ii) = real(ii-Nx,kind=dp) - 1.0_dp -! end do -! kx = alpha*kx_modes - -! ! Write initial field to vtk -! if (wvtk) then -! call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space -! end if - -! ! Initialize fields. -! call init_fields(ex_Tptrb, Ra) -! call init_to_fourier(ex_Tptrb) - -! ! Get nu0 and kappa0 -! call global_params_Ra(Ra) - -! if (proc_id == 0) then -! write(*,'(A70)') ' ' -! write(*,'(A70)') '*********************************************************************|' -! write(*,'(A70)') ' TWO-DIMENSIONAL THERMAL CONVECTION |' -! write(*,'(A70)') '*********************************************************************|' -! write(*,'(5X,A,47X,A)') 'COMPUTATION SIZE:', '|' -! write(*,'(A69,A)') ' ','|' -! write(*,'(20X,A23,I5,21X,A)') 'Nx = ', Nx, '|' -! write(*,'(20X,A23,I5,21X,A)') 'Ny = ', Ny, '|' -! write(*,'(20X,A23,I5,21X,A)') 'Nz = ', Nz, '|' -! write(*,'(20X,A23,I5,21X,A)') 'Number of time steps = ', nt, '|' -! write(*,'(20X,A23,I5,21X,A)') 'Number of Fourier modes = ', Nf, '|' -! write(*,'(A69,A)') ' ','|' -! write(*,'(A70)') '*********************************************************************|' -! write(*,'(5X,A,45X,A)') 'PROBLEM PARAMETERS:', '|' -! write(*,'(A69,A)') ' ','|' -! 
write(*,'(10X,A32,ES16.8,11X,A)') 'Prandtl number (Pr) = ', Pr, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Rayleigh number (Ra) = ', Ra, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Reynolds number (Re) = ', sqrt(Ra/(16.0_dp*Pr)), '|' -! write(*,'(A69,A)') ' ','|' -! write(*,'(A70)') '*********************************************************************|' -! write(*,'(5X,A,42X,A)') 'PHYSICAL PROBLEM SIZE:', '|' -! write(*,'(A69,A)') ' ','|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'alpha = ', alpha, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Left coordinate of box = ', xL, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Right coordinate of box = ', xR, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of bottom wall = ', ybot, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of top wall = ', ytop, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Box width = ', Lx, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Box height = ', Ly, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Aspect Ratio = ', Lx/Ly, '|' -! write(*,'(A69,A)') ' ','|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Integration time (T) = ', t_final, '|' -! write(*,'(10X,A32,ES16.8,11X,A)') 'Time step size (Delta t) = ', dt, '|' -! write(*,'(A69,A)') ' ','|' -! write(*,'(A70)') '*********************************************************************|' -! write(*,'(A70)') '*********************************************************************|' -! write(*,'(A70)') ' ' +! Initialize fields. +call init_fields(ex_Tptrb, Ra) +call init_to_fourier(ex_Tptrb) + +! Get nu0 and kappa0 +call global_params_Ra(Ra) + +if (proc_id == 0) then + write(*,'(A70)') ' ' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') ' TWO-DIMENSIONAL THERMAL CONVECTION |' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,47X,A)') 'COMPUTATION SIZE:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(20X,A23,I5,21X,A)') 'Nx = ', Nx, '|' + write(*,'(20X,A23,I5,21X,A)') 'Ny = ', Ny, '|' + write(*,'(20X,A23,I5,21X,A)') 'Nz = ', Nz, '|' + write(*,'(20X,A23,I5,21X,A)') 'Number of time steps = ', nt, '|' + write(*,'(20X,A23,I5,21X,A)') 'Number of Fourier modes = ', Nf, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,45X,A)') 'PROBLEM PARAMETERS:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Prandtl number (Pr) = ', Pr, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Rayleigh number (Ra) = ', Ra, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Reynolds number (Re) = ', sqrt(Ra/(16.0_dp*Pr)), '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,42X,A)') 'PHYSICAL PROBLEM SIZE:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'alpha = ', alpha, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Left coordinate of box = ', xL, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Right coordinate of box = ', xR, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of bottom wall = ', ybot, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of top wall = ', ytop, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Box width = ', Lx, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Box height = ', Ly, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Aspect Ratio = ', Lx/Ly, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Integration time (T) = ', t_final, '|' + 
write(*,'(10X,A32,ES16.8,11X,A)') 'Time step size (Delta t) = ', dt, '|'
+    write(*,'(A69,A)') ' ','|'
+    write(*,'(A70)') '*********************************************************************|'
+    write(*,'(A70)') '*********************************************************************|'
+    write(*,'(A70)') ' '
 
-!    flush(6)
-!    ! Print MPI division.
-!    write(*,*) "Ny = ", Ny*num_procs, " divided among ", num_procs, " processors -> ", &
-!        Ny, " rows per processor."
-! end if
+    flush(6)
+    ! Print MPI division.
+    write(*,*) "Ny = ", Ny*num_procs, " divided among ", num_procs, " processors -> ", &
+        Ny, " rows per processor."
+end if
 
-! call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
+call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
-! write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows."
+write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows."
 
-! ! Get solution with time integration
-! call imex_rk_MPI(proc_id_str, 1, .true.) ! true causes writing of nusselt number.
+! Get solution with time integration
+call imex_rk_MPI(proc_id_str, 1, .true., proc_id, num_procs) ! true causes writing of nusselt number.
 
 call MPI_Finalize(mpierror)
 

From c1f1f7e6fb6de24bf141505917c0c6fa50b8ffbf Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Tue, 27 Apr 2021 17:51:58 -0400
Subject: [PATCH 13/24] support V1 V2 instantiation with MPI

---
 bc_setup.f90             | 43 ++++++++++++++++++++++++++++++++++++++--
 time_integrators_MPI.f90 |  8 ++++----
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/bc_setup.f90 b/bc_setup.f90
index 893aca1..03346df 100755
--- a/bc_setup.f90
+++ b/bc_setup.f90
@@ -263,10 +263,39 @@ subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str)
         ! Send phi1 and phi2 columns to each node.
         do otherproc = 1,num_procs-1
             start = otherproc * Ny
-            ! write(*,*) start, Fphi12
             call MPI_SEND(Fphi12(start,1), Ny, MPI_DOUBLE, otherproc, 45, MPI_COMM_WORLD, mpierror)
             call MPI_SEND(Fphi12(start,2), Ny, MPI_DOUBLE, otherproc, 46, MPI_COMM_WORLD, mpierror)
         end do
+
+        ! Initialize V1 and V2.
+        FV12(:,1) = Fphi12(:,1)
+        FV12(:,2) = Fphi12(:,2)
+
+        do jj = 2,total_ny-1
+            d(jj-1) = -wavex**2.0_dp + g2_total(jj)
+        end do
+
+        do jj = 2,total_ny-2
+            du(jj-1) = g3_total(jj)
+        end do
+
+        do jj = 3,total_ny-1
+            dl(jj-2) = g1_total(jj)
+        end do
+
+        call dgtsv(total_ny-2, 2, dl, d, du, FV12, total_ny-2, info)
+
+        ! Place V1, V2 in main node memory.
+        V1(2:Ny,ii) = FV12(1:Ny-1,1)
+        V2(2:Ny,ii) = FV12(1:Ny-1,2)
+
+        ! Send V1 and V2 columns to each node.
+        do otherproc = 1,num_procs-1
+            start = otherproc * Ny
+            call MPI_SEND(FV12(start,1), Ny, MPI_DOUBLE, otherproc, 48, MPI_COMM_WORLD, mpierror)
+            call MPI_SEND(FV12(start,2), Ny, MPI_DOUBLE, otherproc, 49, MPI_COMM_WORLD, mpierror)
+        end do
+
     end do
 
     ! Send last row to final proc.
@@ -283,7 +312,7 @@ subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str)
     allocate(recv_col2(Ny), stat=alloc_err)
     call check_alloc_err(alloc_err)
 
-    ! Receive rows of phi1 and phi2.
+    ! Receive cols of phi1 and phi2.
     do ii = 1,Nx
         call MPI_RECV(recv_col, Ny, MPI_DOUBLE, 0, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
         call MPI_RECV(recv_col2, Ny, MPI_DOUBLE, 0, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
@@ -291,6 +320,14 @@ subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str)
         phi2(1:Ny, ii) = recv_col2
     end do
 
+    ! Receive cols of V1 and V2.
+ do ii = 1,Nx + call MPI_RECV(recv_col, Ny, MPI_DOUBLE, 0, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(recv_col2, Ny, MPI_DOUBLE, 0, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + V1(1:Ny, ii) = recv_col + V2(1:Ny, ii) = recv_col2 + end do + ! Handle boundary of last processor. if (proc_id == num_procs - 1) then allocate(last_row_recv(Nx), stat=alloc_err) @@ -298,6 +335,8 @@ subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str) call MPI_RECV(last_row_recv, Nx, MPI_DOUBLE, 0, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) phi1(Ny,1:Nx) = 0.0_dp phi2(Ny,1:Nx) = last_row_recv + V1(Ny, 1:Nx) = 0.0_dp + V2(Ny, 1:Nx) = 0.0_dp end if end if diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90 index 0e27029..9b6abec 100644 --- a/time_integrators_MPI.f90 +++ b/time_integrators_MPI.f90 @@ -62,13 +62,13 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) call init_bc_MPI(acoeffs(1,1), proc_id, num_procs, proc_id_str) -open(unit=9000, file="P"//proc_id_str//"phi1.txt", action="write", status="unknown") -open(unit=9001, file="P"//proc_id_str//"phi2.txt", action="write", status="unknown") +open(unit=9000, file="P"//proc_id_str//"V1.txt", action="write", status="unknown") +open(unit=9001, file="P"//proc_id_str//"V2.txt", action="write", status="unknown") do i=1,Ny do j=1,Nx - write (9000,*) phi1(i,j) - write (9001,*) phi2(i,j) + write (9000,*) V1(i,j) + write (9001,*) V2(i,j) end do end do From c37b311138a52671eca72faa9d599647ad052872 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Tue, 27 Apr 2021 18:24:50 -0400 Subject: [PATCH 14/24] calculate wall derivatives with MPI --- bc_setup.f90 | 13 +++++++++++++ time_integrators_MPI.f90 | 39 +++++++++++++++++++++++++++------------ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/bc_setup.f90 b/bc_setup.f90 index 03346df..31f5f56 100755 --- a/bc_setup.f90 +++ b/bc_setup.f90 @@ -340,6 +340,19 @@ subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str) end if end if +! Calculate wall derivative on first and last processor. 
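!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! The sums below use the one-sided rows of h1, h2, h3 from y_mesh_params_MPI:
! row 1 differences points (1,2,3) and row Ny differences (Ny-2,Ny-1,Ny), so
! they give second-order wall values of dV1/dy and dV2/dy. Only rank 0 owns
! the bottom wall and rank num_procs-1 the top wall, hence the two branches.
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!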
+if (proc_id == 0) then + do ii = 1,Nx + dyv1_B(ii) = h1(1)*V1(1,ii) + h2(1)*V1(2,ii) + h3(1)*V1(3,ii) + dyv2_B(ii) = h1(1)*V2(1,ii) + h2(1)*V2(2,ii) + h3(1)*V2(3,ii) + end do +else if (proc_id == num_procs - 1) then + do ii = 1,Nx + dyv1_T(ii) = h1(Ny)*V1(Ny-2,ii) + h2(Ny)*V1(Ny-1,ii) + h3(Ny)*V1(Ny,ii) + dyv2_T(ii) = h1(Ny)*V2(Ny-2,ii) + h2(Ny)*V2(Ny-1,ii) + h3(Ny)*V2(Ny,ii) + end do +end if + end subroutine init_bc_MPI end module bc_setup diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90 index 9b6abec..0498468 100644 --- a/time_integrators_MPI.f90 +++ b/time_integrators_MPI.f90 @@ -62,18 +62,33 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) call init_bc_MPI(acoeffs(1,1), proc_id, num_procs, proc_id_str) -open(unit=9000, file="P"//proc_id_str//"V1.txt", action="write", status="unknown") -open(unit=9001, file="P"//proc_id_str//"V2.txt", action="write", status="unknown") - -do i=1,Ny - do j=1,Nx - write (9000,*) V1(i,j) - write (9001,*) V2(i,j) - end do -end do - -close(unit=9000) -close(unit=9001) +if (proc_id == 0) then + open(unit=9000, file="P000dyv1_b.txt", action="write", status="unknown") + open(unit=9001, file="P000dyv2_b.txt", action="write", status="unknown") + + do i=1,Nx + write (9000,*) dyv1_B(i) + write (9001,*) dyv2_B(i) + end do + + close(unit=9000) + close(unit=9001) +end if + +if (proc_id == num_procs-1) then + open(unit=9000, file="P003dyv1_t.txt", action="write", status="unknown") + open(unit=9001, file="P003dyv2_t.txt", action="write", status="unknown") + + do i=1,Nx + write (9000,*) dyv1_T(i) + write (9001,*) dyv2_T(i) + end do + + close(unit=9000) + close(unit=9001) +end if + + end subroutine imex_rk_MPI From 5fa988a792fd9c419ae47712e8c8e151b5d268c8 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Thu, 29 Apr 2021 00:03:11 -0400 Subject: [PATCH 15/24] working on time loop --- bc_setup.f90 | 9 +- time_integrators_MPI.f90 | 575 +++++++++++++++++++++++++++++++++++++-- time_loop_MPI.f90 | 1 + 3 files changed, 556 insertions(+), 29 deletions(-) diff --git a/bc_setup.f90 b/bc_setup.f90 index 31f5f56..0c127eb 100755 --- a/bc_setup.f90 +++ b/bc_setup.f90 @@ -2,9 +2,10 @@ module bc_setup use global use write_pack +use mesh_pack implicit none -include 'mpif.h' +! include 'mpif.h' contains @@ -303,9 +304,9 @@ subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str) else ! Send g1, g2, g3 to first node. 
- call MPI_SEND(g1(1), 12, MPI_DOUBLE, 0, 42, MPI_COMM_WORLD, mpierror) - call MPI_SEND(g2(1), 12, MPI_DOUBLE, 0, 43, MPI_COMM_WORLD, mpierror) - call MPI_SEND(g3(1), 12, MPI_DOUBLE, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 44, MPI_COMM_WORLD, mpierror) allocate(recv_col(Ny), stat=alloc_err) call check_alloc_err(alloc_err) diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90 index 0498468..00da549 100644 --- a/time_integrators_MPI.f90 +++ b/time_integrators_MPI.f90 @@ -30,13 +30,20 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) logical, optional, intent(in) :: save_nusselt integer, intent(in) :: proc_id, num_procs -integer :: nti, i, j -integer :: nprint +integer :: nti, i, j, mpierror, total_ny +integer :: nprint, otherproc logical :: wvtk real(dp) :: nusselt_num real(dp) :: start, finish real(dp) :: start_overall, finish_overall -integer :: nthreads, myid +integer :: nthreads, myid, stind +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_phi_MPI, tmp_T_MPI, phi_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_phi_MPI, K2hat_phi_MPI, K3hat_phi_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_T_MPI, K2hat_T_MPI, K3hat_T_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: T_MPI, demo_T_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: phiout, Tout +real(dp), allocatable, dimension(:) :: g1_total, g2_total, g3_total integer, EXTERNAL :: OMP_GET_THREAD_NUM, OMP_GET_NUM_THREADS real(dp), EXTERNAL :: OMP_GET_WTIME EXTERNAL :: OMP_SET_NUM_THREADS @@ -62,36 +69,554 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) call init_bc_MPI(acoeffs(1,1), proc_id, num_procs, proc_id_str) -if (proc_id == 0) then - open(unit=9000, file="P000dyv1_b.txt", action="write", status="unknown") - open(unit=9001, file="P000dyv2_b.txt", action="write", status="unknown") - - do i=1,Nx - write (9000,*) dyv1_B(i) - write (9001,*) dyv2_B(i) - end do - - close(unit=9000) - close(unit=9001) +time = 0.0_dp + +dtmax = 0.5_dp +dtmin = 1.0e-4_dp + +dt_ramp = 1.1_dp + +dt_old = dt + +nti = 0 + +! 
Format for writing out single values.
+1000 format(E25.16E3)
+
+if (proc_id == 0) then
+    total_ny = Ny * num_procs
+    allocate(tmp_phi_MPI(total_ny), tmp_T_MPI(total_ny), phi_MPI(total_ny-2), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(K1hat_phi_MPI(total_ny-2), K2hat_phi_MPI(total_ny-2), K3hat_phi_MPI(total_ny-2), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(K1hat_T_MPI(total_ny-2), K2hat_T_MPI(total_ny-2), K3hat_T_MPI(total_ny-2), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(K1_phi_MPI(total_ny-2), K2_phi_MPI(total_ny-2), K1_T_MPI(total_ny-2), K2_T_MPI(total_ny-2), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(T_MPI(total_ny), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(g1_total(total_ny), g2_total(total_ny), g3_total(total_ny), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+end if
+
+allocate(phiout(Ny,Nx), Tout(Ny,Nx), stat=alloc_err)
+call check_alloc_err(alloc_err)
+
+call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
+
+! Time integration
+do ! while (time < t_final)
+    start_overall = OMP_GET_WTIME()
+
+    dt_final = t_final - time
+
+    if (dt_final <= dt) then
+        time = t_final
+    else
+        time = time + dt
+    end if
+
+    write(*,*) "time = ", time, "dt = ", dt
+
+    nti = nti + 1
+
+    !:::::::::::
+    ! STAGE 1 ::
+    !:::::::::::
+    phii = phi
+    Ti = T
+    uxi = ux
+    uyi = uy
+    start = OMP_GET_WTIME()
+    call calc_explicit_MPI(1, proc_id, num_procs, proc_id_str)
+    finish = OMP_GET_WTIME()
+    write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)"
+    start = OMP_GET_WTIME()
+    !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic)
+    do it = 1,1 ! kx loop
+        ! Compute phi1 and T1
+        if (proc_id == 0) then
+            ! Need to pull all of the variables into local main memory.
+            phi_MPI(1:Ny-1) = phi(2:Ny,it)
+            K1hat_phi_MPI(1:Ny-1) = K1hat_phi(2:Ny,it)
+            K2hat_phi_MPI(1:Ny-1) = K2hat_phi(2:Ny,it)
+            K3hat_phi_MPI(1:Ny-1) = K3hat_phi(2:Ny,it)
+            K1hat_T_MPI(1:Ny-1) = K1hat_T(2:Ny,it)
+            K2hat_T_MPI(1:Ny-1) = K2hat_T(2:Ny,it)
+            K3hat_T_MPI(1:Ny-1) = K3hat_T(2:Ny,it)
+            K1_phi_MPI(1:Ny-1) = K1_phi(2:Ny,it)
+            K2_phi_MPI(1:Ny-1) = K2_phi(2:Ny,it)
+            K1_T_MPI(1:Ny-1) = K1_T(2:Ny,it)
+            K2_T_MPI(1:Ny-1) = K2_T(2:Ny,it)
+            T_MPI(1:Ny) = T(1:Ny,it)
+
+            ! Receive pieces from other nodes.
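!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! Only interior points enter the implicit solve, so the gathered arrays have
! length total_ny-2: rank 0 contributes rows 2:Ny, interior ranks rows 1:Ny,
! and the last rank rows 1:Ny-1, which is what the offsets in the receive
! loop below encode. The same assembly could be done collectively; a hedged
! sketch (counts, displs, and the per-rank send buffer are assumptions, not
! the patch's code):
!
!   counts = Ny; counts(1) = Ny-1; counts(num_procs) = Ny-1
!   displs(1) = 0
!   do p = 2,num_procs
!       displs(p) = displs(p-1) + counts(p-1)
!   end do
!   call MPI_GATHERV(sendbuf, counts(proc_id+1), MPI_C_DOUBLE_COMPLEX, &
!                    phi_MPI, counts, displs, MPI_C_DOUBLE_COMPLEX, &
!                    0, MPI_COMM_WORLD, mpierror)
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!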
+ do otherproc = 1,num_procs-2 + stind = otherproc * Ny + call MPI_RECV(phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + + ! Receive from last node. + stind = (num_procs-1) * Ny + call MPI_RECV(phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + + ! Receive g1,g2,g3 from all other nodes. 
+ do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 54, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 55, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 56, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + ! Call vari mod + call calc_vari_mod_MPI(tmp_phi_MPI, tmp_T_MPI, acoeffs(1,1), 1, total_ny,& + kx(it), phi_MPI,& + K1hat_phi_MPI,K2hat_phi_MPI,K3hat_phi_MPI,& + K1hat_T_MPI,K2hat_T_MPI,K3hat_T_MPI,& + K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,& + g1_total, g2_total, g3_total,& + T_MPI) + do i = 1,total_ny + write(*,*) i, tmp_T_MPI(i) + end do + + else if (proc_id == num_procs - 1) then + ! Send data to main node + call MPI_SEND(phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + else + ! Send data to main node + call MPI_SEND(phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. 
+ call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + end if end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 1 mid timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + ! if (proc_id == 0) then + ! open(unit=9010, file="P"//proc_id_str//"phiout_real.txt", action="write", status="unknown") + ! open(unit=9011, file="P"//proc_id_str//"phiout_im.txt", action="write", status="unknown") + ! do i=1,total_ny + ! do j=1,Nx + ! write (9010,*) REAL(Tout(i,j)) + ! write (9011,*) AIMAG(Tout(i,j)) + ! end do + ! end do + ! close(unit=9010) + ! close(unit=9011) + ! end if + write(*,*) "done writing phi!" + if (time == t_final) then + exit + end if - close(unit=9000) - close(unit=9001) -end if + + end do +end subroutine imex_rk_MPI +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -end subroutine imex_rk_MPI +subroutine calc_explicit_MPI(stage, proc_id, num_procs, proc_id_str) + +integer :: i, j +integer, intent(in) :: stage, proc_id, num_procs +character(3), intent(in) :: proc_id_str +real(dp) :: start, finish + +start = OMP_GET_WTIME() +select case(stage) + case (1) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K1hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO + case (2) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K2hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO + case (3) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K3hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO + case (4) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K4hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO +end select +finish = OMP_GET_WTIME() +write(*,*) " - - l1 timing: ", finish-start, "(s)" + +start = OMP_GET_WTIME() +!$OMP PARALLEL DO schedule(dynamic) +do i=1,Nx + ! Compute dx(T) in Fourier space + nlT (:,i) = kx(i)*Ti(:,i) + ! Compute D2(ux) + nlphi(:,i) = -kx(i)**2.0_dp*uxi(:,i) + d2y(uxi(:,i)) +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() + +write(*,*) " - - l2 timing: ", finish-start, "(s)" + +!nlT = -CI*nlT +nlT = CI*nlT + +start = OMP_GET_WTIME() +!$OMP PARALLEL DO private(tnlT, tnlphi, tT, tux, tuy, tphi) schedule(dynamic) +do j = 1,Ny + ! Bring everything to physical space + tnlT = nlT(j,:) + tnlphi = nlphi(j,:) + tT = Ti(j,:) + tux = uxi(j,:) + tuy = uyi(j,:) + tphi = phii(j,:) + call fftw_execute_dft(iplannlT, tnlT, tnlT) + call fftw_execute_dft(iplannlphi, tnlphi, tnlphi) + call fftw_execute_dft(iplanT, tT, tT) + call fftw_execute_dft(iplanux, tux, tux) + call fftw_execute_dft(iplanuy, tuy, tuy) + call fftw_execute_dft(iplanphi, tphi, tphi) + nlT(j,:) = tnlT + nlphi(j,:) = tnlphi + Ti(j,:) = tT + uxi(j,:) = tux + uyi(j,:) = tuy + phii(j,:) = tphi +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() +write(*,*) " - - l3 timing: ", finish-start, "(s)" + +! Calculate nonlinear term +start = OMP_GET_WTIME() +!$OMP PARALLEL DO private(tmp_T) schedule(dynamic) +do i = 1,Nx + ! Temperature + tmp_T = Ti(:,i) + nlT(:,i) = uxi(:,i)*nlT(:,i) + uyi(:,i)*d1y(tmp_T) + ! phi + nlphi(:,i) = uxi(:,i)*phii(:,i) - uyi(:,i)*nlphi(:,i) +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() +write(*,*) " - - l4 timing: ", finish-start, "(s)" + +! 
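!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! Why the dealiasing in the next block: the products formed above (e.g.
! uxi*nlT) are quadratic, so in physical space they generate modes up to
! twice the retained cutoff Nf/2; after transforming back, every mode with
! |kx|/alpha >= Nf/2 is zeroed so that aliased energy cannot fold back onto
! the retained spectrum.
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!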
Bring nonlinear terms back to Fourier space +start = OMP_GET_WTIME() +!$OMP PARALLEL DO private(tnlT, tnlphi) schedule(dynamic) +do j = 1,Ny + tnlT = nlT(j,:) + tnlphi = nlphi(j,:) + call fftw_execute_dft(plannlT, tnlT, tnlT) + call fftw_execute_dft(plannlphi, tnlphi, tnlphi) + ! Dealias + do i = 1,Nx + if (abs(kx(i))/alpha >= Nf/2) then + tnlT(i) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + tnlphi(i) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + end if + end do + nlT(j,:) = tnlT + nlphi(j,:) = tnlphi +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() +write(*,*) " - - l5 timing: ", finish-start, "(s)" + +nlT = nlT / real(Nx,kind=dp) +nlphi = nlphi / real(Nx,kind=dp) + +start = OMP_GET_WTIME() +select case (stage) + case (1) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K1hat_phi(:,i) = K1hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K1hat_phi(:,i) = K1hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K1hat_T = -nlT + case (2) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K2hat_phi(:,i) = K2hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K2hat_phi(:,i) = K2hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K2hat_T = -nlT + case (3) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K3hat_phi(:,i) = K3hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K3hat_phi(:,i) = K3hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K3hat_T = -nlT + case (4) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K4hat_phi(:,i) = K4hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K4hat_phi(:,i) = K4hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K4hat_T = -nlT +end select +finish = OMP_GET_WTIME() +write(*,*) " - - l6 timing: ", finish-start, "(s)" + +end subroutine calc_explicit_MPI + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! 
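!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! calc_vari_mod_MPI below advances the implicit (diffusive) part of one IMEX
! stage for a single Fourier mode kx: with D2 the nonuniform second-derivative
! stencil (g1, g2, g3), it solves the tridiagonal systems
!
!   (1 - kappa0*dt*aii*(-kx**2 + D2)) * T_new   = FT
!   (1 - nu0  *dt*aii*(-kx**2 + D2)) * phi_new = Fphi
!
! over the total_ny-2 interior points, with the real and imaginary parts
! carried as the two right-hand-side columns passed to dgtsv.
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!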
+subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & + k1hat_phi_in, k2hat_phi_in, k3hat_phi_in,& + k1hat_T_in, k2hat_T_in, k3hat_T_in,& + k1_phi_in, k2_phi_in, k1_T_in, k2_T_in,& + g1_total, g2_total, g3_total,& + T_in) + +real(dp), intent(in) :: aii +integer, intent(in) :: stage, total_ny +real(dp), intent(in) :: kx_it +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1hat_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2hat_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k3hat_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1hat_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2hat_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k3hat_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny), intent(in) :: T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2_T_in +real(dp), dimension(total_ny), intent(in) :: g1_total, g2_total, g3_total +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:), intent(out) :: phiout, Tout +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: Fphi, FT +real(dp), allocatable, dimension(:,:) :: phi_rhs, T_rhs +real(dp), allocatable, dimension(:) :: dphi, duphi, dlphi +real(dp), allocatable, dimension(:) :: ddT, duT, dlT +integer :: i + +allocate(dphi(total_ny-2), ddT(total_ny-2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(duphi(total_ny-3), dlphi(total_ny-3), duT(total_ny-3), dlT(total_ny-3), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(phiout(total_ny), Tout(total_ny), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(Fphi(total_ny-2), FT(total_ny-2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(phi_rhs(total_ny-2,2), T_rhs(total_ny-2,2), stat=alloc_err) +call check_alloc_err(alloc_err) + +dphi = 0.0_dp +ddT = 0.0_dp +duphi = 0.0_dp +dlphi = 0.0_dp +duT = 0.0_dp +dlT = 0.0_dp +phi_rhs = 0.0_dp +T_rhs = 0.0_dp +Fphi = (0.0_dp, 0.0_dp) +FT = (0.0_dp, 0.0_dp) +phiout = (0.0_dp, 0.0_dp) +Tout = (0.0_dp, 0.0_dp) + +! LHS Matrix (tridiagonal, not necessarily symmetric) +do jt = 2,total_ny-1 + ddT (jt-1) = 1.0_dp - kappa0*dt*aii*(-kx_it**2.0_dp + g2_total(jt)) + dphi(jt-1) = 1.0_dp - nu0 *dt*aii*(-kx_it**2.0_dp + g2_total(jt)) +end do + +! do i = 2,total_ny-1 +! write(*,*) i, ddT (i-1) +! end do + +! write(*,*) kappa0 ,dt , aii + + +do jt = 2,total_ny-2 + duT (jt-1) = -kappa0*dt*g3_total(jt)*aii + duphi(jt-1) = -nu0 *dt*g3_total(jt)*aii +end do + +do jt = 3,total_ny-1 + dlT (jt-2) = -kappa0*dt*g1_total(jt)*aii + dlphi(jt-2) = -nu0 *dt*g1_total(jt)*aii +end do + +select case (stage) + case(1) + Fphi = phi_in + dt*ahatcoeffs(2,1)*k1hat_phi_in + FT = T_in(2:total_ny-1) + dt*ahatcoeffs(2,1)*k1hat_T_in + FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) + FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + + phi_rhs(:,1) = real(Fphi) + phi_rhs(:,2) = aimag(Fphi) + + T_rhs (:,1) = real(FT) + T_rhs (:,2) = aimag(FT) + + call dgtsv(total_ny-2, 2, dlT, ddT, duT, T_rhs, total_ny-2, info) + Tout(2:total_ny-1) = cmplx(T_rhs(:,1), T_rhs(:,2), kind=C_DOUBLE_COMPLEX) + ! 
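!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! Because T is prescribed at the walls, the known wall values are folded into
! the right-hand side above: the first interior equation gains
! kappa0*dt*aii*g1_total(2)*T_in(1) and the last interior equation gains
! kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny), so the tridiagonal system
! never contains the boundary rows; the wall values are reattached just below.
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!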
+        ! Set temperature boundary conditions
+        Tout(1) = T_in(1)
+        Tout(total_ny) = T_in(total_ny)
+
+        call dgtsv(total_ny-2, 2, dlphi, dphi, duphi, phi_rhs, total_ny-2, info)
+        phiout(2:total_ny-1) = cmplx(phi_rhs(:,1), phi_rhs(:,2), kind=C_DOUBLE_COMPLEX)
+
+    case(2)
+        Fphi = phi_in + dt*(acoeffs(2,1)*k1_phi_in + &
+               &            ahatcoeffs(3,1)*k1hat_phi_in + &
+               &            ahatcoeffs(3,2)*k2hat_phi_in)
+        FT = T_in(2:total_ny-1) + dt*(acoeffs(2,1)*k1_T_in + &
+               &          ahatcoeffs(3,1)*k1hat_T_in + &
+               &          ahatcoeffs(3,2)*k2hat_T_in)
+        FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1)
+        FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny)
+
+        phi_rhs(:,1) = real(Fphi)
+        phi_rhs(:,2) = aimag(Fphi)
+
+        T_rhs (:,1) = real(FT)
+        T_rhs (:,2) = aimag(FT)
+
+        call dgtsv(total_ny-2, 2, dlT, ddT, duT, T_rhs, total_ny-2, info)
+        Tout(2:total_ny-1) = cmplx(T_rhs(:,1), T_rhs(:,2), kind=C_DOUBLE_COMPLEX)
+        ! Set temperature boundary conditions
+        Tout(1) = T_in(1)
+        Tout(total_ny) = T_in(total_ny)
+
+        call dgtsv(total_ny-2, 2, dlphi, dphi, duphi, phi_rhs, total_ny-2, info)
+        phiout(2:total_ny-1) = cmplx(phi_rhs(:,1), phi_rhs(:,2), kind=C_DOUBLE_COMPLEX)
+
+    case(3)
+        Fphi = phi_in + dt*(acoeffs(3,1)*k1_phi_in + &
+               &acoeffs(3,2)*k2_phi_in + &
+               &ahatcoeffs(4,1)*k1hat_phi_in + &
+               &ahatcoeffs(4,2)*k2hat_phi_in + &
+               &ahatcoeffs(4,3)*k3hat_phi_in)
+        FT = T_in(2:total_ny-1) + dt*(acoeffs(3,1)*k1_T_in + &
+               &acoeffs(3,2)*k2_T_in + &
+               &ahatcoeffs(4,1)*k1hat_T_in + &
+               &ahatcoeffs(4,2)*k2hat_T_in + &
+               &ahatcoeffs(4,3)*k3hat_T_in)
+        FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1)
+        FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny)
+
+        phi_rhs(:,1) = real(Fphi)
+        phi_rhs(:,2) = aimag(Fphi)
+
+        T_rhs (:,1) = real(FT)
+        T_rhs (:,2) = aimag(FT)
+
+        call dgtsv(total_ny-2, 2, dlT, ddT, duT, T_rhs, total_ny-2, info)
+        Tout(2:total_ny-1) = cmplx(T_rhs(:,1), T_rhs(:,2), kind=C_DOUBLE_COMPLEX)
+        ! Set temperature boundary conditions
+        Tout(1) = T_in(1)
+        Tout(total_ny) = T_in(total_ny)
+
+        call dgtsv(total_ny-2, 2, dlphi, dphi, duphi, phi_rhs, total_ny-2, info)
+        phiout(2:total_ny-1) = cmplx(phi_rhs(:,1), phi_rhs(:,2), kind=C_DOUBLE_COMPLEX)
+
+end select
+
+end subroutine calc_vari_mod_MPI
+!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
+!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
 end module time_integrators_MPI
\ No newline at end of file
diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90
index 0e2956b..5201f65 100644
--- a/time_loop_MPI.f90
+++ b/time_loop_MPI.f90
@@ -12,6 +12,7 @@ program time_loop
 use mesh_pack
 use time_integrators_MPI
 
+!
include 'mpif.h' implicit none integer :: ntokens, ios, i From e4fea8830c1ea4baa16747cc2a0bf21e35391671 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Thu, 29 Apr 2021 17:12:05 -0400 Subject: [PATCH 16/24] finished stage 1 --- global.f90 | 90 ++++++++- time_integrators_MPI.f90 | 386 +++++++++++++++++++++++++++++++++------ 2 files changed, 424 insertions(+), 52 deletions(-) diff --git a/global.f90 b/global.f90 index e94d8d8..96ca400 100755 --- a/global.f90 +++ b/global.f90 @@ -314,7 +314,7 @@ function d1y(vec) result(d1yvec) complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: vec complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: d1yvec -integer :: alloc_err, jj, n +integer :: alloc_err, jj, n, i n = size(vec) @@ -336,6 +336,35 @@ end function d1y !::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +function d1y_MPI(vec, h1_total, h2_total, h3_total) result(d1yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: vec +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: d1yvec +real(dp), dimension(:), intent(in) :: h1_total, h2_total, h3_total +integer :: alloc_err, jj, n, i + +n = size(vec) + +allocate(d1yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d1y." + stop +end if + +d1yvec = 0.0_dp + +d1yvec(1) = h1_total(1)*vec(1) + h2_total(1)*vec(2) + h3_total(1)*vec(3) +do jj = 2,n-1 + d1yvec(jj) = h1_total(jj)*vec(jj-1) + h2_total(jj)*vec(jj) + h3_total(jj)*vec(jj+1) +end do +d1yvec(n) = h1_total(n)*vec(n-2) + h2_total(n)*vec(n-1) + h3_total(n)*vec(n) + +end function d1y_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + function d2y(vec) result(d2yvec) implicit none @@ -389,6 +418,65 @@ function d2y(vec) result(d2yvec) end function d2y +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +function d2y_MPI(vec, dynu_in, g1_total, g2_total, g3_total, total_ny) result(d2yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), intent(in) :: vec(:) +complex(C_DOUBLE_COMPLEX), allocatable :: d2yvec(:) +real(dp), dimension(:), intent(in) :: dynu_in, g1_total, g2_total, g3_total +integer, intent(in) :: total_ny +real(dp) :: d1, d2, d3, d4 +real(dp) :: ell1, ell2, ell3, ell4 +real(dp) :: dNm3, dNm2, dNm1, dN +real(dp) :: ellNm3, ellNm2, ellNm1, ellN +integer :: jj, n + +n = size(vec) + +allocate(d2yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d2y." 
+ stop +end if + +d2yvec = 0.0_dp + +d1 = -dynu_in(1)*(dynu_in(1)+dynu_in(2))*(dynu_in(1)+dynu_in(2)+dynu_in(3)) +d2 = dynu_in(1)*dynu_in(2)*(dynu_in(2) + dynu_in(3)) +d3 = -dynu_in(2)*dynu_in(3)*(dynu_in(1) + dynu_in(2)) +d4 = dynu_in(3)*(dynu_in(2)+dynu_in(3))*(dynu_in(1)+dynu_in(2)+dynu_in(3)) + +ell1 = -2.0_dp*(3.0_dp*dynu_in(1) + 2.0_dp*dynu_in(2) + dynu_in(3)) / d1 +ell2 = -2.0_dp*(2.0_dp*(dynu_in(1) + dynu_in(2)) + dynu_in(3)) / d2 +ell3 = -2.0_dp*(2.0_dp*dynu_in(1) + dynu_in(2) + dynu_in(3)) / d3 +ell4 = -2.0_dp*(2.0_dp*dynu_in(1) + dynu_in(2)) / d4 + +dN = dynu_in(total_ny-1)*(dynu_in(total_ny-1)+dynu_in(total_ny-2))*(dynu_in(total_ny-1)+dynu_in(total_ny-2)+dynu_in(total_ny-3)) +dNm1 = -dynu_in(total_ny-1)*dynu_in(total_ny-2)*(dynu_in(total_ny-2)+dynu_in(total_ny-3)) +dNm2 = dynu_in(total_ny-2)*dynu_in(total_ny-3)*(dynu_in(total_ny-1)+dynu_in(total_ny-2)) +dNm3 = -dynu_in(total_ny-3)*(dynu_in(total_ny-2)+dynu_in(total_ny-3))*(dynu_in(total_ny-1)+dynu_in(total_ny-2)+dynu_in(total_ny-3)) + +ellN = 2.0_dp*(3.0_dp*dynu_in(total_ny-1) + 2.0_dp*dynu_in(total_ny-2) + dynu_in(total_ny-3)) / dN +ellNm1 = 2.0_dp*(2.0_dp*(dynu_in(total_ny-1)+dynu_in(total_ny-2)) + dynu_in(total_ny-3)) / dNm1 +ellNm2 = 2.0_dp*(2.0_dp*dynu_in(total_ny-1) + dynu_in(total_ny-2) + dynu_in(total_ny-3)) / dNm2 +ellNm3 = 2.0_dp*(2.0_dp*dynu_in(total_ny-1) + dynu_in(total_ny-2)) / dNm3 + +!d2yvec(1) = g1(1)*vec(1) + g2(1)*vec(2) + g3(1)*vec(3) +d2yvec(1) = ell1*vec(1) + ell2*vec(2) + ell3*vec(3) + ell4*vec(4) +do jj = 2,n-1 + d2yvec(jj) = g1_total(jj)*vec(jj-1) + g2_total(jj)*vec(jj) + g3_total(jj)*vec(jj+1) +end do +!d2yvec(n) = g1(n)*vec(n-2) + g2(n)*vec(n-1) + g3(n)*vec(n) +d2yvec(n) = ellNm3*vec(n-3) + ellNm2*vec(n-2) + & + &ellNm1*vec(n-1) + ellN*vec(n) + +end function d2y_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + subroutine check_alloc_err(err_mess) integer, intent(in) :: err_mess diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90 index 00da549..9bbb35d 100644 --- a/time_integrators_MPI.f90 +++ b/time_integrators_MPI.f90 @@ -41,9 +41,13 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_phi_MPI, K2hat_phi_MPI, K3hat_phi_MPI complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_T_MPI, K2hat_T_MPI, K3hat_T_MPI complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI -complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: T_MPI, demo_T_MPI -complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: phiout, Tout -real(dp), allocatable, dimension(:) :: g1_total, g2_total, g3_total +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: T_MPI, tmp_uy_MPI, tmp_uy1_MPI, tmp_phi1_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_K_phi_MPI, tmp_K_T_MPI, uxi_MPI +real(dp), allocatable, dimension(:) :: phi1_MPI, phi2_MPI, V1_MPI, V2_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: single_phiout, single_Tout +real(dp), allocatable, dimension(:) :: g1_total, g2_total, g3_total, dynu_MPI +real(dp), allocatable, dimension(:) :: h1_total, h2_total, h3_total +real(dp) :: dyv1_T_it_MPI, dyv2_T_it_MPI, h1_end_MPI, h2_end_MPI, h3_end_MPI integer, EXTERNAL :: OMP_GET_THREAD_NUM, OMP_GET_NUM_THREADS real(dp), EXTERNAL :: OMP_GET_WTIME EXTERNAL :: OMP_SET_NUM_THREADS @@ -95,13 +99,20 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) call check_alloc_err(alloc_err) 
allocate(T_MPI(total_ny), stat=alloc_err)
     call check_alloc_err(alloc_err)
-    allocate(g1_total(total_ny), g2_total(total_ny), g3_total(total_ny), stat=alloc_err)
+    allocate(g1_total(total_ny), g2_total(total_ny), g3_total(total_ny), dynu_MPI(total_ny-1), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(h1_total(total_ny), h2_total(total_ny), h3_total(total_ny), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(tmp_uy_MPI(total_ny), tmp_uy1_MPI(total_ny), tmp_phi1_MPI(total_ny), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(single_phiout(total_ny, Nx), single_Tout(total_ny,Nx), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(phi1_MPI(total_ny), phi2_MPI(total_ny), V1_MPI(total_ny), V2_MPI(total_ny), stat=alloc_err)
+    call check_alloc_err(alloc_err)
+    allocate(tmp_K_phi_MPI(total_ny), tmp_K_T_MPI(total_ny), uxi_MPI(total_ny), stat=alloc_err)
     call check_alloc_err(alloc_err)
 end if
 
-allocate(phiout(Ny,Nx), Tout(Ny,Nx), stat=alloc_err)
-call check_alloc_err(alloc_err)
-
 call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
 ! Time integration
@@ -133,7 +144,7 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
     write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)"
     start = OMP_GET_WTIME()
     !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic)
-    do it = 1,1 ! kx loop
+    do it = 1,Nx ! kx loop
         ! Compute phi1 and T1
         if (proc_id == 0) then
             ! Need to pull all of the variables into local main memory.
@@ -226,67 +237,223 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
                 K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,&
                 g1_total, g2_total, g3_total,&
                 T_MPI)
-            do i = 1,total_ny
-                write(*,*) i, tmp_T_MPI(i)
-            end do
+
+            ! Compute v1 from phi1
+            call calc_vi_mod_MPI(tmp_uy_MPI, tmp_phi_MPI, kx(it), total_ny, g1_total, g2_total, g3_total)
+
+            ! Receive dyv1 and dyv2
+            call MPI_RECV(dyv1_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 57, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+            call MPI_RECV(dyv2_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 58, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+            ! Receive V1, V2, phi1, phi2
+            do otherproc = 1,num_procs-1
+                stind = otherproc * Ny + 1
+                call MPI_RECV(V1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 59, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+                call MPI_RECV(V2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 60, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+                call MPI_RECV(phi1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 61, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+                call MPI_RECV(phi2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 62, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+            end do
+            ! Copy this rank's own blocks into the gathered arrays.
+            V1_MPI(1:Ny) = V1(1:Ny, it)
+            V2_MPI(1:Ny) = V2(1:Ny, it)
+            phi1_MPI(1:Ny) = phi1(1:Ny, it)
+            phi2_MPI(1:Ny) = phi2(1:Ny, it)
+
+            ! Receive the last elements of h1, h2, h3 from the last processor.
+            call MPI_RECV(h1_end_MPI, 1, MPI_DOUBLE, num_procs-1, 63, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+            call MPI_RECV(h2_end_MPI, 1, MPI_DOUBLE, num_procs-1, 64, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+            call MPI_RECV(h3_end_MPI, 1, MPI_DOUBLE, num_procs-1, 65, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+            ! BOUNDARY CONDITIONS!
+            call update_bcs_mod_MPI(tmp_phi1_MPI,tmp_uy1_MPI, tmp_phi_MPI,tmp_uy_MPI,&
+                                    dyv1_T_it_MPI,dyv2_T_it_MPI,&
+                                    dyv1_B(it),dyv2_B(it),&
+                                    V1_MPI,V2_MPI,phi1_MPI,phi2_MPI,&
+                                    total_ny, h1_end_MPI, h2_end_MPI, h3_end_MPI)
+
+            tmp_phi_MPI = tmp_phi1_MPI
+            tmp_uy_MPI = tmp_uy1_MPI
+
+            !
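!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!
! The update_bcs_mod_MPI call above appears to apply the usual influence-
! matrix (superposition) correction; its body is not shown in this patch, so
! the following is an interpretation, not the routine's code. With V1, V2 the
! homogeneous velocity solutions from init_bc_MPI and dyv1_B/dyv2_B,
! dyv1_T/dyv2_T their wall slopes, constants c1, c2 are chosen so that the
! corrected v has zero slope at both walls:
!
!   det = dyv1_B*dyv2_T - dyv2_B*dyv1_T
!   c1  = (dyv_T*dyv2_B - dyv_B*dyv2_T)/det   ! dyv_B, dyv_T: wall slopes of v
!   c2  = (dyv_B*dyv1_T - dyv_T*dyv1_B)/det
!   v   = v + c1*V1 + c2*V2                   ! and phi likewise with phi1, phi2
!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::!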
Receive dynu from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + call MPI_RECV(dynu_MPI(stind), Ny, MPI_DOUBLE, otherproc, 66, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + dynu_MPI(1:Ny-1) = dynu(1:Ny-1) + + ! Implicit. + call calc_implicit_mod_MPI(tmp_K_phi_MPI,tmp_K_T_MPI, tmp_phi_MPI,tmp_T_MPI,& + kx(it), total_ny, dynu_MPI, g1_total, g2_total, g3_total) + + ! Receive h1,h2,h3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 67, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 68, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 69, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + h1_total(1:Ny) = h1 + h2_total(1:Ny) = h2 + h3_total(1:Ny) = h3 + + ! Compute u1 from v1 + if (kx(it) /= 0.0_dp) then + uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + + ! Send K1_phi, K1_T, phii, Ti, uyi, uxi back to each node. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_SEND(tmp_K_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 70, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_K_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 71, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 72, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 73, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 74, MPI_COMM_WORLD, mpierror) + call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 75, MPI_COMM_WORLD, mpierror) + end do + + ! Set local to main node variables. + K1_phi(1:Ny,it) = tmp_K_phi_MPI(1:Ny) + K1_T(1:Ny,it) = tmp_K_T_MPI(1:Ny) + phii(1:Ny,it) = tmp_phi_MPI(1:Ny) + Ti(1:Ny,it) = tmp_T_MPI(1:Ny) + uyi(1:Ny,it) = tmp_uy_MPI(1:Ny) + uxi(1:Ny,it) = uxi_MPI(1:Ny) + else if (proc_id == num_procs - 1) then ! 
Send data to main node - call MPI_SEND(phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1hat_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2hat_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K3hat_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1hat_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2hat_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K3hat_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2_phi(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2_T(1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) ! Send g1, g2, g3 to first node. call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send dyv1 and dyv2. + call MPI_SEND(dyv1_T(it), 1, MPI_DOUBLE, 0, 57, MPI_COMM_WORLD, mpierror) + call MPI_SEND(dyv2_T(it), 1, MPI_DOUBLE, 0, 58, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send h1,h2,h3 last elements. + call MPI_SEND(h1(Ny), 1, MPI_DOUBLE, 0, 63, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(Ny), 1, MPI_DOUBLE, 0, 64, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(Ny), 1, MPI_DOUBLE, 0, 65, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! 
Receive K1_phi, K1_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + else ! Send data to main node - call MPI_SEND(phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1hat_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2hat_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K3hat_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1hat_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2hat_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K3hat_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2_phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K1_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) - call MPI_SEND(K2_T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) - call MPI_SEND(T(1,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) ! Send g1, g2, g3 to first node. call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. 
+ call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K1_phi, K1_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) end if end do !$OMP END PARALLEL DO finish = OMP_GET_WTIME() write(*,*) " - stage 1 mid timing: ", finish-start, "(s)" + + + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) - ! if (proc_id == 0) then - ! open(unit=9010, file="P"//proc_id_str//"phiout_real.txt", action="write", status="unknown") - ! open(unit=9011, file="P"//proc_id_str//"phiout_im.txt", action="write", status="unknown") - ! do i=1,total_ny - ! do j=1,Nx - ! write (9010,*) REAL(Tout(i,j)) - ! write (9011,*) AIMAG(Tout(i,j)) - ! end do - ! end do - ! close(unit=9010) - ! close(unit=9011) - ! end if - write(*,*) "done writing phi!" + ! Compute K2hat + start = OMP_GET_WTIME() + call calc_explicit_MPI(2, proc_id, num_procs, proc_id_str) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(2) timing: ", finish-start, "(s)" + + open(unit=9010, file="P"//proc_id_str//"uxi_real_stage1.txt", action="write", status="unknown") + open(unit=9011, file="P"//proc_id_str//"uxi_im_stage1.txt", action="write", status="unknown") + do i=1,Ny + do j=1,Nx + write (9010,*) REAL(uxi(i,j)) + write (9011,*) AIMAG(uxi(i,j)) + end do + end do + close(unit=9010) + close(unit=9011) + write(*,*) "done writing uxi!" + if (time == t_final) then exit end if @@ -564,8 +731,8 @@ subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & FT = T_in(2:total_ny-1) + dt*(acoeffs(2,1)*k1_T_in + & & ahatcoeffs(3,1)*k1hat_T_in + & & ahatcoeffs(3,2)*k2hat_T_in) - FT(1) = FT(1) + kappa0*dt*aii*g1(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) - FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) + FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) phi_rhs(:,1) = real(Fphi) phi_rhs(:,2) = aimag(Fphi) @@ -593,8 +760,8 @@ subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & &ahatcoeffs(4,1)*k1hat_T_in + & &ahatcoeffs(4,2)*k2hat_T_in + & &ahatcoeffs(4,3)*k3hat_T_in) - FT(1) = FT(1) + kappa0*dt*aii*g1(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) - FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) + FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! 
b/c Ti(total_ny) = T(total_ny) phi_rhs(:,1) = real(Fphi) phi_rhs(:,2) = aimag(Fphi) @@ -616,6 +783,123 @@ subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & end subroutine calc_vari_mod_MPI !:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! +subroutine calc_vi_mod_MPI(vi, phiin, kx_it, total_ny, g1_total, g2_total, g3_total) +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: phiin +complex(C_DOUBLE_COMPLEX), dimension(:), intent(out) :: vi +real(dp), intent(in) :: kx_it +real(dp), allocatable, dimension(:,:) :: vi_rhs +real(dp), allocatable, dimension(:) :: dvi, dlvi, duvi +integer :: j +integer, intent(in) :: total_ny +real(dp), dimension(:), intent(in) :: g1_total, g2_total, g3_total + +allocate(vi_rhs(total_ny-2,2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(dvi(total_ny-2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(dlvi(total_ny-3), duvi(total_ny-3), stat=alloc_err) +call check_alloc_err(alloc_err) + +vi = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +vi_rhs = 0.0_dp +dvi = 0.0_dp +dlvi = 0.0_dp +duvi = 0.0_dp + +do j = 2,total_ny-1 + dvi(j-1) = -kx_it**2.0_dp + g2_total(j) +end do + +do j = 2,total_ny-2 + duvi(j-1) = g3_total(j) +end do + +do j = 3,total_ny-1 + dlvi(j-2) = g1_total(j) +end do + +vi_rhs(:,1) = real (phiin(2:total_ny-1)) +vi_rhs(:,2) = aimag(phiin(2:total_ny-1)) + +call dgtsv(total_ny-2, 2, dlvi, dvi, duvi, vi_rhs, total_ny-2, info) + +vi(1) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +vi(2:total_ny-1) = cmplx(vi_rhs(:,1), vi_rhs(:,2), kind=C_DOUBLE_COMPLEX) +vi(total_ny) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + +end subroutine calc_vi_mod_MPI + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine update_bcs_mod_MPI(phiout,vout, phiin,vin,dyv1_T_it,dyv2_T_it,dyv1_B_it,& + dyv2_B_it,V1_in, V2_in, phi1_in, phi2_in,& + total_ny, h1_end, h2_end, h3_end) + +integer, intent(in) :: total_ny +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: phiin, vin +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:), intent(out) :: phiout, vout +real(dp), dimension(2,2) :: C +real(dp) :: detC +complex(dp) :: c1, c2, c1t +complex(dp) :: dyV_T, dyV_B +real(dp), intent(in) :: dyv1_T_it,dyv2_T_it,dyv1_B_it,dyv2_B_it +real(dp), dimension(:), intent(in) :: V1_in, V2_in, phi1_in, phi2_in +real(dp), intent(in) :: h1_end, h2_end, h3_end +allocate(phiout(total_ny), vout(total_ny), stat=alloc_err) +call check_alloc_err(alloc_err) +phiout = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +vout = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +C = 0.0_dp + +C(1,1) = dyv1_T_it +C(1,2) = dyv2_T_it +C(2,1) = dyv1_B_it +C(2,2) = dyv2_B_it + +detC = C(1,1)*C(2,2) - C(1,2)*C(2,1) + +dyV_T = h1_end*vin(total_ny-2) + h2_end*vin(total_ny-1) + h3_end*vin(total_ny) +dyV_B = h1(1)*vin(1) + h2(1)*vin(2) + h3(1)*vin(3) + +! Need to negate b/c want to solve Cx = -c12. +c1 = -dyV_T +c2 = -dyV_B + +! Find c1 and c2. +if (detC == 0.0_dp) then + c1 = (0.0_dp, 0.0_dp) + c2 = (0.0_dp, 0.0_dp) +else + c1t = (C(2,2)*c1 - C(1,2)*c2) / detC + c2 = (C(1,1)*c2 - C(2,1)*c1) / detC + c1 = c1t +end if + +! Update uy and Phi. 
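For reference, the solve above is Cramer's rule on the 2x2 influence matrix. calc_vi_mod_MPI (which factors the tridiagonal d2y - kx**2 operator with LAPACK's dgtsv, carrying the real and imaginary parts as two right-hand-side columns) returns v with homogeneous wall values only, so its wall derivatives dyV_T and dyV_B are generally nonzero. V1, V2 (and phi1, phi2) are precomputed homogeneous solutions, and c1, c2 solve

    [ dyv1_T_it  dyv2_T_it ] [c1]   [ -dyV_T ]
    [ dyv1_B_it  dyv2_B_it ] [c2] = [ -dyV_B ]

so that the corrected field vin + c1*V1_in + c2*V2_in has zero wall derivative, presumably enforcing no-slip. The two lines below apply exactly that correction to v and phi.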
+vout = vin + c1*V1_in + c2*V2_in +phiout = phiin + c1*phi1_in + c2*phi2_in + +end subroutine update_bcs_mod_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +subroutine calc_implicit_mod_MPI(Kphi,KT, phiin,Tin, kx_it, total_ny, dynu_in, g1_total, g2_total, g3_total) + +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: phiin, Tin +complex(C_DOUBLE_COMPLEX), dimension(:), intent(out) :: Kphi, KT +real(dp), intent(in) :: kx_it +integer, intent(in) :: total_ny +real(dp), dimension(:), intent(in) :: dynu_in, g1_total, g2_total, g3_total + +Kphi = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +KT = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + +Kphi = nu0 *(-kx_it**2.0_dp*phiin + d2y_MPI(phiin, dynu_in, g1_total, g2_total, g3_total, total_ny)) +KT = kappa0*(-kx_it**2.0_dp*Tin + d2y_MPI(Tin, dynu_in, g1_total, g2_total, g3_total, total_ny)) + +end subroutine calc_implicit_mod_MPI + !:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! end module time_integrators_MPI From 56d86c45ae098d8eee512d81f8205ee3b640ac95 Mon Sep 17 00:00:00 2001 From: Michael Neuder Date: Fri, 30 Apr 2021 09:43:46 -0400 Subject: [PATCH 17/24] finished stage 2 --- global.f90 | 159 ++++++++++++++++++- mesh_pack.f90 | 6 +- time_integrators_MPI.f90 | 332 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 475 insertions(+), 22 deletions(-) diff --git a/global.f90 b/global.f90 index 96ca400..e0c750b 100755 --- a/global.f90 +++ b/global.f90 @@ -3,6 +3,7 @@ module global use fftw implicit none +include 'mpif.h' save ! Get double-precision type @@ -103,10 +104,9 @@ module global complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_phi, tmp_phi1, tmp_T, tmp_uy, tmp_uy1 complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_K_phi, tmp_K_T - - contains + subroutine global_params implicit none @@ -365,6 +365,73 @@ end function d1y_MPI !::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +function d1y_MPI2(vec, proc_id, num_procs) result(d1yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: vec +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: d1yvec +integer, intent(in) :: proc_id, num_procs +integer :: alloc_err, jj, n, i, mpierror +complex(C_DOUBLE_COMPLEX) :: prev_MPI, post_MPI + +n = size(vec) + +allocate(d1yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d1y." + stop +end if + +d1yvec = 0.0_dp + +if (proc_id == 0) then + ! Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 78, MPI_COMM_WORLD, mpierror) + ! Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 79, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + d1yvec(1) = h1(1)*vec(1) + h2(1)*vec(2) + h3(1)*vec(3) + do jj = 2,n-1 + d1yvec(jj) = h1(jj)*vec(jj-1) + h2(jj)*vec(jj) + h3(jj)*vec(jj+1) + end do + d1yvec(n) = h1(n)*vec(n-1) + h2(n)*vec(n) + h3(n)*post_MPI +else if (proc_id == num_procs - 1) then + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 78, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. + call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 79, MPI_COMM_WORLD, mpierror) + d1yvec(1) = h1(1)*prev_MPI + h2(1)*vec(1) + h3(1)*vec(2) + do jj = 2,n-1 + d1yvec(jj) = h1(jj)*vec(jj-1) + h2(jj)*vec(jj) + h3(jj)*vec(jj+1) + end do + d1yvec(n) = h1(n)*vec(n-2) + h2(n)*vec(n-1) + h3(n)*vec(n) +else + ! 
Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 78, MPI_COMM_WORLD, mpierror) + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 78, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. + call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 79, MPI_COMM_WORLD, mpierror) + ! Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 79, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + d1yvec(1) = h1(1)*prev_MPI + h2(1)*vec(1) + h3(1)*vec(2) + do jj = 2,n-1 + d1yvec(jj) = h1(jj)*vec(jj-1) + h2(jj)*vec(jj) + h3(jj)*vec(jj+1) + end do + d1yvec(n) = h1(n)*vec(n-1) + h2(n)*vec(n) + h3(n)*post_MPI +end if + + + +end function d1y_MPI2 + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + function d2y(vec) result(d2yvec) implicit none @@ -477,6 +544,94 @@ end function d2y_MPI !::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +function d2y_MPI2(vec, proc_id, num_procs) result(d2yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), intent(in) :: vec(:) +integer , intent(in) :: proc_id, num_procs +complex(C_DOUBLE_COMPLEX), allocatable :: d2yvec(:) +real(dp) :: d1, d2, d3, d4 +real(dp) :: ell1, ell2, ell3, ell4 +real(dp) :: dNm3, dNm2, dNm1, dN +real(dp) :: ellNm3, ellNm2, ellNm1, ellN +integer :: jj, n, mpierror +complex(C_DOUBLE_COMPLEX) :: prev_MPI, post_MPI + +n = size(vec) + +allocate(d2yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d2y." + stop +end if + +d2yvec = 0.0_dp + +d1 = -dynu(1)*(dynu(1)+dynu(2))*(dynu(1)+dynu(2)+dynu(3)) +d2 = dynu(1)*dynu(2)*(dynu(2) + dynu(3)) +d3 = -dynu(2)*dynu(3)*(dynu(1) + dynu(2)) +d4 = dynu(3)*(dynu(2)+dynu(3))*(dynu(1)+dynu(2)+dynu(3)) + +ell1 = -2.0_dp*(3.0_dp*dynu(1) + 2.0_dp*dynu(2) + dynu(3)) / d1 +ell2 = -2.0_dp*(2.0_dp*(dynu(1) + dynu(2)) + dynu(3)) / d2 +ell3 = -2.0_dp*(2.0_dp*dynu(1) + dynu(2) + dynu(3)) / d3 +ell4 = -2.0_dp*(2.0_dp*dynu(1) + dynu(2)) / d4 + +dN = dynu(Ny)*(dynu(Ny)+dynu(Ny-1))*(dynu(Ny)+dynu(Ny-1)+dynu(Ny-2)) +dNm1 = -dynu(Ny)*dynu(Ny-1)*(dynu(Ny-1)+dynu(Ny-2)) +dNm2 = dynu(Ny-1)*dynu(Ny-2)*(dynu(Ny)+dynu(Ny-1)) +dNm3 = -dynu(Ny-2)*(dynu(Ny-1)+dynu(Ny-2))*(dynu(Ny)+dynu(Ny-1)+dynu(Ny-2)) + +ellN = 2.0_dp*(3.0_dp*dynu(Ny) + 2.0_dp*dynu(Ny-1) + dynu(Ny-2)) / dN +ellNm1 = 2.0_dp*(2.0_dp*(dynu(Ny)+dynu(Ny-1)) + dynu(Ny-2)) / dNm1 +ellNm2 = 2.0_dp*(2.0_dp*dynu(Ny) + dynu(Ny-1) + dynu(Ny-2)) / dNm2 +ellNm3 = 2.0_dp*(2.0_dp*dynu(Ny) + dynu(Ny-1)) / dNm3 + +if (proc_id == 0) then + d2yvec(1) = ell1*vec(1) + ell2*vec(2) + ell3*vec(3) + ell4*vec(4) + ! Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 76, MPI_COMM_WORLD, mpierror) + ! Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 77, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + do jj = 2,n-1 + d2yvec(jj) = g1(jj)*vec(jj-1) + g2(jj)*vec(jj) + g3(jj)*vec(jj+1) + end do + d2yvec(n) = g1(n)*vec(n-1) + g2(n)*vec(n) + g3(n)*post_MPI +else if (proc_id == num_procs-1) then + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 76, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. 
+ call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 77, MPI_COMM_WORLD, mpierror)
+ d2yvec(1) = g1(1)*prev_MPI + g2(1)*vec(1) + g3(1)*vec(2)
+ do jj = 2,n-1
+ d2yvec(jj) = g1(jj)*vec(jj-1) + g2(jj)*vec(jj) + g3(jj)*vec(jj+1)
+ end do
+ d2yvec(n) = ellNm3*vec(n-3) + ellNm2*vec(n-2) + ellNm1*vec(n-1) + ellN*vec(n)
+else
+ ! Send nth element of vector
+ call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 76, MPI_COMM_WORLD, mpierror)
+ ! Receive previous element of vector
+ call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 76, &
+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ ! Send first element of vector.
+ call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 77, MPI_COMM_WORLD, mpierror)
+ ! Receive post element of vector
+ call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 77, &
+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ d2yvec(1) = g1(1)*prev_MPI + g2(1)*vec(1) + g3(1)*vec(2)
+ do jj = 2,n-1
+ d2yvec(jj) = g1(jj)*vec(jj-1) + g2(jj)*vec(jj) + g3(jj)*vec(jj+1)
+ end do
+ d2yvec(n) = g1(n)*vec(n-1) + g2(n)*vec(n) + g3(n)*post_MPI
+endif
+
+end function d2y_MPI2
+
+!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+
 subroutine check_alloc_err(err_mess)
 
 integer, intent(in) :: err_mess
diff --git a/mesh_pack.f90 b/mesh_pack.f90
index cec5b2d..965550c 100755
--- a/mesh_pack.f90
+++ b/mesh_pack.f90
@@ -1,7 +1,7 @@
 module mesh_pack
 
 use global
-include 'mpif.h'
+! include 'mpif.h'
 
 contains
 
@@ -107,6 +107,10 @@ subroutine dxdydz(dyj, xcoor,ycoor,zcoor)
 dyj(jj-1) = ycoor(jj) - ycoor(jj-1)
 end do
 
+! do jj=1,numy-1
+! write(*,*) jj, dyj(jj)
+! end do
+
 end subroutine dxdydz
 
 ! :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 9bbb35d..28752fa 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -8,6 +8,7 @@ module time_integrators_MPI
 use statistics
 use omp_lib
 
+
 integer :: it, jt, kkt
 integer :: info
 real(dp) :: time, dtmax, dtmin, dt_old, dt_ramp, dt_final
@@ -433,26 +434,324 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
 finish = OMP_GET_WTIME()
 write(*,*) " - stage 1 mid timing: ", finish-start, "(s)"
-
-
 call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 ! Compute K2hat
 start = OMP_GET_WTIME()
 call calc_explicit_MPI(2, proc_id, num_procs, proc_id_str)
 finish = OMP_GET_WTIME()
 write(*,*) " - calc_explicit(2) timing: ", finish-start, "(s)"
+ call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
+
+ !:::::::::::
+ ! STAGE 2 ::
+ !:::::::::::
+ start = OMP_GET_WTIME()
+ !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic)
+ do it = 1,Nx ! kx loop
+ ! Compute phi2 and T2
+ if (proc_id == 0) then
+ ! Need to pull all of the variables into local main memory.
+ phi_MPI(1:Ny-1) = phi(2:Ny,it)
+ K1hat_phi_MPI(1:Ny-1) = K1hat_phi(2:Ny,it)
+ K2hat_phi_MPI(1:Ny-1) = K2hat_phi(2:Ny,it)
+ K3hat_phi_MPI(1:Ny-1) = K3hat_phi(2:Ny,it)
+ K1hat_T_MPI(1:Ny-1) = K1hat_T(2:Ny,it)
+ K2hat_T_MPI(1:Ny-1) = K2hat_T(2:Ny,it)
+ K3hat_T_MPI(1:Ny-1) = K3hat_T(2:Ny,it)
+ K1_phi_MPI(1:Ny-1) = K1_phi(2:Ny,it)
+ K2_phi_MPI(1:Ny-1) = K2_phi(2:Ny,it)
+ K1_T_MPI(1:Ny-1) = K1_T(2:Ny,it)
+ K2_T_MPI(1:Ny-1) = K2_T(2:Ny,it)
+ T_MPI(1:Ny) = T(1:Ny,it)
+
+ ! Receive pieces from other nodes. 
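These tag-by-tag receives implement a gather by hand. For comparison, a hedged sketch of the same transfer as one collective call; this idealized form assumes every rank contributes exactly Ny entries, which is not quite true here (ranks 0 and num_procs-1 contribute Ny-1 boundary-trimmed rows, so MPI_GATHERV with per-rank counts would be the closer fit):

    call MPI_GATHER(phi(1,it), Ny, MPI_C_DOUBLE_COMPLEX, &
                    phi_MPI(1), Ny, MPI_C_DOUBLE_COMPLEX, &
                    0, MPI_COMM_WORLD, mpierror)

The explicit per-rank loop below keeps the offsets visible instead.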
+ do otherproc = 1,num_procs-2 + stind = otherproc * Ny + call MPI_RECV(phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + + ! Receive from last node. + stind = (num_procs-1) * Ny + call MPI_RECV(phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + + ! Receive g1,g2,g3 from all other nodes. 
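One portability note on the real-valued transfers here: MPI_DOUBLE is the C-binding datatype name, while the Fortran datatype matching real(dp) is MPI_DOUBLE_PRECISION. Many mpif.h headers define both handles, which is presumably why these calls work as written, but

    call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE_PRECISION, otherproc, 54, &
                  MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)

is the portable spelling. The stencil gathers below use MPI_DOUBLE throughout.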
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 54, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 55, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 56, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ g1_total(1:Ny) = g1
+ g2_total(1:Ny) = g2
+ g3_total(1:Ny) = g3
+
+ ! Call vari mod
+ call calc_vari_mod_MPI(tmp_phi_MPI, tmp_T_MPI, acoeffs(2,2), 2, total_ny,&
+ kx(it), phi_MPI,&
+ K1hat_phi_MPI,K2hat_phi_MPI,K3hat_phi_MPI,&
+ K1hat_T_MPI,K2hat_T_MPI,K3hat_T_MPI,&
+ K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,&
+ g1_total, g2_total, g3_total,&
+ T_MPI)
+
+ ! Compute v1 from phi1
+ call calc_vi_mod_MPI(tmp_uy_MPI, tmp_phi_MPI, kx(it), total_ny, g1_total, g2_total, g3_total)
+
+ ! Receive dyv1 and dyv2
+ call MPI_RECV(dyv1_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 57, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(dyv2_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 58, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+ ! Receive V1, V2, phi1, phi2
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_RECV(V1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 59, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(V2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 60, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(phi1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 61, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(phi2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 62, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ ! Set local variables to MPI.
+ V1_MPI(1:Ny) = V1(1:Ny, it)
+ V2_MPI(1:Ny) = V2(1:Ny, it)
+ phi1_MPI(1:Ny) = phi1(1:Ny, it)
+ phi2_MPI(1:Ny) = phi2(1:Ny, it)
+
+ ! Receive end of h1, h2, h3 for last processor.
+ call MPI_RECV(h1_end_MPI, 1, MPI_DOUBLE, num_procs-1, 63, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_end_MPI, 1, MPI_DOUBLE, num_procs-1, 64, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_end_MPI, 1, MPI_DOUBLE, num_procs-1, 65, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+ ! BOUNDARY CONDITIONS!
+ call update_bcs_mod_MPI(tmp_phi1_MPI,tmp_uy1_MPI, tmp_phi_MPI,tmp_uy_MPI,&
+ dyv1_T_it_MPI,dyv2_T_it_MPI,&
+ dyv1_B(it),dyv2_B(it),&
+ V1_MPI,V2_MPI,phi1_MPI,phi2_MPI,&
+ total_ny, h1_end_MPI, h2_end_MPI, h3_end_MPI)
+
+ tmp_phi_MPI = tmp_phi1_MPI
+ tmp_uy_MPI = tmp_uy1_MPI
+
+ ! Receive dynu from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny
+ call MPI_RECV(dynu_MPI(stind), Ny, MPI_DOUBLE, otherproc, 66, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ dynu_MPI(1:Ny-1) = dynu(1:Ny-1)
+
+ ! Implicit.
+ call calc_implicit_mod_MPI(tmp_K_phi_MPI,tmp_K_T_MPI, tmp_phi_MPI,tmp_T_MPI,&
+ kx(it), total_ny, dynu_MPI, g1_total, g2_total, g3_total)
+
+ ! Receive h1,h2,h3 from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 67, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 68, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 69, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ h1_total(1:Ny) = h1
+ h2_total(1:Ny) = h2
+ h3_total(1:Ny) = h3
+
+ ! 
Compute u1 from v1 + if (kx(it) /= 0.0_dp) then + uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + + ! Send K2_phi, K2_T, phii, Ti, uyi, uxi back to each node. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_SEND(tmp_K_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 70, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_K_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 71, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 72, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 73, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 74, MPI_COMM_WORLD, mpierror) + call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 75, MPI_COMM_WORLD, mpierror) + end do + + ! Set local to main node variables. + K2_phi(1:Ny,it) = tmp_K_phi_MPI(1:Ny) + K2_T(1:Ny,it) = tmp_K_T_MPI(1:Ny) + phii(1:Ny,it) = tmp_phi_MPI(1:Ny) + Ti(1:Ny,it) = tmp_T_MPI(1:Ny) + uyi(1:Ny,it) = tmp_uy_MPI(1:Ny) + uxi(1:Ny,it) = uxi_MPI(1:Ny) + + + else if (proc_id == num_procs - 1) then + ! Send data to main node + call MPI_SEND(phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send dyv1 and dyv2. + call MPI_SEND(dyv1_T(it), 1, MPI_DOUBLE, 0, 57, MPI_COMM_WORLD, mpierror) + call MPI_SEND(dyv2_T(it), 1, MPI_DOUBLE, 0, 58, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send h1,h2,h3 last elements. + call MPI_SEND(h1(Ny), 1, MPI_DOUBLE, 0, 63, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(Ny), 1, MPI_DOUBLE, 0, 64, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(Ny), 1, MPI_DOUBLE, 0, 65, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. 
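The uxi update in rank 0's branch above is just the continuity relation: with the single-mode ansatz exp(i*kx*x), incompressibility gives

    dx(ux) + dy(uy) = 0   =>   i*kx*ux + dy(uy) = 0   =>   ux = i*dy(uy)/kx

which is exactly uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, ...)/kx(it); the kx = 0 mean mode is pinned to zero (zero mean flow). Back in the last rank's branch, the dynu send follows: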
+ call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) - open(unit=9010, file="P"//proc_id_str//"uxi_real_stage1.txt", action="write", status="unknown") - open(unit=9011, file="P"//proc_id_str//"uxi_im_stage1.txt", action="write", status="unknown") + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K2_phi, K2_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + else + ! Send data to main node + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K2_phi, K2_T, phii, Ti, uyi, uxi from main node. 
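A caution that applies to all of these stage loops: the enclosing !$OMP PARALLEL DO means the blocking sends and receives can be issued from several threads at once, which is only defined behavior if MPI was initialized with full thread support. A sketch, not part of the patch:

    include 'mpif.h'
    integer :: provided, mpierror
    call MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, provided, mpierror)
    if (provided < MPI_THREAD_MULTIPLE) then
       write(*,*) "MPI_THREAD_MULTIPLE unavailable; run the kx loop single-threaded"
       call MPI_ABORT(MPI_COMM_WORLD, 1, mpierror)
    end if

Even then, because the tags (42-75) do not encode it, two concurrent kx iterations on the same rank pair could cross-match each other's messages; as written the loop is only safe with one thread per rank, or with it folded into the tag. The receives from the main node resume below.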
+ call MPI_RECV(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end if + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 2 mid timing: ", finish-start, "(s)" + ! Compute K3hat + start = OMP_GET_WTIME() + call calc_explicit_MPI(3, proc_id, num_procs, proc_id_str) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(3) timing: ", finish-start, "(s)" + + + open(unit=9010, file="P"//proc_id_str//"uyi_real_stage2.txt", action="write", status="unknown") + open(unit=9011, file="P"//proc_id_str//"uyi_im_stage2.txt", action="write", status="unknown") do i=1,Ny do j=1,Nx - write (9010,*) REAL(uxi(i,j)) - write (9011,*) AIMAG(uxi(i,j)) + write (9010,*) REAL(uyi(i,j)) + write (9011,*) AIMAG(uyi(i,j)) end do end do close(unit=9010) close(unit=9011) - write(*,*) "done writing uxi!" + write(*,*) "done writing uyi!" if (time == t_final) then exit @@ -508,7 +807,7 @@ subroutine calc_explicit_MPI(stage, proc_id, num_procs, proc_id_str) ! Compute dx(T) in Fourier space nlT (:,i) = kx(i)*Ti(:,i) ! Compute D2(ux) - nlphi(:,i) = -kx(i)**2.0_dp*uxi(:,i) + d2y(uxi(:,i)) + nlphi(:,i) = -kx(i)**2.0_dp*uxi(:,i) + d2y_MPI2(uxi(:,i), proc_id, num_procs) end do !$OMP END PARALLEL DO finish = OMP_GET_WTIME() @@ -551,7 +850,7 @@ subroutine calc_explicit_MPI(stage, proc_id, num_procs, proc_id_str) do i = 1,Nx ! Temperature tmp_T = Ti(:,i) - nlT(:,i) = uxi(:,i)*nlT(:,i) + uyi(:,i)*d1y(tmp_T) + nlT(:,i) = uxi(:,i)*nlT(:,i) + uyi(:,i)*d1y_MPI2(tmp_T, proc_id, num_procs) ! phi nlphi(:,i) = uxi(:,i)*phii(:,i) - uyi(:,i)*nlphi(:,i) end do @@ -685,13 +984,6 @@ subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & dphi(jt-1) = 1.0_dp - nu0 *dt*aii*(-kx_it**2.0_dp + g2_total(jt)) end do -! do i = 2,total_ny-1 -! write(*,*) i, ddT (i-1) -! end do - -! write(*,*) kappa0 ,dt , aii - - do jt = 2,total_ny-2 duT (jt-1) = -kappa0*dt*g3_total(jt)*aii duphi(jt-1) = -nu0 *dt*g3_total(jt)*aii @@ -734,6 +1026,10 @@ subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + ! do i = 1,total_ny-2 + ! write(*,*) i, k2hat_phi_in(i) + ! end do + phi_rhs(:,1) = real(Fphi) phi_rhs(:,2) = aimag(Fphi) @@ -829,8 +1125,6 @@ subroutine calc_vi_mod_MPI(vi, phiin, kx_it, total_ny, g1_total, g2_total, g3_to end subroutine calc_vi_mod_MPI -!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! 
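The d1y_MPI2/d2y_MPI2 calls now used in calc_explicit_MPI do a nearest-neighbor halo exchange in which interior ranks post blocking MPI_SENDs before the matching receives. One-element messages are normally delivered eagerly, so this works in practice, but a self-pairing MPI_SENDRECV cannot deadlock regardless of message size. A sketch for an interior rank (edge ranks would pass MPI_PROC_NULL as the missing neighbor):

    call MPI_SENDRECV(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 76, &
                      prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 76, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
    call MPI_SENDRECV(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 77, &
                      post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 77, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)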
-
 
 !:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 subroutine update_bcs_mod_MPI(phiout,vout, phiin,vin,dyv1_T_it,dyv2_T_it,dyv1_B_it,&
 dyv2_B_it,V1_in, V2_in, phi1_in, phi2_in,&
 total_ny, h1_end, h2_end, h3_end)

From 374b9f8eb9f93c9db3881d5bc1d266465aa17092 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Fri, 30 Apr 2021 10:04:00 -0400
Subject: [PATCH 18/24] finished stage 3

---
 time_integrators_MPI.f90 | 306 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 304 insertions(+), 2 deletions(-)

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 28752fa..3aaaaf8 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -734,15 +734,317 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
 !$OMP END PARALLEL DO
 finish = OMP_GET_WTIME()
 write(*,*) " - stage 2 mid timing: ", finish-start, "(s)"
+ call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 ! Compute K3hat
 start = OMP_GET_WTIME()
 call calc_explicit_MPI(3, proc_id, num_procs, proc_id_str)
 finish = OMP_GET_WTIME()
 write(*,*) " - calc_explicit(3) timing: ", finish-start, "(s)"
+ call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
+
+ !:::::::::::
+ ! STAGE 3 ::
+ !:::::::::::
+ start = OMP_GET_WTIME()
+ !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic)
+ do it = 1,Nx ! kx loop
+ ! Compute phi3 and T3
+ if (proc_id == 0) then
+ ! Need to pull all of the variables into local main memory.
+ phi_MPI(1:Ny-1) = phi(2:Ny,it)
+ K1hat_phi_MPI(1:Ny-1) = K1hat_phi(2:Ny,it)
+ K2hat_phi_MPI(1:Ny-1) = K2hat_phi(2:Ny,it)
+ K3hat_phi_MPI(1:Ny-1) = K3hat_phi(2:Ny,it)
+ K1hat_T_MPI(1:Ny-1) = K1hat_T(2:Ny,it)
+ K2hat_T_MPI(1:Ny-1) = K2hat_T(2:Ny,it)
+ K3hat_T_MPI(1:Ny-1) = K3hat_T(2:Ny,it)
+ K1_phi_MPI(1:Ny-1) = K1_phi(2:Ny,it)
+ K2_phi_MPI(1:Ny-1) = K2_phi(2:Ny,it)
+ K1_T_MPI(1:Ny-1) = K1_T(2:Ny,it)
+ K2_T_MPI(1:Ny-1) = K2_T(2:Ny,it)
+ T_MPI(1:Ny) = T(1:Ny,it)
+
+ ! Receive pieces from other nodes.
+ do otherproc = 1,num_procs-2
+ stind = otherproc * Ny
+ call MPI_RECV(phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K1hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K2hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K3hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K1hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K2hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K3hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K1_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K2_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K1_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(K2_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, &
+ otherproc, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+
+ ! Receive from last node. 
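The trimmed counts in this gather are consistent: rank 0 contributes Ny-1 interior rows, each of the num_procs-2 middle ranks contributes Ny, and the last rank (received just below) another Ny-1, giving

    (Ny-1) + (num_procs-2)*Ny + (Ny-1) = num_procs*Ny - 2 = total_ny - 2

rows, exactly the declared size of phi_MPI. The two missing rows are the wall points, which are set by the boundary conditions rather than solved for.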
+ stind = (num_procs-1) * Ny + call MPI_RECV(phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + + ! Receive g1,g2,g3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 54, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 55, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 56, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + ! Call vari mod + call calc_vari_mod_MPI(tmp_phi_MPI, tmp_T_MPI, acoeffs(3,3), 3, total_ny,& + kx(it), phi_MPI,& + K1hat_phi_MPI,K2hat_phi_MPI,K3hat_phi_MPI,& + K1hat_T_MPI,K2hat_T_MPI,K3hat_T_MPI,& + K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,& + g1_total, g2_total, g3_total,& + T_MPI) + + ! Compute v1 from phi1 + call calc_vi_mod_MPI(tmp_uy_MPI, tmp_phi_MPI, kx(it), total_ny, g1_total, g2_total, g3_total) + + ! Receive dyv1 and dyv2 + call MPI_RECV(dyv1_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 57, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(dyv2_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 58, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + ! Receive V1, V2, phi1, phi2 + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(V1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 59, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(V2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 60, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 61, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 62, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + ! Set local variables to MPI. + V1_MPI(1:Ny) = V1(1:Ny, it) + V2_MPI(1:Ny) = V2(1:Ny, it) + phi1_MPI(1:Ny) = phi1(1:Ny, it) + phi2_MPI(1:Ny) = phi2(1:Ny, it) + + ! Receive end of h1, h2, h3 for last processor. 
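h1_end_MPI, h2_end_MPI, and h3_end_MPI are the final row of the last rank's first-derivative stencil; update_bcs_mod_MPI uses them to form the top-wall derivative dyV_T = h1_end*vin(total_ny-2) + h2_end*vin(total_ny-1) + h3_end*vin(total_ny), while the bottom wall uses rank 0's own h1(1), h2(1), h3(1). The three scalar receives follow: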
+ call MPI_RECV(h1_end_MPI, 1, MPI_DOUBLE, num_procs-1, 63, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_end_MPI, 1, MPI_DOUBLE, num_procs-1, 64, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_end_MPI, 1, MPI_DOUBLE, num_procs-1, 65, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+ ! BOUNDARY CONDITIONS!
+ call update_bcs_mod_MPI(tmp_phi1_MPI,tmp_uy1_MPI, tmp_phi_MPI,tmp_uy_MPI,&
+ dyv1_T_it_MPI,dyv2_T_it_MPI,&
+ dyv1_B(it),dyv2_B(it),&
+ V1_MPI,V2_MPI,phi1_MPI,phi2_MPI,&
+ total_ny, h1_end_MPI, h2_end_MPI, h3_end_MPI)
+
+ tmp_phi_MPI = tmp_phi1_MPI
+ tmp_uy_MPI = tmp_uy1_MPI
+ ! Receive dynu from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny
+ call MPI_RECV(dynu_MPI(stind), Ny, MPI_DOUBLE, otherproc, 66, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ dynu_MPI(1:Ny-1) = dynu(1:Ny-1)
+
+ ! Implicit.
+ call calc_implicit_mod_MPI(tmp_K_phi_MPI,tmp_K_T_MPI, tmp_phi_MPI,tmp_T_MPI,&
+ kx(it), total_ny, dynu_MPI, g1_total, g2_total, g3_total)
+
+ ! Receive h1,h2,h3 from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 67, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 68, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 69, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ h1_total(1:Ny) = h1
+ h2_total(1:Ny) = h2
+ h3_total(1:Ny) = h3
+
+ ! Compute u1 from v1
+ if (kx(it) /= 0.0_dp) then
+ uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it)
+ else if (kx(it) == 0.0_dp) then
+ uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow!
+ end if
+
+ ! Send K3_phi, K3_T, phii, Ti, uyi, uxi back to each node.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_SEND(tmp_K_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 70, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_K_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 71, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 72, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 73, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 74, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 75, MPI_COMM_WORLD, mpierror)
+ end do
+
+ ! Set local to main node variables.
+ K3_phi(1:Ny,it) = tmp_K_phi_MPI(1:Ny)
+ K3_T(1:Ny,it) = tmp_K_T_MPI(1:Ny)
+ phii(1:Ny,it) = tmp_phi_MPI(1:Ny)
+ Ti(1:Ny,it) = tmp_T_MPI(1:Ny)
+ uyi(1:Ny,it) = tmp_uy_MPI(1:Ny)
+ uxi(1:Ny,it) = uxi_MPI(1:Ny)
+
+
+ else if (proc_id == num_procs - 1) then
+ ! 
Send data to main node + call MPI_SEND(phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send dyv1 and dyv2. + call MPI_SEND(dyv1_T(it), 1, MPI_DOUBLE, 0, 57, MPI_COMM_WORLD, mpierror) + call MPI_SEND(dyv2_T(it), 1, MPI_DOUBLE, 0, 58, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send h1,h2,h3 last elements. + call MPI_SEND(h1(Ny), 1, MPI_DOUBLE, 0, 63, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(Ny), 1, MPI_DOUBLE, 0, 64, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(Ny), 1, MPI_DOUBLE, 0, 65, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K3_phi, K3_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K3_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + else + ! 
Send data to main node + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K3_phi, K3_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K3_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end if + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 3 mid timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + ! 
+   ! Compute K4hat
+   start = OMP_GET_WTIME()
+   call calc_explicit_MPI(4, proc_id, num_procs, proc_id_str)
+   finish = OMP_GET_WTIME()
+   write(*,*) " - calc_explicit(4) timing: ", finish-start, "(s)"
+   call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
-   open(unit=9010, file="P"//proc_id_str//"uyi_real_stage2.txt", action="write", status="unknown")
-   open(unit=9011, file="P"//proc_id_str//"uyi_im_stage2.txt", action="write", status="unknown")
+   open(unit=9010, file="P"//proc_id_str//"uyi_real_stage3.txt", action="write", status="unknown")
+   open(unit=9011, file="P"//proc_id_str//"uyi_im_stage3.txt", action="write", status="unknown")
    do i=1,Ny
       do j=1,Nx
          write (9010,*) REAL(uyi(i,j))

From 826d2c2ea23c5f61ff9916daa8d8e0af39de38e0 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Fri, 30 Apr 2021 10:19:12 -0400
Subject: [PATCH 19/24] finished update sols

---
 time_integrators_MPI.f90 | 43 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 3aaaaf8..1980417 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -1043,17 +1043,50 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
    write(*,*) " - calc_explicit(4) timing: ", finish-start, "(s)"
    call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
-   open(unit=9010, file="P"//proc_id_str//"uyi_real_stage3.txt", action="write", status="unknown")
-   open(unit=9011, file="P"//proc_id_str//"uyi_im_stage3.txt", action="write", status="unknown")
+   ! UPDATE SOLUTIONS
+   if (proc_id == 0) then
+      ! Get phi
+      phi(2:Ny,:) = phi(2:Ny,:) + dt*(b(1)*(K1_phi(2:Ny,:) + K2hat_phi(2:Ny,:)) + &
+                  & b(2)*(K2_phi(2:Ny,:) + K3hat_phi(2:Ny,:)) + &
+                  & b(3)*(K3_phi(2:Ny,:) + K4hat_phi(2:Ny,:)))
+
+      ! Get temperature
+      T(2:Ny,:) = T(2:Ny,:) + dt*(b(1)*(K1_T(2:Ny,:) + K2hat_T(2:Ny,:)) + &
+                & b(2)*(K2_T(2:Ny,:) + K3hat_T(2:Ny,:)) + &
+                & b(3)*(K3_T(2:Ny,:) + K4hat_T(2:Ny,:)))
+   else if (proc_id == num_procs - 1) then
+      ! Get phi
+      phi(1:Ny-1,:) = phi(1:Ny-1,:) + dt*(b(1)*(K1_phi(1:Ny-1,:) + K2hat_phi(1:Ny-1,:)) + &
+                    & b(2)*(K2_phi(1:Ny-1,:) + K3hat_phi(1:Ny-1,:)) + &
+                    & b(3)*(K3_phi(1:Ny-1,:) + K4hat_phi(1:Ny-1,:)))
+
+      ! Get temperature
+      T(1:Ny-1,:) = T(1:Ny-1,:) + dt*(b(1)*(K1_T(1:Ny-1,:) + K2hat_T(1:Ny-1,:)) + &
+                  & b(2)*(K2_T(1:Ny-1,:) + K3hat_T(1:Ny-1,:)) + &
+                  & b(3)*(K3_T(1:Ny-1,:) + K4hat_T(1:Ny-1,:)))
+   else
+      ! Get phi
+      phi(1:Ny,:) = phi(1:Ny,:) + dt*(b(1)*(K1_phi(1:Ny,:) + K2hat_phi(1:Ny,:)) + &
+                  & b(2)*(K2_phi(1:Ny,:) + K3hat_phi(1:Ny,:)) + &
+                  & b(3)*(K3_phi(1:Ny,:) + K4hat_phi(1:Ny,:)))
+
+      ! Get temperature
+      T(1:Ny,:) = T(1:Ny,:) + dt*(b(1)*(K1_T(1:Ny,:) + K2hat_T(1:Ny,:)) + &
+                & b(2)*(K2_T(1:Ny,:) + K3hat_T(1:Ny,:)) + &
+                & b(3)*(K3_T(1:Ny,:) + K4hat_T(1:Ny,:)))
+   end if
+
+   open(unit=9010, file="P"//proc_id_str//"phi_real_update.txt", action="write", status="unknown")
+   open(unit=9011, file="P"//proc_id_str//"phi_im_update.txt", action="write", status="unknown")
    do i=1,Ny
       do j=1,Nx
-         write (9010,*) REAL(uyi(i,j))
-         write (9011,*) AIMAG(uyi(i,j))
+         write (9010,*) REAL(phi(i,j))
+         write (9011,*) AIMAG(phi(i,j))
       end do
    end do
    close(unit=9010)
    close(unit=9011)
-   write(*,*) "done writing uyi!"
+   write(*,*) "done writing phi!"
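
[For orientation: the UPDATE SOLUTIONS block above is the final assembly step of the IMEX-RK scheme, combining the implicit stages K_i and the explicit stages K-hat_{i+1} with the weights b, exactly as read off the code:]

    \[ T^{n+1} = T^{n} + \Delta t \left[ b_1 (K_1 + \hat{K}_2)
                                       + b_2 (K_2 + \hat{K}_3)
                                       + b_3 (K_3 + \hat{K}_4) \right] \]

[and identically for phi. The differing row ranges (2:Ny on rank 0, 1:Ny-1 on the last rank, 1:Ny in between) appear to skip the first and last global rows, i.e. the wall rows fixed by the boundary conditions; that reading is inferred from the slicing, not stated in the patch.]
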
 
    if (time == t_final) then
       exit

From de7636462650295e370ec1bc95032620ab2620c2 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Fri, 30 Apr 2021 12:15:52 -0400
Subject: [PATCH 20/24] finished ux uy updates

---
 time_integrators_MPI.f90 | 97 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 90 insertions(+), 7 deletions(-)

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 1980417..493a821 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -38,7 +38,7 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
 real(dp) :: start, finish
 real(dp) :: start_overall, finish_overall
 integer :: nthreads, myid, stind
-complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_phi_MPI, tmp_T_MPI, phi_MPI
+complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_phi_MPI, tmp_T_MPI, phi_MPI, phi_update_MPI
 complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_phi_MPI, K2hat_phi_MPI, K3hat_phi_MPI
 complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_T_MPI, K2hat_T_MPI, K3hat_T_MPI
 complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI
@@ -90,7 +90,7 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
 
 if (proc_id == 0) then
    total_ny = Ny * num_procs
-   allocate(tmp_phi_MPI(total_ny), tmp_T_MPI(total_ny), phi_MPI(total_ny-2), stat=alloc_err)
+   allocate(tmp_phi_MPI(total_ny), tmp_T_MPI(total_ny), phi_MPI(total_ny-2), phi_update_MPI(total_ny), stat=alloc_err)
    call check_alloc_err(alloc_err)
    allocate(K1hat_phi_MPI(total_ny-2), K2hat_phi_MPI(total_ny-2), K3hat_phi_MPI(total_ny-2), stat=alloc_err)
    call check_alloc_err(alloc_err)
@@ -1043,6 +1043,7 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
    write(*,*) " - calc_explicit(4) timing: ", finish-start, "(s)"
    call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
+   start = OMP_GET_WTIME()
    ! UPDATE SOLUTIONS
    if (proc_id == 0) then
       ! Get phi
@@ -1075,18 +1076,100 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
                 & b(2)*(K2_T(1:Ny,:) + K3hat_T(1:Ny,:)) + &
                 & b(3)*(K3_T(1:Ny,:) + K4hat_T(1:Ny,:)))
    end if
+
+   ! Get ux and uy
+   if (proc_id == 0) then
+      ! Receive g1,g2,g3 from all other nodes.
+      do otherproc = 1,num_procs-1
+         stind = otherproc * Ny + 1
+         call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 80, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+         call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 81, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+         call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 82, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+      end do
+      g1_total(1:Ny) = g1
+      g2_total(1:Ny) = g2
+      g3_total(1:Ny) = g3
+
+      ! Receive h1,h2,h3 from all other nodes.
+      do otherproc = 1,num_procs-1
+         stind = otherproc * Ny + 1
+         call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 83, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+         call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 84, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+         call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 85, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+      end do
+      h1_total(1:Ny) = h1
+      h2_total(1:Ny) = h2
+      h3_total(1:Ny) = h3
+   else
+      ! Send g1, g2, g3 to first node.
+      call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 80, MPI_COMM_WORLD, mpierror)
+      call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 81, MPI_COMM_WORLD, mpierror)
+      call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 82, MPI_COMM_WORLD, mpierror)
+
+      ! Send h1, h2, h3 to first node.
+      call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 83, MPI_COMM_WORLD, mpierror)
+      call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 84, MPI_COMM_WORLD, mpierror)
+      call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 85, MPI_COMM_WORLD, mpierror)
+   end if
+
+   !$OMP PARALLEL DO private(tmp_uy, it) schedule(dynamic)
+   do it = 1,Nx
+      ! Only call from single node.
+      if (proc_id == 0) then
+
+         do otherproc = 1,num_procs-1
+            stind = otherproc * Ny + 1
+            ! Receive phi
+            call MPI_RECV(phi_update_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, &
+                 otherproc, 86, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+         end do
+         ! Set local phi
+         phi_update_MPI(1:Ny) = phi(1:Ny,it)
+
+         ! Solve for v
+         call calc_vi_mod_MPI(tmp_uy_MPI, phi_update_MPI, kx(it), total_ny, g1_total, g2_total, g3_total)
+         ! Solve for u
+         if (kx(it) /= 0.0_dp) then
+            uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it)
+         else if (kx(it) == 0.0_dp) then
+            uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow!
+         end if
+
+         ! Send uy, ux back to each node.
+         do otherproc = 1,num_procs-1
+            stind = otherproc * Ny + 1
+            call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 87, MPI_COMM_WORLD, mpierror)
+            call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 88, MPI_COMM_WORLD, mpierror)
+         end do
+         ! Set local versions.
+         uy(1:Ny,it) = tmp_uy_MPI(1:Ny)
+         ux(1:Ny,it) = uxi_MPI(1:Ny)
+
+      else
+         ! Send phi
+         call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 86, MPI_COMM_WORLD, mpierror)
+
+         ! Receive uy, ux from main node.
+         call MPI_RECV(uy(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 87,&
+              MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+         call MPI_RECV(ux(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 88,&
+              MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+      end if
+   end do
+   !$OMP END PARALLEL DO
+   finish = OMP_GET_WTIME()
+   write(*,*) " - update sols timing: ", finish-start, "(s)"
 
-   open(unit=9010, file="P"//proc_id_str//"phi_real_update.txt", action="write", status="unknown")
-   open(unit=9011, file="P"//proc_id_str//"phi_im_update.txt", action="write", status="unknown")
+   open(unit=9010, file="P"//proc_id_str//"uy_real_update.txt", action="write", status="unknown")
+   open(unit=9011, file="P"//proc_id_str//"uy_im_update.txt", action="write", status="unknown")
    do i=1,Ny
       do j=1,Nx
-         write (9010,*) REAL(phi(i,j))
-         write (9011,*) AIMAG(phi(i,j))
+         write (9010,*) REAL(uy(i,j))
+         write (9011,*) AIMAG(uy(i,j))
       end do
    end do
    close(unit=9010)
    close(unit=9011)
-   write(*,*) "done writing phi!"
+   write(*,*) "done writing uy!"
 
    if (time == t_final) then
       exit

From 91dfedb5e99683bac66827fc77a57c436c9ecc6e Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Fri, 30 Apr 2021 13:27:32 -0400
Subject: [PATCH 21/24] finished MPI version.
---
 Makefile                 |  4 ++--
 input.data               |  4 ++--
 time_integrators_MPI.f90 | 33 +++++++++++++++++++++------------
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 6db4fb2..8054957 100755
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@ PROGRAMS = $(MAIN).exe
 all: $(PROGRAMS)
 
 $(PROGRAMS) : $(OBJECTS)
-	$(FC) -fopenmp $(LDFLAGS) -o $(PROGRAMS) $(OBJECTS) $(LIBFLAGS1) -llapack -lblas $(LIBFLAGS2) -lfftw3 -lm
+	$(FC) -fopenmp -fallow-argument-mismatch $(LDFLAGS) -o $(PROGRAMS) $(OBJECTS) $(LIBFLAGS1) -llapack -lblas $(LIBFLAGS2) -lfftw3 -lm
 
 fftw.o : fftw.f90
 	$(FC) $(FFLAGS) fftw.f90
@@ -54,7 +54,7 @@ time_integrators.o : time_integrators.f90
 	$(FC) -fopenmp $(FFLAGS) time_integrators.f90
 
 time_integrators_MPI.o : time_integrators_MPI.f90
-	$(FC) -fopenmp $(FFLAGS) time_integrators_MPI.f90
+	$(FC) -fopenmp -fallow-argument-mismatch $(FFLAGS) time_integrators_MPI.f90
 
 jacobians.o : jacobians.f90
 	$(FC) $(FFLAGS) jacobians.f90

diff --git a/input.data b/input.data
index cc633a4..0a17907 100755
--- a/input.data
+++ b/input.data
@@ -1,6 +1,6 @@
 7.0, 1.5585, 0.1
-5.0e+03, 1, 1.1
-5.0, 0.1
+5.0e+04, 1, 1.1
+10.0, 0.1
 -1.0, 1.0
 64, 48, 1
 0, 0, 0

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 493a821..905cfc0 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -1158,23 +1158,32 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
    !$OMP END PARALLEL DO
    finish = OMP_GET_WTIME()
    write(*,*) " - update sols timing: ", finish-start, "(s)"
+   call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
-   open(unit=9010, file="P"//proc_id_str//"uy_real_update.txt", action="write", status="unknown")
-   open(unit=9011, file="P"//proc_id_str//"uy_im_update.txt", action="write", status="unknown")
-   do i=1,Ny
-      do j=1,Nx
-         write (9010,*) REAL(uy(i,j))
-         write (9011,*) AIMAG(uy(i,j))
-      end do
-   end do
-   close(unit=9010)
-   close(unit=9011)
-   write(*,*) "done writing uy!"
+   ! open(unit=9010, file="P"//proc_id_str//"uy_real_update.txt", action="write", status="unknown")
+   ! open(unit=9011, file="P"//proc_id_str//"uy_im_update.txt", action="write", status="unknown")
+   ! do i=1,Ny
+   !    do j=1,Nx
+   !       write (9010,*) REAL(uy(i,j))
+   !       write (9011,*) AIMAG(uy(i,j))
+   !    end do
+   ! end do
+   ! close(unit=9010)
+   ! close(unit=9011)
+   ! write(*,*) "done writing uy!"
+
+
+   if (wvtk) then
+      call write_to_vtk(nti, .false., proc_id_str) ! false = Fourier space
+   end if
+
+   call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
+   write(*,*) "proc ", proc_id_str, " write vtk complete for t", time
+
 
    if (time == t_final) then
       exit
    end if
-
 end do

From 6a29dbebfe12ac96e48818478db16f41998c86de Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Wed, 5 May 2021 18:09:33 -0400
Subject: [PATCH 22/24] finished MPI version.
---
 time_integrators_MPI.f90 | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 905cfc0..c9c8b5f 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -1182,6 +1182,17 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
    write(*,*) "proc ", proc_id_str, " write vtk complete for t", time
 
    if (time == t_final) then
+      open(unit=9010, file="P"//proc_id_str//"T_real_update.txt", action="write", status="unknown")
+      open(unit=9011, file="P"//proc_id_str//"T_im_update.txt", action="write", status="unknown")
+      do i=1,Ny
+         do j=1,Nx
+            write (9010,*) REAL(T(i,j))
+            write (9011,*) AIMAG(T(i,j))
+         end do
+      end do
+      close(unit=9010)
+      close(unit=9011)
+      write(*,*) "done writing T!"
       exit
    end if

From 9bf9348f1a0816b60852f1fa7ca772a15a9dfd45 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Thu, 6 May 2021 10:28:12 -0400
Subject: [PATCH 23/24] small debug

---
 input.data               |  4 ++--
 time_integrators.f90     | 14 +++++++++++++-
 time_integrators_MPI.f90 | 23 ++++++++++++-----------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/input.data b/input.data
index 0a17907..4b2b6c0 100755
--- a/input.data
+++ b/input.data
@@ -1,6 +1,6 @@
 7.0, 1.5585, 0.1
-5.0e+04, 1, 1.1
-10.0, 0.1
+5.0e+03, 1, 1.1
+0.1, 0.1
 -1.0, 1.0
 64, 48, 1
 0, 0, 0

diff --git a/time_integrators.f90 b/time_integrators.f90
index 7cd60c9..f4728e9 100644
--- a/time_integrators.f90
+++ b/time_integrators.f90
@@ -29,7 +29,7 @@ subroutine imex_rk(vtk_print, save_nusselt)
 logical, optional, intent(in) :: save_nusselt
 
 integer :: nti
-integer :: nprint
+integer :: nprint, i, j
 logical :: wvtk
 real(dp) :: nusselt_num
 real(dp) :: start, finish
@@ -72,6 +72,18 @@ subroutine imex_rk(vtk_print, save_nusselt)
 ! Format for writing out single values.
 1000 format(E25.16E3)
 
+open(unit=9010, file="T_real_update.txt", action="write", status="unknown")
+open(unit=9011, file="T_im_update.txt", action="write", status="unknown")
+do i=1,Ny
+   do j=1,Nx
+      write (9010,*) REAL(T(i,j))
+      write (9011,*) AIMAG(T(i,j))
+   end do
+end do
+close(unit=9010)
+close(unit=9011)
+write(*,*) "done writing T!"
+
 ! Time integration
 do ! while (time < t_final)
    start_overall = OMP_GET_WTIME()

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index c9c8b5f..5b5c707 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -116,6 +116,18 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
 
 call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
+open(unit=9010, file="P"//proc_id_str//"T_real_update.txt", action="write", status="unknown")
+open(unit=9011, file="P"//proc_id_str//"T_im_update.txt", action="write", status="unknown")
+do i=1,Ny
+   do j=1,Nx
+      write (9010,*) REAL(T(i,j))
+      write (9011,*) AIMAG(T(i,j))
+   end do
+end do
+close(unit=9010)
+close(unit=9011)
+write(*,*) "done writing T!"
+
 ! Time integration
 do ! while (time < t_final)
    start_overall = OMP_GET_WTIME()
@@ -1182,17 +1194,6 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
    write(*,*) "proc ", proc_id_str, " write vtk complete for t", time
 
    if (time == t_final) then
-      open(unit=9010, file="P"//proc_id_str//"T_real_update.txt", action="write", status="unknown")
-      open(unit=9011, file="P"//proc_id_str//"T_im_update.txt", action="write", status="unknown")
-      do i=1,Ny
-         do j=1,Nx
-            write (9010,*) REAL(T(i,j))
-            write (9011,*) AIMAG(T(i,j))
-         end do
-      end do
-      close(unit=9010)
-      close(unit=9011)
-      write(*,*) "done writing T!"
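
[Patches 22 through 24 move this same eleven-line open/write/close block between call sites: MPI end-of-run, serial and MPI startup, and back to end-of-run. Factoring it into a helper would reduce each move to a one-line change. A sketch only; the subroutine name and interface are hypothetical, not part of the repository:]

    ! Hypothetical helper, not in the patch series. Mirrors the repeated dump
    ! blocks: one file for the real part, one for the imaginary part.
    subroutine dump_complex_field(field, prefix, proc_id_str)
       use, intrinsic :: iso_c_binding, only: C_DOUBLE_COMPLEX
       complex(C_DOUBLE_COMPLEX), intent(in) :: field(:,:)
       character(len=*), intent(in) :: prefix, proc_id_str
       integer :: i, j
       open(unit=9010, file="P"//proc_id_str//prefix//"_real_update.txt", action="write", status="unknown")
       open(unit=9011, file="P"//proc_id_str//prefix//"_im_update.txt", action="write", status="unknown")
       do i = 1,size(field,1)
          do j = 1,size(field,2)
             write (9010,*) REAL(field(i,j))
             write (9011,*) AIMAG(field(i,j))
          end do
       end do
       close(unit=9010)
       close(unit=9011)
    end subroutine dump_complex_field

[With such a helper, each debug dump collapses to a call like dump_complex_field(T, "T", proc_id_str).]
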
       exit
    end if

From 875100aff8747df737cbb2299a317dbd88fa01c9 Mon Sep 17 00:00:00 2001
From: Michael Neuder
Date: Thu, 6 May 2021 10:47:28 -0400
Subject: [PATCH 24/24] small debug

---
 time_integrators.f90     | 22 +++++++++++-----------
 time_integrators_MPI.f90 | 30 ++++++++++++++----------------
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/time_integrators.f90 b/time_integrators.f90
index f4728e9..99d4ff1 100644
--- a/time_integrators.f90
+++ b/time_integrators.f90
@@ -72,17 +72,6 @@ subroutine imex_rk(vtk_print, save_nusselt)
 ! Format for writing out single values.
 1000 format(E25.16E3)
 
-open(unit=9010, file="T_real_update.txt", action="write", status="unknown")
-open(unit=9011, file="T_im_update.txt", action="write", status="unknown")
-do i=1,Ny
-   do j=1,Nx
-      write (9010,*) REAL(T(i,j))
-      write (9011,*) AIMAG(T(i,j))
-   end do
-end do
-close(unit=9010)
-close(unit=9011)
-write(*,*) "done writing T!"
 
 ! Time integration
 do ! while (time < t_final)
@@ -274,6 +263,17 @@ subroutine imex_rk(vtk_print, save_nusselt)
 
    if (time == t_final) then
+      open(unit=9010, file="T_real_update.txt", action="write", status="unknown")
+      open(unit=9011, file="T_im_update.txt", action="write", status="unknown")
+      do i=1,Ny
+         do j=1,Nx
+            write (9010,*) REAL(T(i,j))
+            write (9011,*) AIMAG(T(i,j))
+         end do
+      end do
+      close(unit=9010)
+      close(unit=9011)
+      write(*,*) "done writing T!"
       exit
    end if

diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90
index 5b5c707..73e48ce 100644
--- a/time_integrators_MPI.f90
+++ b/time_integrators_MPI.f90
@@ -116,17 +116,6 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
 
 call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
 
-open(unit=9010, file="P"//proc_id_str//"T_real_update.txt", action="write", status="unknown")
-open(unit=9011, file="P"//proc_id_str//"T_im_update.txt", action="write", status="unknown")
-do i=1,Ny
-   do j=1,Nx
-      write (9010,*) REAL(T(i,j))
-      write (9011,*) AIMAG(T(i,j))
-   end do
-end do
-close(unit=9010)
-close(unit=9011)
-write(*,*) "done writing T!"
 
 ! Time integration
 do ! while (time < t_final)
    start_overall = OMP_GET_WTIME()
@@ -1184,7 +1173,20 @@ subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs)
    ! close(unit=9011)
    ! write(*,*) "done writing uy!"
 
-
+   if (time == t_final) then
+      open(unit=9010, file="P"//proc_id_str//"T_real_update.txt", action="write", status="unknown")
+      open(unit=9011, file="P"//proc_id_str//"T_im_update.txt", action="write", status="unknown")
+      do i=1,Ny
+         do j=1,Nx
+            write (9010,*) REAL(T(i,j))
+            write (9011,*) AIMAG(T(i,j))
+         end do
+      end do
+      close(unit=9010)
+      close(unit=9011)
+      write(*,*) "done writing T!"
+      exit
+   end if
 
    if (wvtk) then
       call write_to_vtk(nti, .false., proc_id_str) ! false = Fourier space
    end if
 
    call MPI_BARRIER(MPI_COMM_WORLD, mpierror)
    write(*,*) "proc ", proc_id_str, " write vtk complete for t", time
-
-   if (time == t_final) then
-      exit
-   end if
 end do
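
[A closing caveat on the finished MPI port: patch 20's ux/uy update issues MPI_SEND and MPI_RECV from inside an !$OMP PARALLEL DO region, as do the stage loops before it. That is only well-defined when the MPI library was initialized with full thread support, and the shared work buffers in that loop (tmp_uy_MPI, uxi_MPI, phi_update_MPI) would likewise need to be privatized. The -fallow-argument-mismatch flag added in patch 21 is presumably related housekeeping: gfortran 10+ otherwise rejects calling MPI_SEND with buffers of differing types through the implicit interface. A minimal sketch of the thread-support half, assuming MPI startup lives in the main program, which this series does not show:]

    ! Sketch only: request a thread-safe MPI before OpenMP threads make MPI calls.
    ! Assumes 'use mpi' (or include 'mpif.h') is already in scope.
    integer :: provided, mpierror
    call MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, provided, mpierror)
    if (provided < MPI_THREAD_MULTIPLE) then
       write(*,*) "MPI library lacks MPI_THREAD_MULTIPLE support"
       call MPI_ABORT(MPI_COMM_WORLD, 1, mpierror)
    end if
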