diff --git a/Makefile b/Makefile index 684c2f9..8054957 100755 --- a/Makefile +++ b/Makefile @@ -1,20 +1,21 @@ -FC=gfortran +FC=mpif90 FFLAGS= -c -O3 LIBFLAGS2 = -L/usr/local/fftw/lib LDFLAGS = -I/usr/local/fftw/include -#MAIN = Ra_loop -#MAIN = Ra_loop_no_opt -MAIN = time_loop +# MAIN = Ra_loop +# MAIN = Ra_loop_no_opt +# MAIN = time_loop +MAIN = time_loop_MPI -OBJECTS = fftw.o global.o allocate_vars.o precmod.o stringmod.o write_pack.o interpolation_pack.o mesh_pack.o imod.o bc_setup.o statistics.o time_integrators.o jacobians.o gmres_pack.o nonlinear_solvers.o $(MAIN).o +OBJECTS = fftw.o global.o allocate_vars.o precmod.o stringmod.o write_pack.o interpolation_pack.o mesh_pack.o imod.o bc_setup.o statistics.o time_integrators.o time_integrators_MPI.o jacobians.o gmres_pack.o nonlinear_solvers.o $(MAIN).o PROGRAMS = $(MAIN).exe all: $(PROGRAMS) $(PROGRAMS) : $(OBJECTS) - $(FC) -fopenmp $(LDFLAGS) -o $(PROGRAMS) $(OBJECTS) $(LIBFLAGS1) -llapack -lblas $(LIBFLAGS2) -lfftw3 -lm + $(FC) -fopenmp -fallow-argument-mismatch $(LDFLAGS) -o $(PROGRAMS) $(OBJECTS) $(LIBFLAGS1) -llapack -lblas $(LIBFLAGS2) -lfftw3 -lm fftw.o : fftw.f90 $(FC) $(FFLAGS) fftw.f90 @@ -52,6 +53,9 @@ statistics.o : statistics.f90 time_integrators.o : time_integrators.f90 $(FC) -fopenmp $(FFLAGS) time_integrators.f90 +time_integrators_MPI.o : time_integrators_MPI.f90 + $(FC) -fopenmp -fallow-argument-mismatch $(FFLAGS) time_integrators_MPI.f90 + jacobians.o : jacobians.f90 $(FC) $(FFLAGS) jacobians.f90 diff --git a/bc_setup.f90 b/bc_setup.f90 index 10e412d..0c127eb 100755 --- a/bc_setup.f90 +++ b/bc_setup.f90 @@ -2,8 +2,10 @@ module bc_setup use global use write_pack +use mesh_pack implicit none +! include 'mpif.h' contains @@ -138,4 +140,220 @@ subroutine init_bc(aii) end subroutine init_bc +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +subroutine init_bc_MPI(aii, proc_id, num_procs, proc_id_str) + +implicit none + +real(dp), intent(in) :: aii +integer, intent(in) :: proc_id, num_procs +character(3), optional, intent(in) :: proc_id_str +real(dp) :: pnu, qnu +real(dp) :: wavex + +real(dp), allocatable, dimension(:) :: d, dl, du +real(dp), allocatable, dimension(:) :: phi1_b, phi1_t +real(dp), allocatable, dimension(:) :: g1_total, g2_total, g3_total +real(dp), allocatable, dimension(:) :: recv_col, recv_col2, last_row_recv +real(dp), allocatable, dimension(:) :: phi2_b, phi2_t +real(dp), allocatable, dimension(:,:) :: Fphi12, FV12 + + +integer :: ii, jj, i, total_ny, otherproc, start +integer :: info, mpierror + +! Only do allocation on main node. +if(proc_id == 0) then + total_ny = Ny * num_procs + + ! Allocate local variables + allocate(Fphi12(total_ny-2,2), FV12(total_ny-2,2), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(d(total_ny-2), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(dl(total_ny-3), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(du(total_ny-3), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(phi1_b(Nx), phi1_t(Nx), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(phi2_b(Nx), phi2_t(Nx), stat=alloc_err) + call check_alloc_err(alloc_err) + + ! Need to pull all of g1, g2, g3 onto main node for calculations. + allocate(g1_total(total_ny), g2_total(total_ny), g3_total(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) + + ! Receive g1,g2,g3 from all other nodes. 
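+   ! (Row-block layout assumed throughout this routine: rank p owns Ny grid
+   !  rows, so its metric coefficients are gathered into
+   !  g1_total/g2_total/g3_total(p*Ny+1 : (p+1)*Ny) below, with rank 0's own
+   !  block copied directly into the first Ny entries afterwards.)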
+ do otherproc = 1,num_procs-1 + start = otherproc * Ny + 1 + call MPI_RECV(g1_total(start), Ny, MPI_DOUBLE, otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(start), Ny, MPI_DOUBLE, otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(start), Ny, MPI_DOUBLE, otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + Fphi12 = 0.0_dp + FV12 = 0.0_dp + d = 0.0_dp + dl = 0.0_dp + du = 0.0_dp + phi1_b = 1.0_dp + phi1_t = 0.0_dp + phi2_b = 0.0_dp + phi2_t = 1.0_dp + + phi1_b(1) = 0.0_dp + phi2_t(1) = 0.0_dp + + ! Compute phi1 and phi2 from D2 phi_i = 0 where + ! D2 = -(alpha*kx)^2 + dy^2 and phi1 satisfies the BCs + ! phi1 = 1 at the bottom wall and phi1 = 0 at the top wall. + ! phi2 satisfies phi2 = 0 at the bottom wall and phi2 = 1 + ! at the top wall. + + ! A comment on how these arrays are formed. Since we are only + ! dealing with Dirichlet conditions at the walls, we only need arrays of size + ! Ny-2, i.e. we don't need the values at the walls. Thus, in the calculations + ! below, F(1) would correspond to point number 2 and F(Ny-2) would correspond + ! to point Ny-1. + + ! Some parameters + pnu = nu0*dt*aii + + do ii = 1,Nx + + Fphi12 = 0.0_dp + + if (abs(kx(ii)/alpha) > Nf/2) then + phi1_b(ii) = 0.0_dp + phi2_t(ii) = 0.0_dp + end if + + wavex = kx(ii) + + qnu = 1.0_dp + pnu*wavex**2.0_dp + + do jj = 2,total_ny-1 + d(jj-1) = qnu - pnu*g2_total(jj) + end do + + do jj = 2,total_ny-2 + du(jj-1) = -pnu*g3_total(jj) + end do + + do jj = 3,total_ny-1 + dl(jj-2) = -pnu*g1_total(jj) + end do + + Fphi12(1,1) = pnu*g1_total(2)*phi1_b(ii) + Fphi12(total_ny-2,2) = pnu*g3_total(total_ny-1)*phi2_t(ii) + + ! Solve the system Aphi phi = Fphi + call dgtsv(total_ny-2, 2, dl, d, du, Fphi12, total_ny-2, info) + + ! Put phi1 and phi2 together in main node. + phi1(1,ii) = phi1_b(ii) + phi1(2:Ny,ii) = Fphi12(1:Ny-1,1) + phi2(1,ii) = 0.0_dp + phi2(2:Ny,ii) = Fphi12(1:Ny-1,2) + + ! Send phi1 and phi2 columns to each node. + do otherproc = 1,num_procs-1 + start = otherproc * Ny + call MPI_SEND(Fphi12(start,1), Ny, MPI_DOUBLE, otherproc, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(Fphi12(start,2), Ny, MPI_DOUBLE, otherproc, 46, MPI_COMM_WORLD, mpierror) + end do + + ! Getting V1 and V2 initialized. + FV12(:,1) = Fphi12(:,1) + FV12(:,2) = Fphi12(:,2) + + do jj = 2,total_ny-1 + d(jj-1) = -wavex**2.0_dp + g2_total(jj) + end do + + do jj = 2,total_ny-2 + du(jj-1) = g3_total(jj) + end do + + do jj = 3,total_ny-1 + dl(jj-2) = g1_total(jj) + end do + + call dgtsv(total_ny-2, 2, dl, d, du, FV12, total_ny-2, info) + + ! Place V1, V2 in main node memory. + V1(2:Ny,ii) = FV12(1:Ny-1,1) + V2(2:Ny,ii) = FV12(1:Ny-1,2) + + ! Send V1 and V1 columns to each node. + do otherproc = 1,num_procs-1 + start = otherproc * Ny + call MPI_SEND(FV12(start,1), Ny, MPI_DOUBLE, otherproc, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(FV12(start,2), Ny, MPI_DOUBLE, otherproc, 49, MPI_COMM_WORLD, mpierror) + end do + + end do + + ! Send last row to final proc. + call MPI_SEND(phi2_t(1), Nx, MPI_DOUBLE, num_procs-1, 47, MPI_COMM_WORLD, mpierror) + +else + ! Send g1, g2, g3 to first node. 
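+   ! These blocking sends pair with the tag-42/43/44 receives posted by
+   ! rank 0 above; every non-root rank contributes its local Ny-point block
+   ! of the metric coefficients.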
+ call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 44, MPI_COMM_WORLD, mpierror) + + allocate(recv_col(Ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(recv_col2(Ny), stat=alloc_err) + call check_alloc_err(alloc_err) + + ! Receive cols of phi1 and phi2. + do ii = 1,Nx + call MPI_RECV(recv_col, Ny, MPI_DOUBLE, 0, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(recv_col2, Ny, MPI_DOUBLE, 0, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + phi1(1:Ny, ii) = recv_col + phi2(1:Ny, ii) = recv_col2 + end do + + ! Receive cols of V1 and V2. + do ii = 1,Nx + call MPI_RECV(recv_col, Ny, MPI_DOUBLE, 0, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(recv_col2, Ny, MPI_DOUBLE, 0, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + V1(1:Ny, ii) = recv_col + V2(1:Ny, ii) = recv_col2 + end do + + ! Handle boundary of last processor. + if (proc_id == num_procs - 1) then + allocate(last_row_recv(Nx), stat=alloc_err) + call check_alloc_err(alloc_err) + call MPI_RECV(last_row_recv, Nx, MPI_DOUBLE, 0, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + phi1(Ny,1:Nx) = 0.0_dp + phi2(Ny,1:Nx) = last_row_recv + V1(Ny, 1:Nx) = 0.0_dp + V2(Ny, 1:Nx) = 0.0_dp + end if +end if + +! Calculate wall derivative on first and last processor. +if (proc_id == 0) then + do ii = 1,Nx + dyv1_B(ii) = h1(1)*V1(1,ii) + h2(1)*V1(2,ii) + h3(1)*V1(3,ii) + dyv2_B(ii) = h1(1)*V2(1,ii) + h2(1)*V2(2,ii) + h3(1)*V2(3,ii) + end do +else if (proc_id == num_procs - 1) then + do ii = 1,Nx + dyv1_T(ii) = h1(Ny)*V1(Ny-2,ii) + h2(Ny)*V1(Ny-1,ii) + h3(Ny)*V1(Ny,ii) + dyv2_T(ii) = h1(Ny)*V2(Ny-2,ii) + h2(Ny)*V2(Ny-1,ii) + h3(Ny)*V2(Ny,ii) + end do +end if + +end subroutine init_bc_MPI + end module bc_setup diff --git a/global.f90 b/global.f90 index e94d8d8..e0c750b 100755 --- a/global.f90 +++ b/global.f90 @@ -3,6 +3,7 @@ module global use fftw implicit none +include 'mpif.h' save ! Get double-precision type @@ -103,10 +104,9 @@ module global complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_phi, tmp_phi1, tmp_T, tmp_uy, tmp_uy1 complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_K_phi, tmp_K_T - - contains + subroutine global_params implicit none @@ -314,7 +314,7 @@ function d1y(vec) result(d1yvec) complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: vec complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: d1yvec -integer :: alloc_err, jj, n +integer :: alloc_err, jj, n, i n = size(vec) @@ -336,6 +336,102 @@ end function d1y !::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +function d1y_MPI(vec, h1_total, h2_total, h3_total) result(d1yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: vec +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: d1yvec +real(dp), dimension(:), intent(in) :: h1_total, h2_total, h3_total +integer :: alloc_err, jj, n, i + +n = size(vec) + +allocate(d1yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d1y." 
+ stop +end if + +d1yvec = 0.0_dp + +d1yvec(1) = h1_total(1)*vec(1) + h2_total(1)*vec(2) + h3_total(1)*vec(3) +do jj = 2,n-1 + d1yvec(jj) = h1_total(jj)*vec(jj-1) + h2_total(jj)*vec(jj) + h3_total(jj)*vec(jj+1) +end do +d1yvec(n) = h1_total(n)*vec(n-2) + h2_total(n)*vec(n-1) + h3_total(n)*vec(n) + +end function d1y_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +function d1y_MPI2(vec, proc_id, num_procs) result(d1yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: vec +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: d1yvec +integer, intent(in) :: proc_id, num_procs +integer :: alloc_err, jj, n, i, mpierror +complex(C_DOUBLE_COMPLEX) :: prev_MPI, post_MPI + +n = size(vec) + +allocate(d1yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d1y." + stop +end if + +d1yvec = 0.0_dp + +if (proc_id == 0) then + ! Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 78, MPI_COMM_WORLD, mpierror) + ! Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 79, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + d1yvec(1) = h1(1)*vec(1) + h2(1)*vec(2) + h3(1)*vec(3) + do jj = 2,n-1 + d1yvec(jj) = h1(jj)*vec(jj-1) + h2(jj)*vec(jj) + h3(jj)*vec(jj+1) + end do + d1yvec(n) = h1(n)*vec(n-1) + h2(n)*vec(n) + h3(n)*post_MPI +else if (proc_id == num_procs - 1) then + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 78, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. + call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 79, MPI_COMM_WORLD, mpierror) + d1yvec(1) = h1(1)*prev_MPI + h2(1)*vec(1) + h3(1)*vec(2) + do jj = 2,n-1 + d1yvec(jj) = h1(jj)*vec(jj-1) + h2(jj)*vec(jj) + h3(jj)*vec(jj+1) + end do + d1yvec(n) = h1(n)*vec(n-2) + h2(n)*vec(n-1) + h3(n)*vec(n) +else + ! Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 78, MPI_COMM_WORLD, mpierror) + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 78, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. + call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 79, MPI_COMM_WORLD, mpierror) + ! 
Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 79, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + d1yvec(1) = h1(1)*prev_MPI + h2(1)*vec(1) + h3(1)*vec(2) + do jj = 2,n-1 + d1yvec(jj) = h1(jj)*vec(jj-1) + h2(jj)*vec(jj) + h3(jj)*vec(jj+1) + end do + d1yvec(n) = h1(n)*vec(n-1) + h2(n)*vec(n) + h3(n)*post_MPI +end if + + + +end function d1y_MPI2 + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + function d2y(vec) result(d2yvec) implicit none @@ -389,6 +485,153 @@ function d2y(vec) result(d2yvec) end function d2y +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +function d2y_MPI(vec, dynu_in, g1_total, g2_total, g3_total, total_ny) result(d2yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), intent(in) :: vec(:) +complex(C_DOUBLE_COMPLEX), allocatable :: d2yvec(:) +real(dp), dimension(:), intent(in) :: dynu_in, g1_total, g2_total, g3_total +integer, intent(in) :: total_ny +real(dp) :: d1, d2, d3, d4 +real(dp) :: ell1, ell2, ell3, ell4 +real(dp) :: dNm3, dNm2, dNm1, dN +real(dp) :: ellNm3, ellNm2, ellNm1, ellN +integer :: jj, n + +n = size(vec) + +allocate(d2yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d2y." + stop +end if + +d2yvec = 0.0_dp + +d1 = -dynu_in(1)*(dynu_in(1)+dynu_in(2))*(dynu_in(1)+dynu_in(2)+dynu_in(3)) +d2 = dynu_in(1)*dynu_in(2)*(dynu_in(2) + dynu_in(3)) +d3 = -dynu_in(2)*dynu_in(3)*(dynu_in(1) + dynu_in(2)) +d4 = dynu_in(3)*(dynu_in(2)+dynu_in(3))*(dynu_in(1)+dynu_in(2)+dynu_in(3)) + +ell1 = -2.0_dp*(3.0_dp*dynu_in(1) + 2.0_dp*dynu_in(2) + dynu_in(3)) / d1 +ell2 = -2.0_dp*(2.0_dp*(dynu_in(1) + dynu_in(2)) + dynu_in(3)) / d2 +ell3 = -2.0_dp*(2.0_dp*dynu_in(1) + dynu_in(2) + dynu_in(3)) / d3 +ell4 = -2.0_dp*(2.0_dp*dynu_in(1) + dynu_in(2)) / d4 + +dN = dynu_in(total_ny-1)*(dynu_in(total_ny-1)+dynu_in(total_ny-2))*(dynu_in(total_ny-1)+dynu_in(total_ny-2)+dynu_in(total_ny-3)) +dNm1 = -dynu_in(total_ny-1)*dynu_in(total_ny-2)*(dynu_in(total_ny-2)+dynu_in(total_ny-3)) +dNm2 = dynu_in(total_ny-2)*dynu_in(total_ny-3)*(dynu_in(total_ny-1)+dynu_in(total_ny-2)) +dNm3 = -dynu_in(total_ny-3)*(dynu_in(total_ny-2)+dynu_in(total_ny-3))*(dynu_in(total_ny-1)+dynu_in(total_ny-2)+dynu_in(total_ny-3)) + +ellN = 2.0_dp*(3.0_dp*dynu_in(total_ny-1) + 2.0_dp*dynu_in(total_ny-2) + dynu_in(total_ny-3)) / dN +ellNm1 = 2.0_dp*(2.0_dp*(dynu_in(total_ny-1)+dynu_in(total_ny-2)) + dynu_in(total_ny-3)) / dNm1 +ellNm2 = 2.0_dp*(2.0_dp*dynu_in(total_ny-1) + dynu_in(total_ny-2) + dynu_in(total_ny-3)) / dNm2 +ellNm3 = 2.0_dp*(2.0_dp*dynu_in(total_ny-1) + dynu_in(total_ny-2)) / dNm3 + +!d2yvec(1) = g1(1)*vec(1) + g2(1)*vec(2) + g3(1)*vec(3) +d2yvec(1) = ell1*vec(1) + ell2*vec(2) + ell3*vec(3) + ell4*vec(4) +do jj = 2,n-1 + d2yvec(jj) = g1_total(jj)*vec(jj-1) + g2_total(jj)*vec(jj) + g3_total(jj)*vec(jj+1) +end do +!d2yvec(n) = g1(n)*vec(n-2) + g2(n)*vec(n-1) + g3(n)*vec(n) +d2yvec(n) = ellNm3*vec(n-3) + ellNm2*vec(n-2) + & + &ellNm1*vec(n-1) + ellN*vec(n) + +end function d2y_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +function d2y_MPI2(vec, proc_id, num_procs) result(d2yvec) + +implicit none + +complex(C_DOUBLE_COMPLEX), intent(in) :: vec(:) +integer , intent(in) :: proc_id, num_procs +complex(C_DOUBLE_COMPLEX), allocatable :: d2yvec(:) +real(dp) :: d1, d2, d3, d4 +real(dp) :: ell1, ell2, ell3, ell4 +real(dp) :: dNm3, dNm2, dNm1, dN +real(dp) :: ellNm3, ellNm2, ellNm1, ellN +integer :: jj, n, mpierror 
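+! prev_MPI / post_MPI hold the one-point halo exchanged with the neighbouring
+! ranks below (tags 76/77), so the centred stencil can still be applied at
+! the block edges; one-sided stencils are only needed at the two physical
+! walls.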
+complex(C_DOUBLE_COMPLEX) :: prev_MPI, post_MPI + +n = size(vec) + +allocate(d2yvec(n), stat=alloc_err) +if (alloc_err /= 0) then + write(*,*) "ERROR: Allocation problem in d2y." + stop +end if + +d2yvec = 0.0_dp + +d1 = -dynu(1)*(dynu(1)+dynu(2))*(dynu(1)+dynu(2)+dynu(3)) +d2 = dynu(1)*dynu(2)*(dynu(2) + dynu(3)) +d3 = -dynu(2)*dynu(3)*(dynu(1) + dynu(2)) +d4 = dynu(3)*(dynu(2)+dynu(3))*(dynu(1)+dynu(2)+dynu(3)) + +ell1 = -2.0_dp*(3.0_dp*dynu(1) + 2.0_dp*dynu(2) + dynu(3)) / d1 +ell2 = -2.0_dp*(2.0_dp*(dynu(1) + dynu(2)) + dynu(3)) / d2 +ell3 = -2.0_dp*(2.0_dp*dynu(1) + dynu(2) + dynu(3)) / d3 +ell4 = -2.0_dp*(2.0_dp*dynu(1) + dynu(2)) / d4 + +dN = dynu(Ny)*(dynu(Ny)+dynu(Ny-1))*(dynu(Ny)+dynu(Ny-1)+dynu(Ny-2)) +dNm1 = -dynu(Ny)*dynu(Ny-1)*(dynu(Ny-1)+dynu(Ny-2)) +dNm2 = dynu(Ny-1)*dynu(Ny-2)*(dynu(Ny)+dynu(Ny-1)) +dNm3 = -dynu(Ny-2)*(dynu(Ny-1)+dynu(Ny-2))*(dynu(Ny)+dynu(Ny-1)+dynu(Ny-2)) + +ellN = 2.0_dp*(3.0_dp*dynu(Ny) + 2.0_dp*dynu(Ny-1) + dynu(Ny-2)) / dN +ellNm1 = 2.0_dp*(2.0_dp*(dynu(Ny)+dynu(Ny-1)) + dynu(Ny-2)) / dNm1 +ellNm2 = 2.0_dp*(2.0_dp*dynu(Ny) + dynu(Ny-1) + dynu(Ny-2)) / dNm2 +ellNm3 = 2.0_dp*(2.0_dp*dynu(Ny) + dynu(Ny-1)) / dNm3 + +if (proc_id == 0) then + d2yvec(1) = ell1*vec(1) + ell2*vec(2) + ell3*vec(3) + ell4*vec(4) + ! Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 76, MPI_COMM_WORLD, mpierror) + ! Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 77, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + do jj = 2,n-1 + d2yvec(jj) = g1(jj)*vec(jj-1) + g2(jj)*vec(jj) + g3(jj)*vec(jj+1) + end do + d2yvec(n) = g1(n)*vec(n-1) + g2(n)*vec(n) + g3(n)*post_MPI +else if (proc_id == num_procs-1) then + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 76, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. + call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 77, MPI_COMM_WORLD, mpierror) + d2yvec(1) = g1(1)*prev_MPI + g2(1)*vec(1) + g3(1)*vec(2) + do jj = 2,n-1 + d2yvec(jj) = g1(jj)*vec(jj-1) + g2(jj)*vec(jj) + g3(jj)*vec(jj+1) + end do + d2yvec(n) = ellNm3*vec(n-3) + ellNm2*vec(n-2) + ellNm1*vec(n-1) + ellN*vec(n) +else + ! Send nth element of vector + call MPI_SEND(vec(n), 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 76, MPI_COMM_WORLD, mpierror) + ! Receive previous element of vector + call MPI_RECV(prev_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 76, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send first element of vector. + call MPI_SEND(vec(1), 1, MPI_C_DOUBLE_COMPLEX, proc_id-1, 77, MPI_COMM_WORLD, mpierror) + ! 
Receive post element of vector + call MPI_RECV(post_MPI, 1, MPI_C_DOUBLE_COMPLEX, proc_id+1, 77, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + d2yvec(1) = g1(1)*prev_MPI + g2(1)*vec(1) + g3(1)*vec(2) + do jj = 2,n-1 + d2yvec(jj) = g1(jj)*vec(jj-1) + g2(jj)*vec(jj) + g3(jj)*vec(jj+1) + end do + d2yvec(n) = g1(n)*vec(n-1) + g2(n)*vec(n) + g3(n)*post_MPI +endif + +end function d2y_MPI2 + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + subroutine check_alloc_err(err_mess) integer, intent(in) :: err_mess diff --git a/input.data b/input.data index ca77d7c..4b2b6c0 100755 --- a/input.data +++ b/input.data @@ -1,7 +1,7 @@ 7.0, 1.5585, 0.1 5.0e+03, 1, 1.1 -5.0, 0.1 +0.1, 0.1 -1.0, 1.0 -1600, 1275, 1 -0, 0, 0 +64, 48, 1 0, 0, 0 +1, 0, 0 diff --git a/mesh_pack.f90 b/mesh_pack.f90 index b7239b2..965550c 100755 --- a/mesh_pack.f90 +++ b/mesh_pack.f90 @@ -1,6 +1,7 @@ module mesh_pack use global +! include 'mpif.h' contains @@ -42,6 +43,51 @@ subroutine cosine_mesh(xcoor,ycoor,zcoor, numx,numy,numz,dxin) end subroutine cosine_mesh ! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +subroutine cosine_mesh_MPI(xcoor,ycoor,zcoor, numx,numy,numz, proc_id,num_procs,dxin) + +implicit none + +integer, intent(in) :: numx, numy, numz +integer, intent(in) :: proc_id, num_procs +integer :: mpierror +integer :: ii, jj, kk +real(dp) :: c +real(dp), optional, intent(in) :: dxin +real(dp) :: dxj +real(dp), allocatable, dimension(:), intent(out) :: xcoor, ycoor, zcoor +integer :: total_ny, start_ny + +if (present(dxin)) then + dxj = dxin +else + dxj = dx +end if + +allocate(xcoor(numx), stat=alloc_err) +allocate(ycoor(numy), stat=alloc_err) +allocate(zcoor(numz), stat=alloc_err) +call check_alloc_err(alloc_err) + +! Create the grid. +xcoor(1) = xL +zcoor(1) = 0.0_dp +total_ny = numy * num_procs +start_ny = proc_id * numy +do jj = 1,numy + c = (real(jj+start_ny,kind=dp) - 1.0_dp) / (real(total_ny,kind=dp) - 1.0_dp) + ycoor(jj) = -cos(c*pi) +end do +do ii = 2,numx + xcoor(ii) = xcoor(ii-1) + dxj +end do +do kk = 2,numz + zcoor(kk) = 0.0_dp +end do + +end subroutine cosine_mesh_MPI +! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + subroutine dxdydz(dyj, xcoor,ycoor,zcoor) implicit none @@ -61,7 +107,62 @@ subroutine dxdydz(dyj, xcoor,ycoor,zcoor) dyj(jj-1) = ycoor(jj) - ycoor(jj-1) end do +! do jj=1,numy-1 +! write(*,*) jj, dyj(jj) +! end do + end subroutine dxdydz + +! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +subroutine dxdydz_MPI(dyj, xcoor,ycoor,zcoor, proc_id, num_procs) + +implicit none + +real(dp), dimension(:), intent(in) :: xcoor, ycoor, zcoor +integer, intent(in) :: proc_id, num_procs +real(dp), allocatable, dimension(:), intent(out) :: dyj +integer :: alloc_err, mpierror +integer :: numy, jj +real(dp) :: prev_element + +numy = size(ycoor) + +if (proc_id == 0) then + allocate(dyj(numy-1), stat=alloc_err) + call check_alloc_err(alloc_err) + ! Calculate grid spacing + do jj = 2,numy + dyj(jj-1) = ycoor(jj) - ycoor(jj-1) + end do + ! Send last element to next process to the right. + call MPI_SEND(ycoor(numy), 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, mpierror) +else if (proc_id == num_procs - 1) then + ! Recieve first element from next process to the left. + call MPI_RECV(prev_element, 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + allocate(dyj(numy), stat=alloc_err) + call check_alloc_err(alloc_err) + ! 
Calculate grid spacing + dyj(1) = ycoor(1) - prev_element + do jj = 2,numy + dyj(jj) = ycoor(jj) - ycoor(jj-1) + end do +else + ! Recieve first element from next process to the left. + call MPI_RECV(prev_element, 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! Send last element to next process to the right. + call MPI_SEND(ycoor(numy), 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, mpierror) + allocate(dyj(numy), stat=alloc_err) + call check_alloc_err(alloc_err) + ! Calculate grid spacing + dyj(1) = ycoor(1) - prev_element + do jj = 2,numy + dyj(jj) = ycoor(jj) - ycoor(jj-1) + end do +end if + + +end subroutine dxdydz_MPI ! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: subroutine y_mesh_params @@ -95,6 +196,95 @@ subroutine y_mesh_params end subroutine y_mesh_params ! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine y_mesh_params_MPI (proc_id, num_procs) + +implicit none + +integer, intent(in) :: proc_id, num_procs +real(dp) :: recv_val +integer :: mpierror, jj + +! Calculate metric coefficients +! g is for second derivatives +! h is for first derivatives + +! First element +if (proc_id == 0) then + call MPI_RECV(recv_val, 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + ! First elems + g1(1) = 2.0_dp / (dynu(1)*(dynu(1)+dynu(2))) + g2(1) = -2.0_dp / (dynu(1)*dynu(2)) + g3(1) = 2.0_dp / (dynu(2)*(dynu(1)+dynu(2))) + + h1(1) = -(2.0_dp*dynu(1)+dynu(2)) / (dynu(1)*(dynu(1)+dynu(2))) + h2(1) = (dynu(1) + dynu(2)) / (dynu(1)*dynu(2)) + h3(1) = -dynu(1) / (dynu(2)*(dynu(1)+dynu(2))) + + ! Middle Ny-2 elements. + do jj = 2,Ny-1 + g1(jj) = 2.0_dp / (dynu(jj-1)*(dynu(jj)+dynu(jj-1))) + g2(jj) = -2.0_dp / (dynu(jj-1)*dynu(jj)) + g3(jj) = 2.0_dp / (dynu( jj )*(dynu(jj)+dynu(jj-1))) + + h1(jj) = -dynu(jj) / (dynu(jj-1)*(dynu(jj-1) + dynu(jj))) + h2(jj) = (dynu(jj) - dynu(jj-1)) / (dynu(jj-1)*dynu(jj)) + h3(jj) = dynu(jj-1) / (dynu(jj)*(dynu(jj-1) + dynu(jj))) + end do + + ! Last elems + g1(Ny) = 2.0_dp / (dynu(Ny-1)*(recv_val+dynu(Ny-1))) + g2(Ny) = -2.0_dp / (dynu(Ny-1)*recv_val) + g3(Ny) = 2.0_dp / (recv_val*(recv_val+dynu(Ny-1))) + + h1(Ny) = -recv_val / (dynu(Ny-1)*(dynu(Ny-1) + recv_val)) + h2(Ny) = (recv_val - dynu(Ny-1)) / (dynu(Ny-1)*recv_val) + h3(Ny) = dynu(Ny-1) / (recv_val*(dynu(Ny-1) + recv_val)) +else if (proc_id == num_procs - 1) then + call MPI_SEND(dynu(1), 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, mpierror) + do jj = 1,Ny-1 + g1(jj) = 2.0_dp / (dynu(jj)*(dynu(jj+1)+dynu(jj))) + g2(jj) = -2.0_dp / (dynu(jj)*dynu(jj+1)) + g3(jj) = 2.0_dp / (dynu(jj+1)*(dynu(jj+1)+dynu(jj))) + + h1(jj) = -dynu(jj+1) / (dynu(jj)*(dynu(jj) + dynu(jj+1))) + h2(jj) = (dynu(jj+1) - dynu(jj)) / (dynu(jj)*dynu(jj+1)) + h3(jj) = dynu(jj) / (dynu(jj+1)*(dynu(jj) + dynu(jj+1))) + end do + h1(Ny) = dynu(Ny) / (dynu(Ny-1)*(dynu(Ny-1)+dynu(Ny))) + h2(Ny) = -(dynu(Ny-1) + dynu(Ny)) / (dynu(Ny-1)*dynu(Ny)) + h3(Ny) = (2.0_dp*dynu(Ny) + dynu(Ny-1)) / (dynu(Ny)*(dynu(Ny-1) + dynu(Ny))) + + g1(Ny) = 2.0_dp / (dynu(Ny-1)*(dynu(Ny-1) + dynu(Ny))) + g2(Ny) = -2.0_dp / (dynu(Ny)*dynu(Ny-1)) + g3(Ny) = 2.0_dp / (dynu(Ny)*(dynu(Ny) + dynu(Ny-1))) + +else + call MPI_SEND(dynu(1), 1, MPI_DOUBLE, proc_id-1, 42, MPI_COMM_WORLD, mpierror) + call MPI_RECV(recv_val, 1, MPI_DOUBLE, proc_id+1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + ! All but last elems. 
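+   ! On interior ranks the stencil at row jj uses dynu(jj) and dynu(jj+1),
+   ! so only the last row needs the neighbour's first spacing (recv_val),
+   ! which is handled after this loop.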
+ do jj = 1,Ny-1 + g1(jj) = 2.0_dp / (dynu(jj)*(dynu(jj+1)+dynu(jj))) + g2(jj) = -2.0_dp / (dynu(jj)*dynu(jj+1)) + g3(jj) = 2.0_dp / (dynu(jj+1)*(dynu(jj+1)+dynu(jj))) + + h1(jj) = -dynu(jj+1) / (dynu(jj)*(dynu(jj) + dynu(jj+1))) + h2(jj) = (dynu(jj+1) - dynu(jj)) / (dynu(jj)*dynu(jj+1)) + h3(jj) = dynu(jj) / (dynu(jj+1)*(dynu(jj) + dynu(jj+1))) + end do + ! Last elems + g1(Ny) = 2.0_dp / (dynu(Ny)*(recv_val+dynu(Ny))) + g2(Ny) = -2.0_dp / (dynu(Ny)*recv_val) + g3(Ny) = 2.0_dp / (recv_val*(recv_val+dynu(Ny))) + + h1(Ny) = -recv_val / (dynu(Ny)*(dynu(Ny) + recv_val)) + h2(Ny) = (recv_val - dynu(Ny)) / (dynu(Ny)*recv_val) + h3(Ny) = dynu(Ny) / (recv_val*(dynu(Ny) + recv_val)) +end if + +end subroutine y_mesh_params_MPI +! ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + subroutine uniform_mesh(xcoor,ycoor,zcoor, numx,numy,numz) implicit none diff --git a/time_integrators.f90 b/time_integrators.f90 index 3f1b908..99d4ff1 100644 --- a/time_integrators.f90 +++ b/time_integrators.f90 @@ -29,7 +29,7 @@ subroutine imex_rk(vtk_print, save_nusselt) logical, optional, intent(in) :: save_nusselt integer :: nti -integer :: nprint +integer :: nprint, i, j logical :: wvtk real(dp) :: nusselt_num real(dp) :: start, finish @@ -37,8 +37,11 @@ subroutine imex_rk(vtk_print, save_nusselt) integer :: nthreads, myid integer, EXTERNAL :: OMP_GET_THREAD_NUM, OMP_GET_NUM_THREADS real(dp), EXTERNAL :: OMP_GET_WTIME +EXTERNAL :: OMP_SET_NUM_THREADS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +call OMP_SET_NUM_THREADS(1) + if (present(vtk_print)) then wvtk = .true. nprint = vtk_print @@ -69,6 +72,7 @@ subroutine imex_rk(vtk_print, save_nusselt) ! Format for writing out single values. 1000 format(E25.16E3) + ! Time integration do ! while (time < t_final) start_overall = OMP_GET_WTIME() @@ -259,6 +263,17 @@ subroutine imex_rk(vtk_print, save_nusselt) if (time == t_final) then + open(unit=9010, file="T_real_update.txt", action="write", status="unknown") + open(unit=9011, file="T_im_update.txt", action="write", status="unknown") + do i=1,Ny + do j=1,Nx + write (9010,*) REAL(T(i,j)) + write (9011,*) AIMAG(T(i,j)) + end do + end do + close(unit=9010) + close(unit=9011) + write(*,*) "done writing T!" exit end if @@ -283,6 +298,7 @@ subroutine imex_rk(vtk_print, save_nusselt) end do ! time loop + end subroutine imex_rk !:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! @@ -607,25 +623,25 @@ subroutine calc_explicit(stage) start = OMP_GET_WTIME() select case(stage) case (1) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K1hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do !$OMP END PARALLEL DO case (2) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K2hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do !$OMP END PARALLEL DO case (3) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K3hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do !$OMP END PARALLEL DO case (4) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx K4hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) end do @@ -635,7 +651,7 @@ subroutine calc_explicit(stage) write(*,*) " - - l1 timing: ", finish-start, "(s)" start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) schedule(dynamic) +!$OMP PARALLEL DO schedule(dynamic) do i=1,Nx ! 
Compute dx(T) in Fourier space nlT (:,i) = kx(i)*Ti(:,i) @@ -650,7 +666,7 @@ subroutine calc_explicit(stage) nlT = CI*nlT start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) private(tnlT, tnlphi, tT, tux, tuy, tphi) schedule(dynamic) +!$OMP PARALLEL DO private(tnlT, tnlphi, tT, tux, tuy, tphi) schedule(dynamic) do j = 1,Ny ! Bring everything to physical space tnlT = nlT(j,:) @@ -678,7 +694,7 @@ subroutine calc_explicit(stage) ! Calculate nonlinear term start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) private(tmp_T) schedule(dynamic) +!$OMP PARALLEL DO private(tmp_T) schedule(dynamic) do i = 1,Nx ! Temperature tmp_T = Ti(:,i) @@ -692,7 +708,7 @@ subroutine calc_explicit(stage) ! Bring nonlinear terms back to Fourier space start = OMP_GET_WTIME() -!$OMP PARALLEL DO num_threads(16) private(tnlT, tnlphi) schedule(dynamic) +!$OMP PARALLEL DO private(tnlT, tnlphi) schedule(dynamic) do j = 1,Ny tnlT = nlT(j,:) tnlphi = nlphi(j,:) @@ -718,7 +734,7 @@ subroutine calc_explicit(stage) start = OMP_GET_WTIME() select case (stage) case (1) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K1hat_phi(:,i) = K1hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K1hat_phi(:,i) = K1hat_phi(:,i) - CI*kx(i)*nlphi(:,i) @@ -726,7 +742,7 @@ subroutine calc_explicit(stage) !$OMP END PARALLEL DO K1hat_T = -nlT case (2) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K2hat_phi(:,i) = K2hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K2hat_phi(:,i) = K2hat_phi(:,i) - CI*kx(i)*nlphi(:,i) @@ -734,7 +750,7 @@ subroutine calc_explicit(stage) !$OMP END PARALLEL DO K2hat_T = -nlT case (3) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K3hat_phi(:,i) = K3hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K3hat_phi(:,i) = K3hat_phi(:,i) - CI*kx(i)*nlphi(:,i) @@ -742,7 +758,7 @@ subroutine calc_explicit(stage) !$OMP END PARALLEL DO K3hat_T = -nlT case (4) - !$OMP PARALLEL DO num_threads(16) schedule(dynamic) + !$OMP PARALLEL DO schedule(dynamic) do i = 1,Nx !K4hat_phi(:,i) = K4hat_phi(:,i) + CI*kx(i)*nlphi(:,i) K4hat_phi(:,i) = K4hat_phi(:,i) - CI*kx(i)*nlphi(:,i) diff --git a/time_integrators_MPI.f90 b/time_integrators_MPI.f90 new file mode 100644 index 0000000..73e48ce --- /dev/null +++ b/time_integrators_MPI.f90 @@ -0,0 +1,1637 @@ +module time_integrators_MPI + +use fftw +use global +use write_pack +use allocate_vars +use bc_setup +use statistics +use omp_lib + + +integer :: it, jt, kkt +integer :: info +real(dp) :: time, dtmax, dtmin, dt_old, dt_ramp, dt_final + +contains + +subroutine imex_rk_MPI(proc_id_str, vtk_print, save_nusselt, proc_id, num_procs) + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +!! This progam solves the equations of thermal convection using a Fourier +!! spectral method in the x-direction and a 2nd order finite difference scheme in +!! the y-direction. +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
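+!! A sketch of the parallel layout assumed below: the y-direction is
+!! decomposed into contiguous blocks of Ny rows per MPI rank, so
+!!     total_ny = Ny * num_procs
+!!     global row of local row j on rank p = p*Ny + j
+!! Rank 0 gathers full-height columns for the implicit solves and boundary
+!! updates, then scatters each rank's block back at every IMEX-RK stage.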
+ + +implicit none + +character(3), intent(in) :: proc_id_str +integer, optional, intent(in) :: vtk_print +logical, optional, intent(in) :: save_nusselt +integer, intent(in) :: proc_id, num_procs + +integer :: nti, i, j, mpierror, total_ny +integer :: nprint, otherproc +logical :: wvtk +real(dp) :: nusselt_num +real(dp) :: start, finish +real(dp) :: start_overall, finish_overall +integer :: nthreads, myid, stind +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_phi_MPI, tmp_T_MPI, phi_MPI, phi_update_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_phi_MPI, K2hat_phi_MPI, K3hat_phi_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1hat_T_MPI, K2hat_T_MPI, K3hat_T_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: T_MPI, tmp_uy_MPI, tmp_uy1_MPI, tmp_phi1_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: tmp_K_phi_MPI, tmp_K_T_MPI, uxi_MPI +real(dp), allocatable, dimension(:) :: phi1_MPI, phi2_MPI, V1_MPI, V2_MPI +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: single_phiout, single_Tout +real(dp), allocatable, dimension(:) :: g1_total, g2_total, g3_total, dynu_MPI +real(dp), allocatable, dimension(:) :: h1_total, h2_total, h3_total +real(dp) :: dyv1_T_it_MPI, dyv2_T_it_MPI, h1_end_MPI, h2_end_MPI, h3_end_MPI +integer, EXTERNAL :: OMP_GET_THREAD_NUM, OMP_GET_NUM_THREADS +real(dp), EXTERNAL :: OMP_GET_WTIME +EXTERNAL :: OMP_SET_NUM_THREADS +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +call OMP_SET_NUM_THREADS(1) + +if (present(vtk_print)) then + wvtk = .true. + nprint = vtk_print +else + wvtk = .false. + nprint = 100 +end if + +write(*,*) "imex_rk_MPI from proc ", proc_id_str + +if (wvtk) then + call write_to_vtk(0, .false., proc_id_str) ! false = Fourier space +end if + +dt = dt_init + +call init_bc_MPI(acoeffs(1,1), proc_id, num_procs, proc_id_str) + +time = 0.0_dp + +dtmax = 0.5_dp +dtmin = 1.0e-4_dp + +dt_ramp = 1.1_dp + +dt_old = dt + +nti = 0 + +! 
Format for writing out single values.x +1000 format(E25.16E3) + +if (proc_id == 0) then + total_ny = Ny * num_procs + allocate(tmp_phi_MPI(total_ny), tmp_T_MPI(total_ny), phi_MPI(total_ny-2), phi_update_MPI(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(K1hat_phi_MPI(total_ny-2), K2hat_phi_MPI(total_ny-2), K3hat_phi_MPI(total_ny-2), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(K1hat_T_MPI(total_ny-2), K2hat_T_MPI(total_ny-2), K3hat_T_MPI(total_ny-2), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(K1_phi_MPI(total_ny-2), K2_phi_MPI(total_ny-2), K1_T_MPI(total_ny-2), K2_T_MPI(total_ny-2), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(T_MPI(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(g1_total(total_ny), g2_total(total_ny), g3_total(total_ny), dynu_MPI(total_ny-1), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(h1_total(total_ny), h2_total(total_ny), h3_total(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(tmp_uy_MPI(total_ny), tmp_uy1_MPI(total_ny), tmp_phi1_MPI(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(single_phiout(total_ny, Nx), single_Tout(total_ny,Nx), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(phi1_MPI(total_ny), phi2_MPI(total_ny), V1_MPI(total_ny), V2_MPI(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) + allocate(tmp_K_phi_MPI(total_ny), tmp_K_T_MPI(total_ny), uxi_MPI(total_ny), stat=alloc_err) + call check_alloc_err(alloc_err) +end if + +call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + + +! Time integration +do ! while (time < t_final) + start_overall = OMP_GET_WTIME() + + dt_final = t_final - time + + if (dt_final <= dt) then + time = t_final + else + time = time + dt + end if + + write(*,*) "time = ", time, "dt = ", dt + + nti = nti + 1 + + !::::::::::: + ! STAGE 1 :: + !::::::::::: + phii = phi + Ti = T + uxi = ux + uyi = uy + start = OMP_GET_WTIME() + call calc_explicit_MPI(1, proc_id, num_procs, proc_id_str) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(1) timing: ", finish-start, "(s)" + start = OMP_GET_WTIME() + !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + do it = 1,Nx ! kx loop + ! Compute phi1 and T1 + if (proc_id == 0) then + ! Need to pull all of the variables into local main memory. + phi_MPI(1:Ny-1) = phi(2:Ny,it) + K1hat_phi_MPI(1:Ny-1) = K1hat_phi(2:Ny,it) + K2hat_phi_MPI(1:Ny-1) = K2hat_phi(2:Ny,it) + K3hat_phi_MPI(1:Ny-1) = K3hat_phi(2:Ny,it) + K1hat_T_MPI(1:Ny-1) = K1hat_T(2:Ny,it) + K2hat_T_MPI(1:Ny-1) = K2hat_T(2:Ny,it) + K3hat_t_MPI(1:Ny-1) = K3hat_T(2:Ny,it) + K1_phi_MPI(1:Ny-1) = K1_phi(2:Ny,it) + K2_phi_MPI(1:Ny-1) = K2_phi(2:Ny,it) + K1_T_MPI(1:Ny-1) = K1_T(2:Ny,it) + K2_T_MPI(1:Ny-1) = K2_T(2:Ny,it) + T_MPI(1:Ny) = T(1:Ny,it) + + ! Receive pieces from other nodes. 
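+         ! Ranks 1..num_procs-2 contribute full Ny-point blocks; the last
+         ! rank is handled separately below, sending Ny-1 points for phi and
+         ! the K arrays (its final row is the top wall), mirroring the 2:Ny
+         ! slice taken from rank 0's own data above. A collective such as
+         ! MPI_GATHERV would be an equivalent alternative to this
+         ! point-to-point loop.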
+ do otherproc = 1,num_procs-2 + stind = otherproc * Ny + call MPI_RECV(phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + + ! Receive from last node. + stind = (num_procs-1) * Ny + call MPI_RECV(phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + + ! Receive g1,g2,g3 from all other nodes. 
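+         ! Same row-block gather as in init_bc_MPI: rank p's metric block
+         ! lands in g1_total/g2_total/g3_total(p*Ny+1 : (p+1)*Ny).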
+ do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 54, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 55, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 56, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + ! Call vari mod + call calc_vari_mod_MPI(tmp_phi_MPI, tmp_T_MPI, acoeffs(1,1), 1, total_ny,& + kx(it), phi_MPI,& + K1hat_phi_MPI,K2hat_phi_MPI,K3hat_phi_MPI,& + K1hat_T_MPI,K2hat_T_MPI,K3hat_T_MPI,& + K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,& + g1_total, g2_total, g3_total,& + T_MPI) + + ! Compute v1 from phi1 + call calc_vi_mod_MPI(tmp_uy_MPI, tmp_phi_MPI, kx(it), total_ny, g1_total, g2_total, g3_total) + + ! Receive dyv1 and dyv2 + call MPI_RECV(dyv1_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 57, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(dyv2_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 58, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + ! Receive V1, V2, phi1, phi2 + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(V1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 59, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(V2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 60, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 61, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 62, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + ! Set local variables to MPI. + V1_MPI(1:Ny) = V1(1:Ny, it) + V2_MPI(1:Ny) = V2(1:Ny, it) + phi1_MPI(1:Ny) = phi1(1:Ny, it) + phi2_MPI(1:Ny) = phi2(1:Ny, it) + + ! Receive end of h1, h2, h3 for last processor. + call MPI_RECV(h1_end_MPI, 1, MPI_DOUBLE, num_procs-1, 63, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h2_end_MPI, 1, MPI_DOUBLE, num_procs-1, 64, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h3_end_MPI, 1, MPI_DOUBLE, num_procs-1, 65, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + ! BOUNDAY CONDITIONS! + call update_bcs_mod_MPI(tmp_phi1_MPI,tmp_uy1_MPI, tmp_phi_MPI,tmp_uy_MPI,& + dyv1_T_it_MPI,dyv2_T_it_MPI,& + dyv1_B(it),dyv2_B(it),& + V1_MPI,V2_MPI,phi1_MPI,phi2_MPI,& + total_ny, h1_end_MPI, h2_end_MPI, h3_end_MPI) + + tmp_phi_MPI = tmp_phi1_MPI + tmp_uy_MPI = tmp_uy1_MPI + + ! Receive dynu from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + call MPI_RECV(dynu_MPI(stind), Ny, MPI_DOUBLE, otherproc, 66, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + dynu_MPI(1:Ny-1) = dynu(1:Ny-1) + + ! Implicit. + call calc_implicit_mod_MPI(tmp_K_phi_MPI,tmp_K_T_MPI, tmp_phi_MPI,tmp_T_MPI,& + kx(it), total_ny, dynu_MPI, g1_total, g2_total, g3_total) + + ! Receive h1,h2,h3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 67, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 68, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 69, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + h1_total(1:Ny) = h1 + h2_total(1:Ny) = h2 + h3_total(1:Ny) = h3 + + ! 
Compute u1 from v1 + if (kx(it) /= 0.0_dp) then + uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + + ! Send K1_phi, K1_T, phii, Ti, uyi, uxi back to each node. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_SEND(tmp_K_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 70, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_K_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 71, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 72, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 73, MPI_COMM_WORLD, mpierror) + call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 74, MPI_COMM_WORLD, mpierror) + call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 75, MPI_COMM_WORLD, mpierror) + end do + + ! Set local to main node variables. + K1_phi(1:Ny,it) = tmp_K_phi_MPI(1:Ny) + K1_T(1:Ny,it) = tmp_K_T_MPI(1:Ny) + phii(1:Ny,it) = tmp_phi_MPI(1:Ny) + Ti(1:Ny,it) = tmp_T_MPI(1:Ny) + uyi(1:Ny,it) = tmp_uy_MPI(1:Ny) + uxi(1:Ny,it) = uxi_MPI(1:Ny) + + + else if (proc_id == num_procs - 1) then + ! Send data to main node + call MPI_SEND(phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send dyv1 and dyv2. + call MPI_SEND(dyv1_T(it), 1, MPI_DOUBLE, 0, 57, MPI_COMM_WORLD, mpierror) + call MPI_SEND(dyv2_T(it), 1, MPI_DOUBLE, 0, 58, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send h1,h2,h3 last elements. + call MPI_SEND(h1(Ny), 1, MPI_DOUBLE, 0, 63, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(Ny), 1, MPI_DOUBLE, 0, 64, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(Ny), 1, MPI_DOUBLE, 0, 65, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. 
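+      ! dynu on non-root ranks is assumed to carry Ny spacings (as allocated
+      ! in dxdydz_MPI), so the full local block is sent; rank 0 stores it in
+      ! dynu_MPI starting at index proc_id*Ny.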
+ call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K1_phi, K1_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + else + ! Send data to main node + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K1_phi, K1_T, phii, Ti, uyi, uxi from main node. 
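+      ! Matching receives for the tag-70..75 scatter performed by rank 0
+      ! after the implicit solve; each rank gets back only its own Ny-row
+      ! block of the updated fields.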
+ call MPI_RECV(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end if + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 1 mid timing: ", finish-start, "(s)" + + ! Compute K2hat + start = OMP_GET_WTIME() + call calc_explicit_MPI(2, proc_id, num_procs, proc_id_str) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(2) timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + + !::::::::::: + ! STAGE 2 :: + !::::::::::: + start = OMP_GET_WTIME() + !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + do it = 1,Nx ! kx loop + ! Compute phi1 and T1 + if (proc_id == 0) then + ! Need to pull all of the variables into local main memory. + phi_MPI(1:Ny-1) = phi(2:Ny,it) + K1hat_phi_MPI(1:Ny-1) = K1hat_phi(2:Ny,it) + K2hat_phi_MPI(1:Ny-1) = K2hat_phi(2:Ny,it) + K3hat_phi_MPI(1:Ny-1) = K3hat_phi(2:Ny,it) + K1hat_T_MPI(1:Ny-1) = K1hat_T(2:Ny,it) + K2hat_T_MPI(1:Ny-1) = K2hat_T(2:Ny,it) + K3hat_t_MPI(1:Ny-1) = K3hat_T(2:Ny,it) + K1_phi_MPI(1:Ny-1) = K1_phi(2:Ny,it) + K2_phi_MPI(1:Ny-1) = K2_phi(2:Ny,it) + K1_T_MPI(1:Ny-1) = K1_T(2:Ny,it) + K2_T_MPI(1:Ny-1) = K2_T(2:Ny,it) + T_MPI(1:Ny) = T(1:Ny,it) + + ! Receive pieces from other nodes. + do otherproc = 1,num_procs-2 + stind = otherproc * Ny + call MPI_RECV(phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + + ! Receive from last node. 
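+         ! As in stage 1, the last rank supplies Ny-1 interior points for
+         ! phi and the K arrays (its final row is the top wall) but a full
+         ! Ny-point temperature block.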
+ stind = (num_procs-1) * Ny + call MPI_RECV(phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + + ! Receive g1,g2,g3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 54, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 55, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 56, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + ! Call vari mod + call calc_vari_mod_MPI(tmp_phi_MPI, tmp_T_MPI, acoeffs(2,2), 2, total_ny,& + kx(it), phi_MPI,& + K1hat_phi_MPI,K2hat_phi_MPI,K3hat_phi_MPI,& + K1hat_T_MPI,K2hat_T_MPI,K3hat_T_MPI,& + K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,& + g1_total, g2_total, g3_total,& + T_MPI) + + ! Compute v1 from phi1 + call calc_vi_mod_MPI(tmp_uy_MPI, tmp_phi_MPI, kx(it), total_ny, g1_total, g2_total, g3_total) + + ! Receive dyv1 and dyv2 + call MPI_RECV(dyv1_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 57, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(dyv2_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 58, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + ! Receive V1, V2, phi1, phi2 + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(V1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 59, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(V2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 60, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 61, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 62, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + ! Set local variables to MPI. + V1_MPI(1:Ny) = V1(1:Ny, it) + V2_MPI(1:Ny) = V2(1:Ny, it) + phi1_MPI(1:Ny) = phi1(1:Ny, it) + phi2_MPI(1:Ny) = phi2(1:Ny, it) + + ! Receive end of h1, h2, h3 for last processor. 
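+         ! h1(Ny), h2(Ny), h3(Ny) from the last rank form the one-sided
+         ! first-derivative stencil at the top wall, needed by
+         ! update_bcs_mod_MPI below.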
+ call MPI_RECV(h1_end_MPI, 1, MPI_DOUBLE, num_procs-1, 63, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_end_MPI, 1, MPI_DOUBLE, num_procs-1, 64, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_end_MPI, 1, MPI_DOUBLE, num_procs-1, 65, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+ ! BOUNDARY CONDITIONS!
+ call update_bcs_mod_MPI(tmp_phi1_MPI,tmp_uy1_MPI, tmp_phi_MPI,tmp_uy_MPI,&
+ dyv1_T_it_MPI,dyv2_T_it_MPI,&
+ dyv1_B(it),dyv2_B(it),&
+ V1_MPI,V2_MPI,phi1_MPI,phi2_MPI,&
+ total_ny, h1_end_MPI, h2_end_MPI, h3_end_MPI)
+
+ tmp_phi_MPI = tmp_phi1_MPI
+ tmp_uy_MPI = tmp_uy1_MPI
+
+ ! Receive dynu from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny
+ call MPI_RECV(dynu_MPI(stind), Ny, MPI_DOUBLE, otherproc, 66, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ dynu_MPI(1:Ny-1) = dynu(1:Ny-1)
+
+ ! Implicit.
+ call calc_implicit_mod_MPI(tmp_K_phi_MPI,tmp_K_T_MPI, tmp_phi_MPI,tmp_T_MPI,&
+ kx(it), total_ny, dynu_MPI, g1_total, g2_total, g3_total)
+
+ ! Receive h1,h2,h3 from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 67, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 68, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 69, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ h1_total(1:Ny) = h1
+ h2_total(1:Ny) = h2
+ h3_total(1:Ny) = h3
+
+ ! Compute u1 from v1
+ if (kx(it) /= 0.0_dp) then
+ uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it)
+ else if (kx(it) == 0.0_dp) then
+ uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow!
+ end if
+
+ ! Send K2_phi, K2_T, phii, Ti, uyi, uxi back to each node.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_SEND(tmp_K_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 70, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_K_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 71, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 72, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 73, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 74, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 75, MPI_COMM_WORLD, mpierror)
+ end do
+
+ ! Set local to main node variables.
+ K2_phi(1:Ny,it) = tmp_K_phi_MPI(1:Ny)
+ K2_T(1:Ny,it) = tmp_K_T_MPI(1:Ny)
+ phii(1:Ny,it) = tmp_phi_MPI(1:Ny)
+ Ti(1:Ny,it) = tmp_T_MPI(1:Ny)
+ uyi(1:Ny,it) = tmp_uy_MPI(1:Ny)
+ uxi(1:Ny,it) = uxi_MPI(1:Ny)
+
+
+ else if (proc_id == num_procs - 1) then
+ !
Send data to main node + call MPI_SEND(phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send dyv1 and dyv2. + call MPI_SEND(dyv1_T(it), 1, MPI_DOUBLE, 0, 57, MPI_COMM_WORLD, mpierror) + call MPI_SEND(dyv2_T(it), 1, MPI_DOUBLE, 0, 58, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send h1,h2,h3 last elements. + call MPI_SEND(h1(Ny), 1, MPI_DOUBLE, 0, 63, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(Ny), 1, MPI_DOUBLE, 0, 64, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(Ny), 1, MPI_DOUBLE, 0, 65, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K2_phi, K2_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + else + ! 
Send data to main node + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K2_phi, K2_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end if + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 2 mid timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + ! Compute K3hat + start = OMP_GET_WTIME() + call calc_explicit_MPI(3, proc_id, num_procs, proc_id_str) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(3) timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + + !::::::::::: + ! STAGE 3 :: + !::::::::::: + start = OMP_GET_WTIME() + !$OMP PARALLEL DO private(tmp_phi, tmp_T, tmp_uy, tmp_phi1, tmp_uy1, tmp_K_phi, tmp_K_T) schedule(dynamic) + do it = 1,Nx ! kx loop + ! Compute phi1 and T1 + if (proc_id == 0) then + ! Need to pull all of the variables into local main memory. 
+ phi_MPI(1:Ny-1) = phi(2:Ny,it) + K1hat_phi_MPI(1:Ny-1) = K1hat_phi(2:Ny,it) + K2hat_phi_MPI(1:Ny-1) = K2hat_phi(2:Ny,it) + K3hat_phi_MPI(1:Ny-1) = K3hat_phi(2:Ny,it) + K1hat_T_MPI(1:Ny-1) = K1hat_T(2:Ny,it) + K2hat_T_MPI(1:Ny-1) = K2hat_T(2:Ny,it) + K3hat_t_MPI(1:Ny-1) = K3hat_T(2:Ny,it) + K1_phi_MPI(1:Ny-1) = K1_phi(2:Ny,it) + K2_phi_MPI(1:Ny-1) = K2_phi(2:Ny,it) + K1_T_MPI(1:Ny-1) = K1_T(2:Ny,it) + K2_T_MPI(1:Ny-1) = K2_T(2:Ny,it) + T_MPI(1:Ny) = T(1:Ny,it) + + ! Receive pieces from other nodes. + do otherproc = 1,num_procs-2 + stind = otherproc * Ny + call MPI_RECV(phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + + ! Receive from last node. 
+ stind = (num_procs-1) * Ny + call MPI_RECV(phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 43, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 45, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 46, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 47, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3hat_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 48, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 49, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_phi_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 50, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K1_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 51, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K2_T_MPI(stind), Ny-1, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 52, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(T_MPI(stind+1), Ny, MPI_C_DOUBLE_COMPLEX, & + num_procs-1, 53, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + + ! Receive g1,g2,g3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 54, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 55, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 56, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + ! Call vari mod + call calc_vari_mod_MPI(tmp_phi_MPI, tmp_T_MPI, acoeffs(3,3), 3, total_ny,& + kx(it), phi_MPI,& + K1hat_phi_MPI,K2hat_phi_MPI,K3hat_phi_MPI,& + K1hat_T_MPI,K2hat_T_MPI,K3hat_T_MPI,& + K1_phi_MPI, K2_phi_MPI, K1_T_MPI, K2_T_MPI,& + g1_total, g2_total, g3_total,& + T_MPI) + + ! Compute v1 from phi1 + call calc_vi_mod_MPI(tmp_uy_MPI, tmp_phi_MPI, kx(it), total_ny, g1_total, g2_total, g3_total) + + ! Receive dyv1 and dyv2 + call MPI_RECV(dyv1_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 57, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(dyv2_T_it_MPI, 1, MPI_DOUBLE, num_procs-1, 58, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + ! Receive V1, V2, phi1, phi2 + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(V1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 59, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(V2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 60, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi1_MPI(stind), Ny, MPI_DOUBLE, otherproc, 61, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phi2_MPI(stind), Ny, MPI_DOUBLE, otherproc, 62, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + ! Set local variables to MPI. + V1_MPI(1:Ny) = V1(1:Ny, it) + V2_MPI(1:Ny) = V2(1:Ny, it) + phi1_MPI(1:Ny) = phi1(1:Ny, it) + phi2_MPI(1:Ny) = phi2(1:Ny, it) + + ! Receive end of h1, h2, h3 for last processor. 
+ call MPI_RECV(h1_end_MPI, 1, MPI_DOUBLE, num_procs-1, 63, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_end_MPI, 1, MPI_DOUBLE, num_procs-1, 64, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_end_MPI, 1, MPI_DOUBLE, num_procs-1, 65, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+
+ ! BOUNDARY CONDITIONS!
+ call update_bcs_mod_MPI(tmp_phi1_MPI,tmp_uy1_MPI, tmp_phi_MPI,tmp_uy_MPI,&
+ dyv1_T_it_MPI,dyv2_T_it_MPI,&
+ dyv1_B(it),dyv2_B(it),&
+ V1_MPI,V2_MPI,phi1_MPI,phi2_MPI,&
+ total_ny, h1_end_MPI, h2_end_MPI, h3_end_MPI)
+
+ tmp_phi_MPI = tmp_phi1_MPI
+ tmp_uy_MPI = tmp_uy1_MPI
+
+ ! Receive dynu from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny
+ call MPI_RECV(dynu_MPI(stind), Ny, MPI_DOUBLE, otherproc, 66, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ dynu_MPI(1:Ny-1) = dynu(1:Ny-1)
+
+ ! Implicit.
+ call calc_implicit_mod_MPI(tmp_K_phi_MPI,tmp_K_T_MPI, tmp_phi_MPI,tmp_T_MPI,&
+ kx(it), total_ny, dynu_MPI, g1_total, g2_total, g3_total)
+
+ ! Receive h1,h2,h3 from all other nodes.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 67, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 68, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 69, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror)
+ end do
+ h1_total(1:Ny) = h1
+ h2_total(1:Ny) = h2
+ h3_total(1:Ny) = h3
+
+ ! Compute u1 from v1
+ if (kx(it) /= 0.0_dp) then
+ uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it)
+ else if (kx(it) == 0.0_dp) then
+ uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow!
+ end if
+
+ ! Send K3_phi, K3_T, phii, Ti, uyi, uxi back to each node.
+ do otherproc = 1,num_procs-1
+ stind = otherproc * Ny + 1
+ call MPI_SEND(tmp_K_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 70, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_K_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 71, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_phi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 72, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_T_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 73, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 74, MPI_COMM_WORLD, mpierror)
+ call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 75, MPI_COMM_WORLD, mpierror)
+ end do
+
+ ! Set local to main node variables.
+ K3_phi(1:Ny,it) = tmp_K_phi_MPI(1:Ny)
+ K3_T(1:Ny,it) = tmp_K_T_MPI(1:Ny)
+ phii(1:Ny,it) = tmp_phi_MPI(1:Ny)
+ Ti(1:Ny,it) = tmp_T_MPI(1:Ny)
+ uyi(1:Ny,it) = tmp_uy_MPI(1:Ny)
+ uxi(1:Ny,it) = uxi_MPI(1:Ny)
+
+
+ else if (proc_id == num_procs - 1) then
+ !
Send data to main node + call MPI_SEND(phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny-1,it), Ny-1, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send dyv1 and dyv2. + call MPI_SEND(dyv1_T(it), 1, MPI_DOUBLE, 0, 57, MPI_COMM_WORLD, mpierror) + call MPI_SEND(dyv2_T(it), 1, MPI_DOUBLE, 0, 58, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send h1,h2,h3 last elements. + call MPI_SEND(h1(Ny), 1, MPI_DOUBLE, 0, 63, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(Ny), 1, MPI_DOUBLE, 0, 64, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(Ny), 1, MPI_DOUBLE, 0, 65, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K3_phi, K3_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K3_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + + else + ! 
Send data to main node + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 42, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 43, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 44, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 45, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 46, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 47, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K3hat_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 48, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 49, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 50, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K1_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 51, MPI_COMM_WORLD, mpierror) + call MPI_SEND(K2_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 52, MPI_COMM_WORLD, mpierror) + call MPI_SEND(T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 53, MPI_COMM_WORLD, mpierror) + + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 54, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 55, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 56, MPI_COMM_WORLD, mpierror) + + ! Send V1, V2, phi1, phi2 + call MPI_SEND(V1(1:Ny,it), Ny, MPI_DOUBLE, 0, 59, MPI_COMM_WORLD, mpierror) + call MPI_SEND(V2(1:Ny,it), Ny, MPI_DOUBLE, 0, 60, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi1(1:Ny,it), Ny, MPI_DOUBLE, 0, 61, MPI_COMM_WORLD, mpierror) + call MPI_SEND(phi2(1:Ny,it), Ny, MPI_DOUBLE, 0, 62, MPI_COMM_WORLD, mpierror) + + ! Send dynu to main. + call MPI_SEND(dynu(1), Ny, MPI_DOUBLE, 0, 66, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 67, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 68, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 69, MPI_COMM_WORLD, mpierror) + + ! Receive K3_phi, K3_T, phii, Ti, uyi, uxi from main node. + call MPI_RECV(K3_phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 70,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(K3_T(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 71,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(phii(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 72,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(Ti(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 73,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uyi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 74,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(uxi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 75,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end if + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - stage 3 mid timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + ! Compute K4hat + start = OMP_GET_WTIME() + call calc_explicit_MPI(4, proc_id, num_procs, proc_id_str) + finish = OMP_GET_WTIME() + write(*,*) " - calc_explicit(4) timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + + start = OMP_GET_WTIME() + ! UPDATE SOLUTIONS + if (proc_id == 0) then + ! Get phi + phi(2:Ny,:) = phi(2:Ny,:) + dt*(b(1)*(K1_phi(2:Ny,:) + K2hat_phi(2:Ny,:)) + & + & b(2)*(K2_phi(2:Ny,:) + K3hat_phi(2:Ny,:)) + & + & b(3)*(K3_phi(2:Ny,:) + K4hat_phi(2:Ny,:))) + + ! 
Get temperature + T(2:Ny,:) = T(2:Ny,:) + dt*(b(1)*(K1_T(2:Ny,:) + K2hat_T(2:Ny,:)) + & + & b(2)*(K2_T(2:Ny,:) + K3hat_T(2:Ny,:)) + & + & b(3)*(K3_T(2:Ny,:) + K4hat_T(2:Ny,:))) + else if (proc_id == num_procs - 1) then + ! Get phi + phi(1:Ny-1,:) = phi(1:Ny-1,:) + dt*(b(1)*(K1_phi(1:Ny-1,:) + K2hat_phi(1:Ny-1,:)) + & + & b(2)*(K2_phi(1:Ny-1,:) + K3hat_phi(1:Ny-1,:)) + & + & b(3)*(K3_phi(1:Ny-1,:) + K4hat_phi(1:Ny-1,:))) + + ! Get temperature + T(1:Ny-1,:) = T(1:Ny-1,:) + dt*(b(1)*(K1_T(1:Ny-1,:) + K2hat_T(1:Ny-1,:)) + & + & b(2)*(K2_T(1:Ny-1,:) + K3hat_T(1:Ny-1,:)) + & + & b(3)*(K3_T(1:Ny-1,:) + K4hat_T(1:Ny-1,:))) + else + ! Get phi + phi(1:Ny,:) = phi(1:Ny,:) + dt*(b(1)*(K1_phi(1:Ny,:) + K2hat_phi(1:Ny,:)) + & + & b(2)*(K2_phi(1:Ny,:) + K3hat_phi(1:Ny,:)) + & + & b(3)*(K3_phi(1:Ny,:) + K4hat_phi(1:Ny,:))) + + ! Get temperature + T(1:Ny,:) = T(1:Ny,:) + dt*(b(1)*(K1_T(1:Ny,:) + K2hat_T(1:Ny,:)) + & + & b(2)*(K2_T(1:Ny,:) + K3hat_T(1:Ny,:)) + & + & b(3)*(K3_T(1:Ny,:) + K4hat_T(1:Ny,:))) + end if + + ! Get ux and uy + if (proc_id == 0) then + ! Receive g1,g2,g3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(g1_total(stind), Ny, MPI_DOUBLE, otherproc, 80, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g2_total(stind), Ny, MPI_DOUBLE, otherproc, 81, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(g3_total(stind), Ny, MPI_DOUBLE, otherproc, 82, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + g1_total(1:Ny) = g1 + g2_total(1:Ny) = g2 + g3_total(1:Ny) = g3 + + ! Receive h1,h2,h3 from all other nodes. + do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_RECV(h1_total(stind), Ny, MPI_DOUBLE, otherproc, 83, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h2_total(stind), Ny, MPI_DOUBLE, otherproc, 84, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(h3_total(stind), Ny, MPI_DOUBLE, otherproc, 85, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + h1_total(1:Ny) = h1 + h2_total(1:Ny) = h2 + h3_total(1:Ny) = h3 + else + ! Send g1, g2, g3 to first node. + call MPI_SEND(g1(1), Ny, MPI_DOUBLE, 0, 80, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g2(1), Ny, MPI_DOUBLE, 0, 81, MPI_COMM_WORLD, mpierror) + call MPI_SEND(g3(1), Ny, MPI_DOUBLE, 0, 82, MPI_COMM_WORLD, mpierror) + + ! Send h1, h2, h3 to first node. + call MPI_SEND(h1(1), Ny, MPI_DOUBLE, 0, 83, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h2(1), Ny, MPI_DOUBLE, 0, 84, MPI_COMM_WORLD, mpierror) + call MPI_SEND(h3(1), Ny, MPI_DOUBLE, 0, 85, MPI_COMM_WORLD, mpierror) + end if + !$OMP PARALLEL DO private(tmp_uy, it) schedule(dynamic) + do it = 1,Nx + ! Only call from single node. + if (proc_id == 0) then + + do otherproc= 1, num_procs-1 + stind = otherproc * Ny + 1 + ! Receive phi + call MPI_RECV(phi_update_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, & + otherproc, 86, MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end do + ! Set local phi + phi_update_MPI(1:Ny) = phi(1:Ny,it) + + ! Solve for v + call calc_vi_mod_MPI(tmp_uy_MPI, phi_update_MPI, kx(it), total_ny, g1_total, g2_total, g3_total) + ! Solve for u + if (kx(it) /= 0.0_dp) then + uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it) + else if (kx(it) == 0.0_dp) then + uxi_MPI = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + + ! Send uy, ux back to each node. 
+ do otherproc = 1,num_procs-1 + stind = otherproc * Ny + 1 + call MPI_SEND(tmp_uy_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 87, MPI_COMM_WORLD, mpierror) + call MPI_SEND(uxi_MPI(stind), Ny, MPI_C_DOUBLE_COMPLEX, otherproc, 88, MPI_COMM_WORLD, mpierror) + end do + ! Set local versions. + uy(1:Ny,it) = tmp_uy_MPI(1:Ny) + ux(1:Ny,it) = uxi_MPI(1:Ny) + + else + ! Send phi + call MPI_SEND(phi(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 86, MPI_COMM_WORLD, mpierror) + + ! Receive uy, ux from main node. + call MPI_RECV(uy(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 87,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + call MPI_RECV(ux(1:Ny,it), Ny, MPI_C_DOUBLE_COMPLEX, 0, 88,& + MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierror) + end if + end do + !$OMP END PARALLEL DO + finish = OMP_GET_WTIME() + write(*,*) " - update sols timing: ", finish-start, "(s)" + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + + ! open(unit=9010, file="P"//proc_id_str//"uy_real_update.txt", action="write", status="unknown") + ! open(unit=9011, file="P"//proc_id_str//"uy_im_update.txt", action="write", status="unknown") + ! do i=1,Ny + ! do j=1,Nx + ! write (9010,*) REAL(uy(i,j)) + ! write (9011,*) AIMAG(uy(i,j)) + ! end do + ! end do + ! close(unit=9010) + ! close(unit=9011) + ! write(*,*) "done writing uy!" + + if (time == t_final) then + open(unit=9010, file="P"//proc_id_str//"T_real_update.txt", action="write", status="unknown") + open(unit=9011, file="P"//proc_id_str//"T_im_update.txt", action="write", status="unknown") + do i=1,Ny + do j=1,Nx + write (9010,*) REAL(T(i,j)) + write (9011,*) AIMAG(T(i,j)) + end do + end do + close(unit=9010) + close(unit=9011) + write(*,*) "done writing T!" + exit + end if + + if (wvtk) then + call write_to_vtk(nti, .false., proc_id_str) ! false = Fourier space + end if + + call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + write(*,*) "proc ", proc_id_str, " write vtk complete for t", time + + end do + +end subroutine imex_rk_MPI + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +subroutine calc_explicit_MPI(stage, proc_id, num_procs, proc_id_str) + +integer :: i, j +integer, intent(in) :: stage, proc_id, num_procs +character(3), intent(in) :: proc_id_str +real(dp) :: start, finish + +start = OMP_GET_WTIME() +select case(stage) + case (1) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K1hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO + case (2) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K2hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO + case (3) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K3hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO + case (4) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + K4hat_phi(:,i) = -kx(i)**2.0_dp*Ti(:,i) + end do + !$OMP END PARALLEL DO +end select +finish = OMP_GET_WTIME() +write(*,*) " - - l1 timing: ", finish-start, "(s)" + +start = OMP_GET_WTIME() +!$OMP PARALLEL DO schedule(dynamic) +do i=1,Nx + ! Compute dx(T) in Fourier space + nlT (:,i) = kx(i)*Ti(:,i) + ! Compute D2(ux) + nlphi(:,i) = -kx(i)**2.0_dp*uxi(:,i) + d2y_MPI2(uxi(:,i), proc_id, num_procs) +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() + +write(*,*) " - - l2 timing: ", finish-start, "(s)" + +!nlT = -CI*nlT +nlT = CI*nlT + +start = OMP_GET_WTIME() +!$OMP PARALLEL DO private(tnlT, tnlphi, tT, tux, tuy, tphi) schedule(dynamic) +do j = 1,Ny + ! 
Bring everything to physical space + tnlT = nlT(j,:) + tnlphi = nlphi(j,:) + tT = Ti(j,:) + tux = uxi(j,:) + tuy = uyi(j,:) + tphi = phii(j,:) + call fftw_execute_dft(iplannlT, tnlT, tnlT) + call fftw_execute_dft(iplannlphi, tnlphi, tnlphi) + call fftw_execute_dft(iplanT, tT, tT) + call fftw_execute_dft(iplanux, tux, tux) + call fftw_execute_dft(iplanuy, tuy, tuy) + call fftw_execute_dft(iplanphi, tphi, tphi) + nlT(j,:) = tnlT + nlphi(j,:) = tnlphi + Ti(j,:) = tT + uxi(j,:) = tux + uyi(j,:) = tuy + phii(j,:) = tphi +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() +write(*,*) " - - l3 timing: ", finish-start, "(s)" + +! Calculate nonlinear term +start = OMP_GET_WTIME() +!$OMP PARALLEL DO private(tmp_T) schedule(dynamic) +do i = 1,Nx + ! Temperature + tmp_T = Ti(:,i) + nlT(:,i) = uxi(:,i)*nlT(:,i) + uyi(:,i)*d1y_MPI2(tmp_T, proc_id, num_procs) + ! phi + nlphi(:,i) = uxi(:,i)*phii(:,i) - uyi(:,i)*nlphi(:,i) +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() +write(*,*) " - - l4 timing: ", finish-start, "(s)" + +! Bring nonlinear terms back to Fourier space +start = OMP_GET_WTIME() +!$OMP PARALLEL DO private(tnlT, tnlphi) schedule(dynamic) +do j = 1,Ny + tnlT = nlT(j,:) + tnlphi = nlphi(j,:) + call fftw_execute_dft(plannlT, tnlT, tnlT) + call fftw_execute_dft(plannlphi, tnlphi, tnlphi) + ! Dealias + do i = 1,Nx + if (abs(kx(i))/alpha >= Nf/2) then + tnlT(i) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + tnlphi(i) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + end if + end do + nlT(j,:) = tnlT + nlphi(j,:) = tnlphi +end do +!$OMP END PARALLEL DO +finish = OMP_GET_WTIME() +write(*,*) " - - l5 timing: ", finish-start, "(s)" + +nlT = nlT / real(Nx,kind=dp) +nlphi = nlphi / real(Nx,kind=dp) + +start = OMP_GET_WTIME() +select case (stage) + case (1) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K1hat_phi(:,i) = K1hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K1hat_phi(:,i) = K1hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K1hat_T = -nlT + case (2) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K2hat_phi(:,i) = K2hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K2hat_phi(:,i) = K2hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K2hat_T = -nlT + case (3) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K3hat_phi(:,i) = K3hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K3hat_phi(:,i) = K3hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K3hat_T = -nlT + case (4) + !$OMP PARALLEL DO schedule(dynamic) + do i = 1,Nx + !K4hat_phi(:,i) = K4hat_phi(:,i) + CI*kx(i)*nlphi(:,i) + K4hat_phi(:,i) = K4hat_phi(:,i) - CI*kx(i)*nlphi(:,i) + end do + !$OMP END PARALLEL DO + K4hat_T = -nlT +end select +finish = OMP_GET_WTIME() +write(*,*) " - - l6 timing: ", finish-start, "(s)" + +end subroutine calc_explicit_MPI + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! 
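A note on the communication pattern used throughout imex_rk_MPI and this module: rank 0 assembles full-height columns by posting one MPI_RECV per array per rank (offsets stind = otherproc * Ny + 1, or off by one where the wall points are dropped), and later scatters the results back with matching MPI_SEND calls. The fragment below is a minimal, standalone sketch of that slab layout, not part of the diff and not the code's method: the program name, the illustrative Ny = 4, the Fortran datatype MPI_DOUBLE_PRECISION, and the use of the collective MPI_GATHER in place of the hand-rolled send/receive pairs are all my additions, shown only to make the rank-ordered data layout explicit.

program gather_demo
! Minimal sketch: each rank owns Ny rows of a column; rank 0 assembles the
! full column in rank order, mirroring the stind offsets used above.
use mpi
implicit none
integer, parameter :: dp = kind(1.0d0)
integer, parameter :: Ny = 4          ! rows per rank (illustrative value)
real(dp) :: local_col(Ny)
real(dp), allocatable :: full_col(:)
integer :: mpierror, proc_id, num_procs, j

call MPI_Init(mpierror)
call MPI_Comm_rank(MPI_COMM_WORLD, proc_id, mpierror)
call MPI_Comm_size(MPI_COMM_WORLD, num_procs, mpierror)

! Fill the local slab with globally increasing values so the gathered
! ordering is easy to check.
do j = 1, Ny
   local_col(j) = real(proc_id*Ny + j, dp)
end do

allocate(full_col(Ny*num_procs))

! Rank 0 receives the slabs concatenated in rank order.
call MPI_Gather(local_col, Ny, MPI_DOUBLE_PRECISION, &
                full_col, Ny, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpierror)

if (proc_id == 0) write(*,*) full_col

call MPI_Finalize(mpierror)
end program gather_demo

Run with any number of ranks; rank 0 prints 1 through Ny*num_procs in order, which is the same ordering the per-rank receives above reproduce by hand.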
+subroutine calc_vari_mod_MPI(phiout,Tout, aii, stage, total_ny, kx_it, phi_in, & + k1hat_phi_in, k2hat_phi_in, k3hat_phi_in,& + k1hat_T_in, k2hat_T_in, k3hat_T_in,& + k1_phi_in, k2_phi_in, k1_T_in, k2_T_in,& + g1_total, g2_total, g3_total,& + T_in) + +real(dp), intent(in) :: aii +integer, intent(in) :: stage, total_ny +real(dp), intent(in) :: kx_it +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1hat_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2hat_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k3hat_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1hat_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2hat_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k3hat_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny), intent(in) :: T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2_phi_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k1_T_in +complex(C_DOUBLE_COMPLEX), dimension(total_ny-2), intent(in) :: k2_T_in +real(dp), dimension(total_ny), intent(in) :: g1_total, g2_total, g3_total +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:), intent(out) :: phiout, Tout +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: Fphi, FT +real(dp), allocatable, dimension(:,:) :: phi_rhs, T_rhs +real(dp), allocatable, dimension(:) :: dphi, duphi, dlphi +real(dp), allocatable, dimension(:) :: ddT, duT, dlT +integer :: i + +allocate(dphi(total_ny-2), ddT(total_ny-2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(duphi(total_ny-3), dlphi(total_ny-3), duT(total_ny-3), dlT(total_ny-3), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(phiout(total_ny), Tout(total_ny), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(Fphi(total_ny-2), FT(total_ny-2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(phi_rhs(total_ny-2,2), T_rhs(total_ny-2,2), stat=alloc_err) +call check_alloc_err(alloc_err) + +dphi = 0.0_dp +ddT = 0.0_dp +duphi = 0.0_dp +dlphi = 0.0_dp +duT = 0.0_dp +dlT = 0.0_dp +phi_rhs = 0.0_dp +T_rhs = 0.0_dp +Fphi = (0.0_dp, 0.0_dp) +FT = (0.0_dp, 0.0_dp) +phiout = (0.0_dp, 0.0_dp) +Tout = (0.0_dp, 0.0_dp) + +! LHS Matrix (tridiagonal, not necessarily symmetric) +do jt = 2,total_ny-1 + ddT (jt-1) = 1.0_dp - kappa0*dt*aii*(-kx_it**2.0_dp + g2_total(jt)) + dphi(jt-1) = 1.0_dp - nu0 *dt*aii*(-kx_it**2.0_dp + g2_total(jt)) +end do + +do jt = 2,total_ny-2 + duT (jt-1) = -kappa0*dt*g3_total(jt)*aii + duphi(jt-1) = -nu0 *dt*g3_total(jt)*aii +end do + +do jt = 3,total_ny-1 + dlT (jt-2) = -kappa0*dt*g1_total(jt)*aii + dlphi(jt-2) = -nu0 *dt*g1_total(jt)*aii +end do + +select case (stage) + case(1) + Fphi = phi_in + dt*ahatcoeffs(2,1)*k1hat_phi_in + FT = T_in(2:total_ny-1) + dt*ahatcoeffs(2,1)*k1hat_T_in + FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) + FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + + phi_rhs(:,1) = real(Fphi) + phi_rhs(:,2) = aimag(Fphi) + + T_rhs (:,1) = real(FT) + T_rhs (:,2) = aimag(FT) + + call dgtsv(total_ny-2, 2, dlT, ddT, duT, T_rhs, total_ny-2, info) + Tout(2:total_ny-1) = cmplx(T_rhs(:,1), T_rhs(:,2), kind=C_DOUBLE_COMPLEX) + ! 
Set temperature boundary conditions + Tout(1) = T_in(1) + Tout(total_ny) = T_in(total_ny) + + call dgtsv(total_ny-2, 2, dlphi, dphi, duphi, phi_rhs, total_ny-2, info) + phiout(2:total_ny-1) = cmplx(phi_rhs(:,1), phi_rhs(:,2), kind=C_DOUBLE_COMPLEX) + + case(2) + Fphi = phi_in + dt*(acoeffs(2,1)*k1_phi_in + & + & ahatcoeffs(3,1)*k1hat_phi_in + & + & ahatcoeffs(3,2)*k2hat_phi_in) + FT = T_in(2:total_ny-1) + dt*(acoeffs(2,1)*k1_T_in + & + & ahatcoeffs(3,1)*k1hat_T_in + & + & ahatcoeffs(3,2)*k2hat_T_in) + FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) + FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + + ! do i = 1,total_ny-2 + ! write(*,*) i, k2hat_phi_in(i) + ! end do + + phi_rhs(:,1) = real(Fphi) + phi_rhs(:,2) = aimag(Fphi) + + T_rhs (:,1) = real(FT) + T_rhs (:,2) = aimag(FT) + + call dgtsv(total_ny-2, 2, dlT, ddT, duT, T_rhs, total_ny-2, info) + Tout(2:total_ny-1) = cmplx(T_rhs(:,1), T_rhs(:,2), kind=C_DOUBLE_COMPLEX) + ! Set temperature boundary conditions + Tout(1) = T_in(1) + Tout(total_ny) = T_in(total_ny) + + call dgtsv(total_ny-2, 2, dlphi, dphi, duphi, phi_rhs, total_ny-2, info) + phiout(2:total_ny-1) = cmplx(phi_rhs(:,1), phi_rhs(:,2), kind=C_DOUBLE_COMPLEX) + + case(3) + Fphi = phi_in + dt*(acoeffs(3,1)*k1_phi_in + & + &acoeffs(3,2)*k2_phi_in + & + &ahatcoeffs(4,1)*k1hat_phi_in + & + &ahatcoeffs(4,2)*k2hat_phi_in + & + &ahatcoeffs(4,3)*k3hat_phi_in) + FT = T_in(2:total_ny-1) + dt*(acoeffs(3,1)*k1_T_in + & + &acoeffs(3,2)*k2_T_in + & + &ahatcoeffs(4,1)*k1hat_T_in + & + &ahatcoeffs(4,2)*k2hat_T_in + & + &ahatcoeffs(4,3)*k3hat_T_in) + FT(1) = FT(1) + kappa0*dt*aii*g1_total(2)*T_in(1) ! b/c Ti(y_1) = T(y_1) + FT(total_ny-2) = FT(total_ny-2) + kappa0*dt*aii*g3_total(total_ny-1)*T_in(total_ny) ! b/c Ti(total_ny) = T(total_ny) + + phi_rhs(:,1) = real(Fphi) + phi_rhs(:,2) = aimag(Fphi) + + T_rhs (:,1) = real(FT) + T_rhs (:,2) = aimag(FT) + + call dgtsv(total_ny-2, 2, dlT, ddT, duT, T_rhs, total_ny-2, info) + Tout(2:total_ny-1) = cmplx(T_rhs(:,1), T_rhs(:,2), kind=C_DOUBLE_COMPLEX) + ! Set temperature boundary conditions + Tout(1) = T_in(1) + Tout(total_ny) = T_in(total_ny) + + call dgtsv(total_ny-2, 2, dlphi, dphi, duphi, phi_rhs, total_ny-2, info) + phiout(2:total_ny-1) = cmplx(phi_rhs(:,1), phi_rhs(:,2), kind=C_DOUBLE_COMPLEX) + +end select + +end subroutine calc_vari_mod_MPI +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! 
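The solve pattern in calc_vari_mod_MPI (and in calc_vi_mod_MPI below) is worth spelling out: the unknowns are complex, but the tridiagonal matrices are real, so the real LAPACK solver dgtsv is called once with the real and imaginary parts of the right-hand side stacked as two columns. The fragment below is a minimal, standalone sketch of that trick, not part of the diff; the program name, the size n = 5, and the matrix values are illustrative, and it assumes LAPACK is available at link time. Note also that dgtsv factors in place and overwrites its diagonals, which is why the module rebuilds d, dl, and du before every solve.

program tridiag_complex_demo
! Minimal sketch of solving a complex tridiagonal system A*x = b with the
! real solver dgtsv: since A is real, A*Re(x) = Re(b) and A*Im(x) = Im(b),
! so both parts are passed as two right-hand-side columns.
implicit none
integer, parameter :: dp = kind(1.0d0)
integer, parameter :: n = 5
real(dp)    :: dl(n-1), d(n), du(n-1), rhs(n,2)
complex(dp) :: b(n), x(n)
integer     :: j, info

! A simple diagonally dominant tridiagonal matrix.
dl = -1.0_dp
du = -1.0_dp
d  =  4.0_dp

! An arbitrary complex right-hand side.
do j = 1, n
   b(j) = cmplx(real(j, dp), -real(j, dp), kind=dp)
end do

rhs(:,1) = real(b)
rhs(:,2) = aimag(b)

! dgtsv overwrites dl, d, du and returns the two solution columns in rhs.
call dgtsv(n, 2, dl, d, du, rhs, n, info)
if (info /= 0) stop "dgtsv failed"

x = cmplx(rhs(:,1), rhs(:,2), kind=dp)
write(*,*) x
end program tridiag_complex_demo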
+ +subroutine calc_vi_mod_MPI(vi, phiin, kx_it, total_ny, g1_total, g2_total, g3_total) +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: phiin +complex(C_DOUBLE_COMPLEX), dimension(:), intent(out) :: vi +real(dp), intent(in) :: kx_it +real(dp), allocatable, dimension(:,:) :: vi_rhs +real(dp), allocatable, dimension(:) :: dvi, dlvi, duvi +integer :: j +integer, intent(in) :: total_ny +real(dp), dimension(:), intent(in) :: g1_total, g2_total, g3_total + +allocate(vi_rhs(total_ny-2,2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(dvi(total_ny-2), stat=alloc_err) +call check_alloc_err(alloc_err) +allocate(dlvi(total_ny-3), duvi(total_ny-3), stat=alloc_err) +call check_alloc_err(alloc_err) + +vi = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +vi_rhs = 0.0_dp +dvi = 0.0_dp +dlvi = 0.0_dp +duvi = 0.0_dp + +do j = 2,total_ny-1 + dvi(j-1) = -kx_it**2.0_dp + g2_total(j) +end do + +do j = 2,total_ny-2 + duvi(j-1) = g3_total(j) +end do + +do j = 3,total_ny-1 + dlvi(j-2) = g1_total(j) +end do + +vi_rhs(:,1) = real (phiin(2:total_ny-1)) +vi_rhs(:,2) = aimag(phiin(2:total_ny-1)) + +call dgtsv(total_ny-2, 2, dlvi, dvi, duvi, vi_rhs, total_ny-2, info) + +vi(1) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +vi(2:total_ny-1) = cmplx(vi_rhs(:,1), vi_rhs(:,2), kind=C_DOUBLE_COMPLEX) +vi(total_ny) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + +end subroutine calc_vi_mod_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine update_bcs_mod_MPI(phiout,vout, phiin,vin,dyv1_T_it,dyv2_T_it,dyv1_B_it,& + dyv2_B_it,V1_in, V2_in, phi1_in, phi2_in,& + total_ny, h1_end, h2_end, h3_end) + +integer, intent(in) :: total_ny +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: phiin, vin +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:), intent(out) :: phiout, vout +real(dp), dimension(2,2) :: C +real(dp) :: detC +complex(dp) :: c1, c2, c1t +complex(dp) :: dyV_T, dyV_B +real(dp), intent(in) :: dyv1_T_it,dyv2_T_it,dyv1_B_it,dyv2_B_it +real(dp), dimension(:), intent(in) :: V1_in, V2_in, phi1_in, phi2_in +real(dp), intent(in) :: h1_end, h2_end, h3_end +allocate(phiout(total_ny), vout(total_ny), stat=alloc_err) +call check_alloc_err(alloc_err) +phiout = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +vout = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +C = 0.0_dp + +C(1,1) = dyv1_T_it +C(1,2) = dyv2_T_it +C(2,1) = dyv1_B_it +C(2,2) = dyv2_B_it + +detC = C(1,1)*C(2,2) - C(1,2)*C(2,1) + +dyV_T = h1_end*vin(total_ny-2) + h2_end*vin(total_ny-1) + h3_end*vin(total_ny) +dyV_B = h1(1)*vin(1) + h2(1)*vin(2) + h3(1)*vin(3) + +! Need to negate b/c want to solve Cx = -c12. +c1 = -dyV_T +c2 = -dyV_B + +! Find c1 and c2. +if (detC == 0.0_dp) then + c1 = (0.0_dp, 0.0_dp) + c2 = (0.0_dp, 0.0_dp) +else + c1t = (C(2,2)*c1 - C(1,2)*c2) / detC + c2 = (C(1,1)*c2 - C(2,1)*c1) / detC + c1 = c1t +end if + +! Update uy and Phi. 
+vout = vin + c1*V1_in + c2*V2_in +phiout = phiin + c1*phi1_in + c2*phi2_in + +end subroutine update_bcs_mod_MPI + +!::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +subroutine calc_implicit_mod_MPI(Kphi,KT, phiin,Tin, kx_it, total_ny, dynu_in, g1_total, g2_total, g3_total) + +complex(C_DOUBLE_COMPLEX), dimension(:), intent(in) :: phiin, Tin +complex(C_DOUBLE_COMPLEX), dimension(:), intent(out) :: Kphi, KT +real(dp), intent(in) :: kx_it +integer, intent(in) :: total_ny +real(dp), dimension(:), intent(in) :: dynu_in, g1_total, g2_total, g3_total + +Kphi = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) +KT = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + +Kphi = nu0 *(-kx_it**2.0_dp*phiin + d2y_MPI(phiin, dynu_in, g1_total, g2_total, g3_total, total_ny)) +KT = kappa0*(-kx_it**2.0_dp*Tin + d2y_MPI(Tin, dynu_in, g1_total, g2_total, g3_total, total_ny)) + +end subroutine calc_implicit_mod_MPI + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::! + +end module time_integrators_MPI + \ No newline at end of file diff --git a/time_loop_MPI.f90 b/time_loop_MPI.f90 new file mode 100644 index 0000000..5201f65 --- /dev/null +++ b/time_loop_MPI.f90 @@ -0,0 +1,334 @@ +program time_loop + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +use global +use strings +use write_pack +use interpolation_pack +use allocate_vars +use statistics +use mesh_pack +use time_integrators_MPI + +! include 'mpif.h' +implicit none + +integer :: ntokens, ios, i +integer :: Nxc, Nyc, Nzc +integer :: nRa, ii, jj +integer :: refine_flag_x, refine_flag_y, refine_flag_z +integer :: vtk_flag, rstrt_flag, opt_flag +logical :: wvtk, save_restart, calc_opt +logical :: refine_x, refine_y, refine_z, refine_xy +logical :: fTexist, fuyexist +logical :: ex_Tptrb +real(dp) :: temp +real(dp) :: dxc +real(dp) :: Ra, dRa, Nu, dalpha +real(dp), allocatable, dimension(:) :: xpc, ypc, zpc +complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: uyc, Tc +character(10) :: citer +character(9) :: fiter +character(80) :: line +character(80) :: tokens(80) + +! MPI specific variables +integer :: mpierror, num_procs +integer :: proc_id +character(4) :: proc_id_str +real(dp) :: mpi_spacing_y +integer status(MPI_STATUS_SIZE) + + +! Initialize MPI. +call MPI_Init(mpierror) +call MPI_Comm_size(MPI_COMM_WORLD, num_procs, mpierror) +call MPI_Comm_rank(MPI_COMM_WORLD, proc_id, mpierror) + +! Read file. +open(2,file="input.data", status="old") +do ii = 1,7 +read(2,'(A)') line +call parse(line,' ,', tokens, ntokens) +if (ntokens > 0) then + select case(ii) + case (1) + call value(tokens(1), Pr, ios) + call value(tokens(2), alpha, ios) + call value(tokens(3), dalpha, ios) + case (2) + call value(tokens(1), Ra, ios) + call value(tokens(2), nRa, ios) + call value(tokens(3), dRa, ios) + case (3) + call value(tokens(1), t_final, ios) + call value(tokens(2), dt, ios) + dt_init = dt + case (4) + call value(tokens(1), ybot, ios) + call value(tokens(2), ytop, ios) + case (5) + call value(tokens(1), Nx, ios) + call value(tokens(2), Ny, ios) + call value(tokens(3), Nz, ios) + case (6) + call value(tokens(1), refine_flag_x, ios) + call value(tokens(2), refine_flag_y, ios) + call value(tokens(3), refine_flag_z, ios) + if (refine_flag_x == 1) then + if (refine_flag_y == 1) then + refine_xy = .true. + refine_x = .false. + refine_y = .false. + else + refine_x = .true. + refine_y = .false. + refine_xy = .false. 
+ end if
+ else if (refine_flag_y == 1) then
+ refine_x = .false.
+ refine_y = .true.
+ refine_xy = .false.
+ else if (refine_flag_z == 1) then
+ write(*,*) "Refinement in z-direction not available yet!"
+ stop
+ else
+ refine_x = .false.
+ refine_y = .false.
+ refine_z = .false.
+ refine_xy = .false.
+ end if
+ case (7)
+ call value(tokens(1), vtk_flag, ios)
+ call value(tokens(2), rstrt_flag, ios)
+ call value(tokens(3), opt_flag, ios)
+ if (vtk_flag == 1) then
+ wvtk = .true.
+ else
+ wvtk = .false.
+ end if
+ if (rstrt_flag == 1) then
+ save_restart = .true.
+ else
+ save_restart = .false.
+ end if
+ if (opt_flag == 1) then
+ calc_opt = .true.
+ else
+ calc_opt = .false.
+ end if
+ end select
+end if
+end do
+close(unit=2)
+
+! Initialize MPI variables.
+Ny = Ny / 4
+mpi_spacing_y = (ytop - ybot) / num_procs
+ybot = ybot + proc_id * mpi_spacing_y
+ytop = ybot + mpi_spacing_y
+write(proc_id_str, "(I3.3)") proc_id
+
+! Create FFT plans
+planuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_FORWARD,FFTW_ESTIMATE)
+iplanuy = fftw_plan_dft_1d(Nx,tuy,tuy, FFTW_BACKWARD,FFTW_ESTIMATE)
+
+planux = fftw_plan_dft_1d(Nx,tux,tux, FFTW_FORWARD,FFTW_ESTIMATE)
+iplanux = fftw_plan_dft_1d(Nx,tux,tux, FFTW_BACKWARD,FFTW_ESTIMATE)
+
+planphi = fftw_plan_dft_1d(Nx,tphi,tphi, FFTW_FORWARD,FFTW_ESTIMATE)
+iplanphi = fftw_plan_dft_1d(Nx,tphi,tphi, FFTW_BACKWARD,FFTW_ESTIMATE)
+
+planT = fftw_plan_dft_1d(Nx,tT,tT, FFTW_FORWARD,FFTW_ESTIMATE)
+iplanT = fftw_plan_dft_1d(Nx,tT,tT, FFTW_BACKWARD,FFTW_ESTIMATE)
+
+plannlT = fftw_plan_dft_1d(Nx,tnlT,tnlT, FFTW_FORWARD,FFTW_ESTIMATE)
+iplannlT = fftw_plan_dft_1d(Nx,tnlT,tnlT, FFTW_BACKWARD,FFTW_ESTIMATE)
+
+plannlphi = fftw_plan_dft_1d(Nx,tnlphi,tnlphi, FFTW_FORWARD,FFTW_ESTIMATE)
+iplannlphi = fftw_plan_dft_1d(Nx,tnlphi,tnlphi, FFTW_BACKWARD,FFTW_ESTIMATE)
+
+call global_params
+call global_allocations
+
+xR = pi / alpha
+xL = -pi / alpha
+Lx = xR - xL
+
+if (Nx == 1) then
+ dx = 1.0_dp
+else
+ dx = Lx / (real(Nx,kind=dp))
+end if
+
+call cosine_mesh_MPI(xp,yp,zp, Nx,Ny,Nz, proc_id, num_procs) ! get coordinates
+call dxdydz_MPI(dynu, xp,yp,zp, proc_id, num_procs) ! get mesh spacing for nonuniform grids
+call y_mesh_params_MPI(proc_id, num_procs) ! get metric coefficients for nonuniform grids
+
+dymin = minval(dynu)
+dxmin = sqrt(dx**2.0_dp + dymin**2.0_dp)
+
+! Fourier modes
+do ii = 1,Nx/2
+ kx_modes(ii) = real(ii,kind=dp) - 1.0_dp
+end do
+do ii = Nx/2+1, Nx
+ kx_modes(ii) = real(ii-Nx,kind=dp) - 1.0_dp
+end do
+kx = alpha*kx_modes
+
+! Write initial field to vtk
+if (wvtk) then
+ call write_to_vtk(int(Ra), .true., proc_id_str) ! true = already in physical space
+end if
+
+! Initialize fields.
+call init_fields(ex_Tptrb, Ra)
+call init_to_fourier(ex_Tptrb)
+
+!
Get nu0 and kappa0 +call global_params_Ra(Ra) + +if (proc_id == 0) then + write(*,'(A70)') ' ' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') ' TWO-DIMENSIONAL THERMAL CONVECTION |' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,47X,A)') 'COMPUTATION SIZE:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(20X,A23,I5,21X,A)') 'Nx = ', Nx, '|' + write(*,'(20X,A23,I5,21X,A)') 'Ny = ', Ny, '|' + write(*,'(20X,A23,I5,21X,A)') 'Nz = ', Nz, '|' + write(*,'(20X,A23,I5,21X,A)') 'Number of time steps = ', nt, '|' + write(*,'(20X,A23,I5,21X,A)') 'Number of Fourier modes = ', Nf, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,45X,A)') 'PROBLEM PARAMETERS:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Prandtl number (Pr) = ', Pr, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Rayleigh number (Ra) = ', Ra, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Initial Reynolds number (Re) = ', sqrt(Ra/(16.0_dp*Pr)), '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(5X,A,42X,A)') 'PHYSICAL PROBLEM SIZE:', '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'alpha = ', alpha, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Left coordinate of box = ', xL, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Right coordinate of box = ', xR, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of bottom wall = ', ybot, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Coordinate of top wall = ', ytop, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Box width = ', Lx, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Box height = ', Ly, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Aspect Ratio = ', Lx/Ly, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Integration time (T) = ', t_final, '|' + write(*,'(10X,A32,ES16.8,11X,A)') 'Time step size (Delta t) = ', dt, '|' + write(*,'(A69,A)') ' ','|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') '*********************************************************************|' + write(*,'(A70)') ' ' + + flush(6) + ! Print MPI division. + write(*,*) "Ny = ", Ny*num_procs, " divided among ", num_procs, " processors -> ", & + Ny, " rows per processor." +end if + +call MPI_BARRIER(MPI_COMM_WORLD, mpierror) + +write(*,*) "processor ", proc_id, "initialized with ", Ny, "rows." + +! Get solution with time integration +call imex_rk_MPI(proc_id_str, 1, .true., proc_id, num_procs) ! true causes writing of nusselt number. + +call MPI_Finalize(mpierror) + +end program time_loop + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine init_fields(ex_Tptrb,Ra) + +use global + +implicit none + +logical, intent(out) :: ex_Tptrb +real(dp), intent(in) :: Ra +integer :: ii + +! Initialize fields. +if (Ra < 1710.0_dp) then + ex_Tptrb = .true. + do ii = 1,Nx + Tptrb(:,ii) = 0.5_dp*2.0_dp*cos(alpha*xp(ii))*cos(pi*yp/2.0_dp) + T(:,ii) = -yp + Tptrb(:,ii) + end do +else + ex_Tptrb = .false. 
+ do ii = 1,Nx + T(:,ii) = -yp + 0.5_dp*2.0_dp*cos(alpha*xp(ii))*cos(pi*yp/2.0_dp) + end do +end if + +end subroutine init_fields + +!:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: +subroutine init_to_fourier(ex_Tptrb) + +use global + +implicit none + +logical, intent(in) :: ex_Tptrb +integer :: ii, jj + +! Bring temperature and velocity to Fourier space. +do jj = 1,Ny + tT = T(jj,:) + tuy = uy(jj,:) + call fftw_execute_dft(planT, tT, tT) + call fftw_execute_dft(planuy, tuy, tuy) + ! Truncate modes + do ii = 1,Nx + if (abs(kx(ii))/alpha >= Nf/2) then + tT(ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + tuy(ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + end if + end do + T(jj,:) = tT + uy(jj,:) = tuy + ! If temperature perturbation needed. + if (ex_Tptrb) then + tT = Tptrb(jj,:) + call fftw_execute_dft(planT, tT, tT) + ! Truncate modes + do ii = 1,Nx + if (abs(kx(ii))/alpha >= Nf/2) then + tT(ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) + end if + end do + Tptrb(jj,:) = tT + end if +end do +T = T / real(Nx, kind=dp) +uy = uy / real(Nx, kind=dp) + +if (ex_Tptrb) then + Tptrb = Tptrb / real(Nx, kind=dp) +end if + +! Calculate phi and ux from uy +do ii = 1,Nx + if (kx(ii) /= 0.0_dp) then + tmp_uy = uy(:,ii) + !ux(:,ii) = -CI*d1y(tmp_uy)/kx(ii) + ux(:,ii) = CI*d1y(tmp_uy)/kx(ii) + else if (kx(ii) == 0.0_dp) then + ux(:,ii) = cmplx(0.0_dp, 0.0_dp, kind=C_DOUBLE_COMPLEX) ! Zero mean flow! + end if + phi(:,ii) = -kx(ii)**2.0_dp*uy(:,ii) + d2y(uy(:,ii)) +end do + +end subroutine init_to_fourier \ No newline at end of file diff --git a/write_pack.f90 b/write_pack.f90 index 08f53d4..dd1311d 100755 --- a/write_pack.f90 +++ b/write_pack.f90 @@ -88,7 +88,7 @@ end subroutine write_mat_cmplx !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -subroutine write_vtk_structured_grid(iter) +subroutine write_vtk_structured_grid(iter, process_id) use global @@ -99,13 +99,17 @@ subroutine write_vtk_structured_grid(iter) character(9) :: fiter integer :: n integer :: ii, jj, kk +character(3), intent(in), optional :: process_id write(citer, "(I10)") 1000000000 + iter fiter = citer(2:10) n = (Nx+1)*Ny*Nz - -open(unit=2, file=vtkloc//vtkname//fiter//gtype, action="write", status="replace") +if (PRESENT(process_id)) then + open(unit=2, file=vtkloc//vtkname//"_P"//process_id//"_"//fiter//gtype, action="write", status="replace") +else + open(unit=2, file=vtkloc//vtkname//fiter//gtype, action="write", status="replace") +endif write(2, "(a)") "# vtk DataFile Version 3.1" write(2, "(a)") "Thermal Convection Data" write(2, "(a)") "ASCII" @@ -180,18 +184,21 @@ subroutine write_vtk_structured_grid(iter) end subroutine write_vtk_structured_grid !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -subroutine write_to_vtk(step, physical) - -integer, intent(in) :: step -logical, intent(in) :: physical -integer :: j +subroutine write_to_vtk(step, physical, process_id) -if (physical) then +integer, intent(in) :: step +logical, intent(in) :: physical +character(3), intent(in), optional :: process_id +integer :: j - call write_vtk_structured_grid(step) +if (physical) then + if (PRESENT(process_id)) then + call write_vtk_structured_grid(step, process_id) + else + call write_vtk_structured_grid(step) + endif else - do j = 1,Ny ! 
Bring everything to physical space tT = T(j,:) @@ -204,7 +211,11 @@ subroutine write_to_vtk(step, physical) ux(j,:) = tux uy(j,:) = tuy end do - call write_vtk_structured_grid(step) + if (PRESENT(process_id)) then + call write_vtk_structured_grid(step, process_id) + else + call write_vtk_structured_grid(step) + endif do j = 1,Ny ! Bring everything to Fourier space tT = T(j,:) @@ -220,7 +231,6 @@ subroutine write_to_vtk(step, physical) T = T / real(Nx, kind=dp) ux = ux / real(Nx, kind=dp) uy = uy / real(Nx, kind=dp) - end if end subroutine write_to_vtk
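A closing note on the velocity reconstruction that appears in init_to_fourier above and in imex_rk_MPI (a worked step for the reader, not part of the diff; it assumes the velocity field is divergence-free, which the zero-mean-flow handling of the kx = 0 mode suggests). Continuity for a single Fourier mode kx gives

    i*kx*ux_hat + d/dy(uy_hat) = 0   =>   ux_hat = (i/kx) * d/dy(uy_hat)   for kx /= 0,

which is exactly ux(:,ii) = CI*d1y(tmp_uy)/kx(ii) in the serial path and uxi_MPI = CI*d1y_MPI(tmp_uy_MPI, h1_total, h2_total, h3_total)/kx(it) in the MPI path; the kx = 0 mode is set to zero explicitly to enforce zero mean horizontal flow.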