From f9867bafdafa346a30cb13c253984fda0605acd5 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Wed, 27 Dec 2023 19:17:53 -0500
Subject: [PATCH 01/15] feat(subdomain): make fully asynchronous

---
 example/time-paradigm.f90  | 18 ++++++++++++------
 src/matcha/subdomain_m.f90 |  6 +++---
 src/matcha/subdomain_s.f90 | 24 +++++++++---------------
 test/subdomain_test_m.f90  | 17 +++++++++++------
 4 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/example/time-paradigm.f90 b/example/time-paradigm.f90
index 1f34b8f..494d213 100644
--- a/example/time-paradigm.f90
+++ b/example/time-paradigm.f90
@@ -12,7 +12,7 @@ program time_paradigm_m
   character(len=:), allocatable :: steps_string, resolution_string
   type(command_line_t) command_line
   integer(int64) counter_start, counter_end, clock_rate
-  integer :: steps=200, resolution=64
+  integer :: steps=300, resolution=128
 
   associate(me => this_image())
     if (command_line%argument_present(["--help"])) then
@@ -56,16 +56,18 @@ function functional_programming_time() result(system_time)
 
     call T%define(side=1., boundary_val=T_boundary, internal_val=T_internal_initial, n=resolution)
 
-    call system_clock(t_start_functional)
-
     associate(dt => T%dx()*T%dy()/(4*alpha))
+      call system_clock(t_start_functional)
+
       functional_programming: &
       do step = 1, steps
+        sync all
         T =  T + dt * alpha * .laplacian. T
       end do functional_programming
+
+      call system_clock(t_end_functional, clock_rate)
     end associate
 
-    call system_clock(t_end_functional, clock_rate)
     system_time = real(t_end_functional - t_start_functional)/real(clock_rate)
 
     associate(L_infinity_norm => maxval(abs(T%values() - T_steady)))
@@ -80,16 +82,20 @@ function procedural_programming_time() result(system_time)
     real system_time
     type(subdomain_t) T
 
+    call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
+
     associate(dt => T%dx()*T%dy()/(4*alpha))
-      call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
       call system_clock(t_start_procedural)
+
       procedural_programming: &
       do step = 1, steps
+        sync all
         call T%step(alpha*dt)
       end do procedural_programming
+
+      call system_clock(t_end_procedural, clock_rate)
     end associate
 
-    call system_clock(t_end_procedural, clock_rate)
     system_time = real(t_end_procedural - t_start_procedural)/real(clock_rate)
 
     associate(L_infinity_norm => maxval(abs(T%values() - T_steady)))
diff --git a/src/matcha/subdomain_m.f90 b/src/matcha/subdomain_m.f90
index 72c9cbf..5524cae 100644
--- a/src/matcha/subdomain_m.f90
+++ b/src/matcha/subdomain_m.f90
@@ -14,14 +14,14 @@ module subdomain_m
     generic :: operator(.laplacian.) => laplacian
     generic :: operator(*) => multiply
     generic :: operator(+) => add
-    generic :: assignment(=) => assign_and_sync
+    generic :: assignment(=) => assign_
     procedure dx
     procedure dy
     procedure dz
     procedure values
     procedure, private :: laplacian
     procedure, private :: add
-    procedure, private :: assign_and_sync
+    procedure, private :: assign_
   end type
 
   interface
@@ -83,7 +83,7 @@ pure module function add(lhs, rhs) result(total)
       type(subdomain_t) total
     end function
 
-    module subroutine assign_and_sync(lhs, rhs)
+    module subroutine assign_(lhs, rhs)
       implicit none
       class(subdomain_t), intent(out) :: lhs
       type(subdomain_t), intent(in) :: rhs
diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index 2ca5eb9..467a693 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -55,7 +55,6 @@
     allocate(halo_x(west:east, ny, nz)[*])
     if (me>1) halo_x(east,:,:)[me-1] = self%s_(1,:,:)
     if (me<num_subdomains) halo_x(west,:,:)[me+1] = self%s_(my_nx,:,:)
-    sync all
   end procedure
 
   module procedure dx
@@ -123,13 +122,10 @@
     total%s_ =  lhs%s_ + rhs%s_
   end procedure
 
-  module procedure assign_and_sync
-    call assert(allocated(rhs%s_), "subdomain_t%assign_and_sync: allocated(rhs%s_)")
-    sync all
+  module procedure assign_
+    call assert(allocated(rhs%s_), "subdomain_t%assign_: allocated(rhs%s_)")
     lhs%s_ =  rhs%s_
-    if (me>1) halo_x(east,:,:)[me-1] = rhs%s_(1,:,:)
-    if (me<num_subdomains) halo_x(west,:,:)[me+1] = rhs%s_(my_nx,:,:)
-    sync all
+    call exchange_halo(rhs%s_)
   end procedure
 
   module procedure values
@@ -137,6 +133,12 @@
     my_values =  self%s_
   end procedure
 
+  subroutine exchange_halo(s)
+    real, intent(in) :: s(:,:,:)
+    if (me>1) halo_x(east,:,:)[me-1] = s(1,:,:)
+    if (me<num_subdomains) halo_x(west,:,:)[me+1] = s(my_nx,:,:)
+  end subroutine
+
   module procedure step
 
     call assert(allocated(self%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
@@ -149,10 +151,7 @@
     call internal_points(increment)
     call edge_points(increment)
     call apply_boundary_condition(increment)
-
-    sync all
     self%s_ = self%s_ + increment
-    sync all
     call exchange_halo(self%s_)
 
   contains
@@ -207,11 +206,6 @@ subroutine apply_boundary_condition(ds)
       if (me==num_subdomains) ds(my_nx,:,:) = 0.
     end subroutine
 
-    subroutine exchange_halo(s)
-      real, intent(in) :: s(:,:,:)
-      if (me>1) halo_x(east,:,:)[me-1] = s(1,:,:)
-      if (me<num_subdomains) halo_x(west,:,:)[me+1] = s(my_nx,:,:)
-    end subroutine
 
   end procedure
 
diff --git a/test/subdomain_test_m.f90 b/test/subdomain_test_m.f90
index 144ab6e..3f5140a 100644
--- a/test/subdomain_test_m.f90
+++ b/test/subdomain_test_m.f90
@@ -53,7 +53,8 @@ subroutine output(v)
     critical
       do j = 1, size(v,2)
         do k = 1, size(v,3)
-          print *,"image ",this_image(),": ",j,k,v(:,j,k)
+          !print *,"image ",this_image(),": ",j,k,v(:,j,k)
+          print *,j,k,v(:,j,k)
         end do
       end do
     end critical
@@ -65,7 +66,8 @@ function concave_laplacian() result(test_passes)
     type(subdomain_t) f, laplacian_f
     real, allocatable :: lap_f_vals(:,:,:)
 
-    call f%define(side=1., boundary_val=1., internal_val=2., n=21) ! internally constant subdomain with a step down at all surfaces
+    call f%define(side=1., boundary_val=1., internal_val=2., n=32) ! internally constant subdomain with a step down at all surfaces
+    sync all
     laplacian_f = .laplacian. f
     lap_f_vals = laplacian_f%values()
 
@@ -154,14 +156,15 @@ function concave_laplacian() result(test_passes)
   function correct_steady_state() result(test_passes)
     logical test_passes
     type(subdomain_t) T
-    real, parameter :: T_boundary = 1., T_initial = 2., tolerance = 0.01, T_steady = T_boundary, alpha = 1.
-    integer, parameter :: steps = 6000
+    real, parameter :: T_boundary = 1., T_initial = 2., tolerance = 5.E-03, T_steady = T_boundary, alpha = 1.
+    integer, parameter :: steps = 25000
     integer step
 
-    call T%define(side=1., boundary_val=T_boundary, internal_val=T_initial, n=21) ! const. internally with a step down at boundaries
+    call T%define(side=1., boundary_val=T_boundary, internal_val=T_initial, n=32) ! const. internally with a step down at boundaries
 
     associate(dt => T%dx()*T%dy()*T%dz()/(4*alpha))
       do step = 1, steps
+        sync all
         T =  T + dt * alpha * .laplacian. T
       end do
     end associate
@@ -174,7 +177,7 @@ function correct_steady_state() result(test_passes)
   function functional_matches_procedural() result(test_passes)
     logical test_passes
     real, parameter :: tolerance = 1.E-06
-    integer, parameter :: steps = 6000, n=21
+    integer, parameter :: steps = 6000, n=32
     real, parameter :: alpha = 1.
     real, parameter :: side=1., boundary_val=1., internal_val=2.
 
@@ -195,6 +198,7 @@ function T_functional()
 
       associate(dt => T%dx()*T%dy()/(4*alpha))
         do step = 1, steps
+          sync all
           T =  T + dt * alpha * .laplacian. T
         end do
       end associate
@@ -211,6 +215,7 @@ function T_procedural()
 
       associate(dt => T%dx()*T%dy()/(4*alpha))
         do step = 1, steps
+          sync all
           call T%step(alpha*dt)
         end do
       end associate

From 837309ff208d71d2e0c496c9c691df36de9c194e Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Sun, 31 Dec 2023 11:31:38 -0500
Subject: [PATCH 02/15] fix(subdomain/test/example): rm halo

---
 example/time-paradigm.f90  |  4 ++--
 src/matcha/subdomain_m.f90 |  4 ++--
 src/matcha/subdomain_s.f90 | 49 +++++++++++++++++++-------------------
 test/subdomain_test_m.f90  |  9 +++----
 4 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/example/time-paradigm.f90 b/example/time-paradigm.f90
index 494d213..ce369d2 100644
--- a/example/time-paradigm.f90
+++ b/example/time-paradigm.f90
@@ -52,7 +52,7 @@ function functional_programming_time() result(system_time)
     integer(int64) t_start_functional, t_end_functional, clock_rate
     integer step
     real system_time
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
 
     call T%define(side=1., boundary_val=T_boundary, internal_val=T_internal_initial, n=resolution)
 
@@ -80,7 +80,7 @@ function procedural_programming_time() result(system_time)
     integer(int64) t_start_procedural, t_end_procedural, clock_rate
     integer step
     real system_time
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
 
     call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
 
diff --git a/src/matcha/subdomain_m.f90 b/src/matcha/subdomain_m.f90
index 5524cae..5faa51b 100644
--- a/src/matcha/subdomain_m.f90
+++ b/src/matcha/subdomain_m.f90
@@ -36,7 +36,7 @@ module subroutine define(side, boundary_val, internal_val, n, self)
     module subroutine step(alpha_dt, self)
       implicit none
       real, intent(in) :: alpha_dt
-      class(subdomain_t), intent(inout) :: self
+      class(subdomain_t), intent(inout) :: self[*]
     end subroutine
 
     pure module function values(self) result(my_values)
@@ -65,7 +65,7 @@ pure module function dz(self) result(my_dz)
 
     pure module function laplacian(rhs) result(laplacian_rhs)
       implicit none
-      class(subdomain_t), intent(in) :: rhs
+      class(subdomain_t), intent(in) :: rhs[*]
       type(subdomain_t) laplacian_rhs
     end function
 
diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index 467a693..a1dc8c8 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -4,9 +4,6 @@
   use intrinsic_array_m, only : intrinsic_array_t
   implicit none
 
-  real, allocatable :: halo_x(:,:,:)[:]
-  integer, parameter :: west=1, east=2
-
   type(data_partition_t) data_partition
 
   real dx_, dy_, dz_
@@ -50,11 +47,6 @@
 
     if (me == 1)              self%s_(1    , :, :) = boundary_val ! minimum x boundary
     if (me == num_subdomains) self%s_(my_nx, :, :) = boundary_val ! maximum x boundary
-
-    if (allocated(halo_x)) deallocate(halo_x)
-    allocate(halo_x(west:east, ny, nz)[*])
-    if (me>1) halo_x(east,:,:)[me-1] = self%s_(1,:,:)
-    if (me<num_subdomains) halo_x(west,:,:)[me+1] = self%s_(my_nx,:,:)
   end procedure
 
   module procedure dx
@@ -75,11 +67,14 @@
     real, allocatable :: halo_west(:,:), halo_east(:,:)
 
     call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
-    call assert(allocated(halo_x), "subdomain_t%laplacian: allocated(halo_x)")
 
     allocate(laplacian_rhs%s_(my_nx, ny, nz))
 
-    halo_west = merge(halo_x(west,:,:), rhs%s_(1,:,:), me/=1)
+    if (me==1) then
+      halo_west = rhs%s_(1,:,:)
+    else
+      halo_west = rhs[me-1]%s_(ubound(rhs[me-1]%s_,1),:,:)
+    end if
     i = my_internal_west
     call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
     do concurrent(j=2:ny-1, k=2:nz-1)
@@ -94,7 +89,11 @@
                                 (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
     end do
 
-    halo_east = merge(halo_x(east,:,:), rhs%s_(my_nx,:,:), me/=num_subdomains)
+    if (me==1) then
+      halo_east = rhs%s_(1,:,:)
+    else
+      halo_east = rhs[me+1]%s_(lbound(rhs[me+1]%s_,1),:,:)
+    end if
     i = my_internal_east
     call assert(i-1>0,"laplacian: easternmost subdomain too small")
     do concurrent(j=2:ny-1, k=2:nz-1)
@@ -125,7 +124,6 @@
   module procedure assign_
     call assert(allocated(rhs%s_), "subdomain_t%assign_: allocated(rhs%s_)")
     lhs%s_ =  rhs%s_
-    call exchange_halo(rhs%s_)
   end procedure
 
   module procedure values
@@ -133,26 +131,19 @@
     my_values =  self%s_
   end procedure
 
-  subroutine exchange_halo(s)
-    real, intent(in) :: s(:,:,:)
-    if (me>1) halo_x(east,:,:)[me-1] = s(1,:,:)
-    if (me<num_subdomains) halo_x(west,:,:)[me+1] = s(my_nx,:,:)
-  end subroutine
-
   module procedure step
 
     call assert(allocated(self%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
-    call assert(allocated(halo_x), "subdomain_t%laplacian: allocated(halo_x)")
     call assert(my_internal_west+1<=my_nx,"laplacian: westernmost subdomain too small")
     call assert(my_internal_east-1>0,"laplacian: easternmost subdomain too small")
 
     if (.not. allocated(increment)) allocate(increment(my_nx,ny,nz))
  
+    sync all
     call internal_points(increment)
-    call edge_points(increment)
+    call edge_points(self, increment)
     call apply_boundary_condition(increment)
     self%s_ = self%s_ + increment
-    call exchange_halo(self%s_)
 
   contains
 
@@ -169,14 +160,17 @@ subroutine internal_points(ds)
       end do
     end subroutine
 
-    subroutine edge_points(ds)
+    subroutine edge_points(self, ds)
+      type(subdomain_t), intent(in) :: self[*]
       real, intent(inout) :: ds(:,:,:)
       real, allocatable :: halo_west(:,:), halo_east(:,:)
       integer i, j, k
 
-      halo_west = merge(halo_x(west,:,:), self%s_(1,    :,:), me/=1)
-      halo_east = merge(halo_x(east,:,:), self%s_(my_nx,:,:), me/=num_subdomains)
-
+      if (me==1) then
+        halo_west = self%s_(1,:,:)
+      else
+        halo_west = self[me-1]%s_(ubound(self[me-1]%s_,1),:,:)
+      end if
       i = my_internal_west
       do concurrent(j=2:ny-1,k=2:nz-1)
         ds(i,j,k) = alpha_dt*( &
@@ -186,6 +180,11 @@ subroutine edge_points(ds)
         )
       end do
 
+      if (me==1) then
+        halo_east = self%s_(my_nx,:,:)
+      else
+        halo_east = self[me+1]%s_(1,:,:)
+      end if
       i = my_internal_east
       do concurrent(j=2:ny-1, k=2:nz-1)
         ds(i,j,k) = alpha_dt*( &
diff --git a/test/subdomain_test_m.f90 b/test/subdomain_test_m.f90
index 3f5140a..2c48222 100644
--- a/test/subdomain_test_m.f90
+++ b/test/subdomain_test_m.f90
@@ -63,7 +63,8 @@ subroutine output(v)
 
   function concave_laplacian() result(test_passes)
     logical test_passes
-    type(subdomain_t) f, laplacian_f
+    type(subdomain_t), save :: f[*]
+    type(subdomain_t) :: laplacian_f
     real, allocatable :: lap_f_vals(:,:,:)
 
     call f%define(side=1., boundary_val=1., internal_val=2., n=32) ! internally constant subdomain with a step down at all surfaces
@@ -155,7 +156,7 @@ function concave_laplacian() result(test_passes)
 
   function correct_steady_state() result(test_passes)
     logical test_passes
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
     real, parameter :: T_boundary = 1., T_initial = 2., tolerance = 5.E-03, T_steady = T_boundary, alpha = 1.
     integer, parameter :: steps = 25000
     integer step
@@ -191,7 +192,7 @@ function functional_matches_procedural() result(test_passes)
 
     function T_functional()
       real, allocatable :: T_functional(:,:,:)
-      type(subdomain_t) T
+      type(subdomain_t), save :: T[*]
       integer step
 
       call T%define(side, boundary_val, internal_val, n)
@@ -208,7 +209,7 @@ function T_functional()
 
     function T_procedural()
       real, allocatable :: T_procedural(:,:,:)
-      type(subdomain_t) T
+      type(subdomain_t), save :: T[*]
       integer step
 
       call T%define(side, boundary_val, internal_val, n)

From 345f5b8072088cd180fe774ae854442207e34061 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 10:49:16 -0500
Subject: [PATCH 03/15] fix(subdomain): non-type-bound laplacian operator

---
 example/time-paradigm.f90  |  4 +-
 src/matcha/subdomain_m.f90 | 83 +++++++++++++++++++++++++++++++++-----
 src/matcha/subdomain_s.f90 | 63 -----------------------------
 test/subdomain_test_m.f90  |  4 +-
 4 files changed, 77 insertions(+), 77 deletions(-)

diff --git a/example/time-paradigm.f90 b/example/time-paradigm.f90
index ce369d2..662be3b 100644
--- a/example/time-paradigm.f90
+++ b/example/time-paradigm.f90
@@ -2,7 +2,7 @@
 ! Terms of use are as specified in LICENSE.txt
 program time_paradigm_m
   !! Time various alternative programming paradigms
-  use subdomain_m, only : subdomain_t
+  use subdomain_m, only : subdomain_t, operator(.laplacian.), step
   use assert_m, only : assert
   use sourcery_m, only : string_t, file_t, command_line_t, bin_t, csv 
   use iso_fortran_env, only : int64
@@ -90,7 +90,7 @@ function procedural_programming_time() result(system_time)
       procedural_programming: &
       do step = 1, steps
         sync all
-        call T%step(alpha*dt)
+        call step(alpha*dt, T)
       end do procedural_programming
 
       call system_clock(t_end_procedural, clock_rate)
diff --git a/src/matcha/subdomain_m.f90 b/src/matcha/subdomain_m.f90
index 5faa51b..605ead6 100644
--- a/src/matcha/subdomain_m.f90
+++ b/src/matcha/subdomain_m.f90
@@ -1,17 +1,18 @@
 module subdomain_m
+  use assert_m, only : assert
   implicit none
 
   private
   public :: subdomain_t
+  public :: operator(.laplacian.)
+  public :: step
 
   type subdomain_t 
     private
     real, allocatable :: s_(:,:,:)
   contains
     procedure, pass(self) :: define
-    procedure, pass(self) :: step
     procedure, pass(rhs) :: multiply
-    generic :: operator(.laplacian.) => laplacian
     generic :: operator(*) => multiply
     generic :: operator(+) => add
     generic :: assignment(=) => assign_
@@ -19,11 +20,21 @@ module subdomain_m
     procedure dy
     procedure dz
     procedure values
-    procedure, private :: laplacian
     procedure, private :: add
     procedure, private :: assign_
   end type
 
+  interface operator(.laplacian.)
+
+    module procedure laplacian
+    !pure module function laplacian(rhs) result(laplacian_rhs)
+    !  implicit none
+    !  type(subdomain_t), intent(in) :: rhs[*]
+    !  type(subdomain_t) laplacian_rhs
+    !end function
+
+  end interface
+
   interface
 
     module subroutine define(side, boundary_val, internal_val, n, self)
@@ -36,7 +47,7 @@ module subroutine define(side, boundary_val, internal_val, n, self)
     module subroutine step(alpha_dt, self)
       implicit none
       real, intent(in) :: alpha_dt
-      class(subdomain_t), intent(inout) :: self[*]
+      type(subdomain_t), intent(inout) :: self[*]
     end subroutine
 
     pure module function values(self) result(my_values)
@@ -63,12 +74,6 @@ pure module function dz(self) result(my_dz)
       real my_dz
     end function
 
-    pure module function laplacian(rhs) result(laplacian_rhs)
-      implicit none
-      class(subdomain_t), intent(in) :: rhs[*]
-      type(subdomain_t) laplacian_rhs
-    end function
-
     pure module function multiply(lhs, rhs) result(product)
       implicit none
       class(subdomain_t), intent(in) :: rhs
@@ -91,4 +96,62 @@ module subroutine assign_(lhs, rhs)
 
   end interface
 
+  real dx_, dy_, dz_
+  integer my_nx, nx, ny, nz, me, num_subdomains, my_internal_west, my_internal_east
+
+contains
+
+  pure module function laplacian(rhs) result(laplacian_rhs)
+    type(subdomain_t), intent(in) :: rhs[*]
+    type(subdomain_t) laplacian_rhs
+
+    integer i, j, k
+    real, allocatable :: halo_west(:,:), halo_east(:,:)
+
+    call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
+
+    allocate(laplacian_rhs%s_, mold=rhs%s_)
+
+    if (me==1) then
+      halo_west = rhs%s_(1,:,:)
+    else
+      halo_west = rhs[me-1]%s_(ubound(rhs[me-1]%s_,1),:,:)
+    end if
+    i = my_internal_west
+    call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
+    do concurrent(j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = ( halo_west(j,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    do concurrent(i=my_internal_west+1:my_internal_east-1, j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    if (me==1) then
+      halo_east = rhs%s_(1,:,:)
+    else
+      halo_east = rhs[me+1]%s_(lbound(rhs[me+1]%s_,1),:,:)
+    end if
+    i = my_internal_east
+    call assert(i-1>0,"laplacian: easternmost subdomain too small")
+    do concurrent(j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) +  halo_east(j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    laplacian_rhs%s_(:, 1,:) = 0.
+    laplacian_rhs%s_(:,ny,:) = 0.
+    laplacian_rhs%s_(:,:, 1) = 0.
+    laplacian_rhs%s_(:,:,nz) = 0.
+    if (me==1) laplacian_rhs%s_(1,:,:) = 0.
+    if (me==num_subdomains) laplacian_rhs%s_(my_nx,:,:) = 0.
+
+  end function
+
+
 end module
diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index a1dc8c8..8b4e953 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -1,13 +1,10 @@
 submodule(subdomain_m) subdomain_s
   use sourcery_m, only : data_partition_t
-  use assert_m, only : assert
   use intrinsic_array_m, only : intrinsic_array_t
   implicit none
 
   type(data_partition_t) data_partition
 
-  real dx_, dy_, dz_
-  integer my_nx, nx, ny, nz, me, num_subdomains, my_internal_west, my_internal_east
   real, allocatable :: increment(:,:,:)
 
 contains
@@ -61,66 +58,6 @@
     my_dz = dz_
   end procedure
 
-  module procedure laplacian
-
-    integer i, j, k
-    real, allocatable :: halo_west(:,:), halo_east(:,:)
-
-    call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
-
-    allocate(laplacian_rhs%s_(my_nx, ny, nz))
-
-    if (me==1) then
-      halo_west = rhs%s_(1,:,:)
-    else
-      halo_west = rhs[me-1]%s_(ubound(rhs[me-1]%s_,1),:,:)
-    end if
-    i = my_internal_west
-    call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
-    do concurrent(j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = ( halo_west(j,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    do concurrent(i=my_internal_west+1:my_internal_east-1, j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    if (me==1) then
-      halo_east = rhs%s_(1,:,:)
-    else
-      halo_east = rhs[me+1]%s_(lbound(rhs[me+1]%s_,1),:,:)
-    end if
-    i = my_internal_east
-    call assert(i-1>0,"laplacian: easternmost subdomain too small")
-    do concurrent(j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) +  halo_east(j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    laplacian_rhs%s_(:, 1,:) = 0.
-    laplacian_rhs%s_(:,ny,:) = 0.
-    laplacian_rhs%s_(:,:, 1) = 0.
-    laplacian_rhs%s_(:,:,nz) = 0.
-    if (me==1) laplacian_rhs%s_(1,:,:) = 0.
-    if (me==num_subdomains) laplacian_rhs%s_(my_nx,:,:) = 0.
-
-  end procedure
-
-  module procedure multiply
-    call assert(allocated(rhs%s_), "subdomain_t%multiply: allocated(rhs%s_)")
-    product%s_ =  lhs * rhs%s_
-  end procedure
-
-  module procedure add
-    call assert(allocated(rhs%s_), "subdomain_t%add: allocated(rhs%s_)")
-    total%s_ =  lhs%s_ + rhs%s_
-  end procedure
-
   module procedure assign_
     call assert(allocated(rhs%s_), "subdomain_t%assign_: allocated(rhs%s_)")
     lhs%s_ =  rhs%s_
diff --git a/test/subdomain_test_m.f90 b/test/subdomain_test_m.f90
index 2c48222..64c8ef9 100644
--- a/test/subdomain_test_m.f90
+++ b/test/subdomain_test_m.f90
@@ -3,7 +3,7 @@
 module subdomain_test_m
   !! Define subdomain tests and procedures required for reporting results
   use sourcery_m, only : test_t, test_result_t
-  use subdomain_m, only : subdomain_t
+  use subdomain_m, only : subdomain_t, operator(.laplacian.), step
   use assert_m, only : assert
   implicit none
 
@@ -217,7 +217,7 @@ function T_procedural()
       associate(dt => T%dx()*T%dy()/(4*alpha))
         do step = 1, steps
           sync all
-          call T%step(alpha*dt)
+          call step(alpha*dt, T)
         end do
       end associate
 

From 977d4c5b18fc3782d2e5d794ec2e9e62ebf66f23 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 10:56:11 -0500
Subject: [PATCH 04/15] chore(do_concurrent_m): rm unsued use association

---
 src/matcha/do_concurrent_m.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matcha/do_concurrent_m.f90 b/src/matcha/do_concurrent_m.f90
index 21869c1..b13b26a 100644
--- a/src/matcha/do_concurrent_m.f90
+++ b/src/matcha/do_concurrent_m.f90
@@ -1,6 +1,6 @@
 module do_concurrent_m
   use iso_c_binding, only : c_double, c_int
-  use t_cell_collection_m, only : t_cell_collection_t, t_cell_collection_bind_C_t
+  use t_cell_collection_m, only : t_cell_collection_bind_C_t
   implicit none
   private
   public :: do_concurrent_sampled_speeds, do_concurrent_my_velocities, do_concurrent_k, do_concurrent_speeds

From 472be32ed5222c250ef5fa464ed780f177a8b956 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 11:30:23 -0500
Subject: [PATCH 05/15] fix(bind(C)): pre-allocate arguments

---
 src/matcha/distribution_s.F90  | 10 ++++++++--
 src/matcha/do_concurrent_m.f90 | 10 +++++-----
 src/matcha/do_concurrent_s.f90 | 14 +++++---------
 src/matcha/output_s.f90        |  7 +++++++
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/matcha/distribution_s.F90 b/src/matcha/distribution_s.F90
index fff76fb..d4d14f4 100644
--- a/src/matcha/distribution_s.F90
+++ b/src/matcha/distribution_s.F90
@@ -47,8 +47,11 @@ pure function monotonically_increasing(f) result(monotonic)
       "distribution_t%cumulative_distribution: allocated(cumulative_distribution_)")
     call assert(allocated(self%vel_), "distribution_t%cumulative_distribution: allocated(vel_)")
 
-     ! Sample from the distribution
-     call do_concurrent_sampled_speeds(speeds, self%vel_, self%cumulative_distribution(), sampled_speeds)
+    ! Sample from the distribution
+    associate(ncells => size(speeds,1), nsteps => size(speeds,2))
+      allocate(sampled_speeds(ncells,nsteps))
+      call do_concurrent_sampled_speeds(speeds, self%vel_, self%cumulative_distribution(), sampled_speeds)
+    end associate
      
      associate(nsteps => size(speeds,2))
 
@@ -63,6 +66,9 @@ pure function monotonically_increasing(f) result(monotonic)
          end associate
        end associate
        
+       if(allocated(my_velocities)) deallocate(my_velocities)
+       allocate(my_velocities, mold=dir)
+    
        call do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities)
        
      end associate
diff --git a/src/matcha/do_concurrent_m.f90 b/src/matcha/do_concurrent_m.f90
index b13b26a..db1b04f 100644
--- a/src/matcha/do_concurrent_m.f90
+++ b/src/matcha/do_concurrent_m.f90
@@ -11,20 +11,20 @@ module do_concurrent_m
     pure module subroutine do_concurrent_sampled_speeds(speeds, vel, cumulative_distribution, sampled_speeds) bind(C)
       implicit none
       real(c_double), intent(in) :: speeds(:,:), vel(:), cumulative_distribution(:)
-      real(c_double), intent(out), allocatable :: sampled_speeds(:,:)
+      real(c_double), intent(out) :: sampled_speeds(:,:)
     end subroutine
     
     pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities) bind(C)
       implicit none
       integer(c_int), intent(in) :: nsteps
       real(c_double), intent(in) :: dir(:,:,:), sampled_speeds(:,:)
-      real(c_double), intent(out), allocatable :: my_velocities(:,:,:)
+      real(c_double), intent(out) :: my_velocities(:,:,:)
     end subroutine
     
     pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
       implicit none
       real(c_double), intent(in) :: speeds(:), vel(:)
-      integer(c_int), intent(out), allocatable :: k(:)
+      integer(c_int), intent(out) :: k(:)
     end subroutine
     
     pure module subroutine &
@@ -32,13 +32,13 @@ pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
       implicit none
       integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
       real(c_double), intent(in) :: emp_distribution(:,:)
-      real(c_double), intent(out), allocatable :: output_distribution(:,:)
+      real(c_double), intent(out) :: output_distribution(:,:)
     end subroutine
     
     module subroutine do_concurrent_speeds(history, speeds) bind(C)
       implicit none
       type(t_cell_collection_bind_C_t), intent(in) :: history(:)
-      real(c_double), intent(out), allocatable :: speeds(:)
+      real(c_double), intent(out) :: speeds(:)
     end subroutine
       
   end interface
diff --git a/src/matcha/do_concurrent_s.f90 b/src/matcha/do_concurrent_s.f90
index 80c37e7..33c0218 100644
--- a/src/matcha/do_concurrent_s.f90
+++ b/src/matcha/do_concurrent_s.f90
@@ -1,4 +1,5 @@
 submodule(do_concurrent_m) do_concurrent_s
+  use assert_m, only : assert
   use iso_c_binding, only : c_f_pointer
   implicit none
 
@@ -7,9 +8,10 @@
   module procedure do_concurrent_sampled_speeds
   
     integer cell, step
+   
+    call assert(all(shape(sampled_speeds)==shape(speeds)), "do_concurrent_sampled_speeds: {sampled_,}speeds shape match")
     
     associate(ncells => size(speeds,1), nsteps => size(speeds,2))
-      allocate(sampled_speeds(ncells,nsteps))
       do concurrent(cell = 1:ncells, step = 1:nsteps)
         associate(k => findloc(speeds(cell,step) >= cumulative_distribution, value=.false., dim=1)-1)
           sampled_speeds(cell,step) = vel(k)
@@ -23,9 +25,8 @@
   
     integer step
     
-    if(allocated(my_velocities)) deallocate(my_velocities)
-    allocate(my_velocities, mold=dir)
-    
+    call assert(all(shape(sampled_speeds)==shape(my_velocities)), "do_concurrent_my_velocities: argument shape match")
+
     do concurrent(step=1:nsteps)
       my_velocities(:,step,1) = sampled_speeds(:,step)*dir(:,step,1)
       my_velocities(:,step,2) = sampled_speeds(:,step)*dir(:,step,2)
@@ -39,8 +40,6 @@
   integer i
   
     associate(nspeeds => size(speeds))
-      if(allocated(k)) deallocate(k)
-      allocate(k(nspeeds))
         do concurrent(i = 1:nspeeds)
           k(i) = findloc(speeds(i) >= vel, value=.false., dim=1)-1
         end do
@@ -51,8 +50,6 @@
   
     integer i
     
-    if(allocated(output_distribution)) deallocate(output_distribution)
-    allocate(output_distribution(nintervals,2))
     output_distribution(:,freq) = 0.d0
     output_distribution(:,speed) = emp_distribution(:,speed)
     do concurrent(i = 1:size(output_distribution,1))
@@ -79,7 +76,6 @@
       end do
   
       associate(t => history%time)
-        allocate(speeds(ncells*(npositions-1)))
         do concurrent(i = 1:npositions-1, j = 1:ncells)
           associate( &
             u => (x(i+1,j,:) - x(i,j,:))/(t(i+1) - t(i)), &
diff --git a/src/matcha/output_s.f90 b/src/matcha/output_s.f90
index 1bac989..57318b1 100644
--- a/src/matcha/output_s.f90
+++ b/src/matcha/output_s.f90
@@ -24,12 +24,19 @@
     
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
 
+    associate(npositions => size(history), ncells => history(1)%positions_shape(1))
+      allocate(speeds(ncells*(npositions-1)))
+    end associate
     call do_concurrent_speeds(t_cell_collection_bind_C_t(self%history_), speeds)
 
     associate(emp_distribution => self%input_%sample_distribution())
       associate(nintervals => size(emp_distribution(:,1)), dvel_half => (emp_distribution(2,speed)-emp_distribution(1,speed))/2.d0)
         vel = [emp_distribution(1,speed) - dvel_half, [(emp_distribution(i,speed) + dvel_half, i=1,nintervals)]]
+        if (allocated(k)) deallocate(k)
+        allocate(k(nspeeds))
         call do_concurrent_k(speeds, vel, k)
+        if(allocated(output_distribution)) deallocate(output_distribution)
+        allocate(output_distribution(nintervals,2))
         call do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution)
         output_distribution(:,freq) = output_distribution(:,freq)/sum(output_distribution(:,freq))
       end associate

From 7d5d427d99a539ca1e12f432b55c4e798f3e7715 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 17:10:54 -0500
Subject: [PATCH 06/15] fix(do_concurrent): work around NAG compiler bug

nagfor 7.1 Build 7143 requires a separate module procedure
definition with the bind(C) attribute to repeat the full interface
body.
---
 src/matcha/do_concurrent_m.f90 |  4 +--
 src/matcha/do_concurrent_s.f90 | 49 ++++++++++++++++++++--------------
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/matcha/do_concurrent_m.f90 b/src/matcha/do_concurrent_m.f90
index db1b04f..21e1885 100644
--- a/src/matcha/do_concurrent_m.f90
+++ b/src/matcha/do_concurrent_m.f90
@@ -27,8 +27,8 @@ pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
       integer(c_int), intent(out) :: k(:)
     end subroutine
     
-    pure module subroutine &
-      do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) bind(C)
+    pure module subroutine do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) &
+      bind(C)
       implicit none
       integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
       real(c_double), intent(in) :: emp_distribution(:,:)
diff --git a/src/matcha/do_concurrent_s.f90 b/src/matcha/do_concurrent_s.f90
index 33c0218..e8f1aac 100644
--- a/src/matcha/do_concurrent_s.f90
+++ b/src/matcha/do_concurrent_s.f90
@@ -5,8 +5,9 @@
 
 contains
   
-  module procedure do_concurrent_sampled_speeds
-  
+  pure module subroutine do_concurrent_sampled_speeds(speeds, vel, cumulative_distribution, sampled_speeds) bind(C)
+    real(c_double), intent(in) :: speeds(:,:), vel(:), cumulative_distribution(:)
+    real(c_double), intent(out) :: sampled_speeds(:,:)
     integer cell, step
    
     call assert(all(shape(sampled_speeds)==shape(speeds)), "do_concurrent_sampled_speeds: {sampled_,}speeds shape match")
@@ -19,34 +20,42 @@
       end do
     end associate
     
-  end procedure
-  
-  module procedure do_concurrent_my_velocities
-  
+  end subroutine  
+
+  pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities) bind(C)
+    integer(c_int), intent(in) :: nsteps
+    real(c_double), intent(in) :: dir(:,:,:), sampled_speeds(:,:)
+    real(c_double), intent(out) :: my_velocities(:,:,:)
     integer step
     
-    call assert(all(shape(sampled_speeds)==shape(my_velocities)), "do_concurrent_my_velocities: argument shape match")
+    call assert(all([size(my_velocities,1),size(sampled_speeds,2)] == shape(sampled_speeds)), &
+      "do_concurrent_my_velocities: argument size match")
+    call assert(all(shape(my_velocities,1)==shape(dir)), "do_concurrent_my_velocities: argument shape match")
 
     do concurrent(step=1:nsteps)
       my_velocities(:,step,1) = sampled_speeds(:,step)*dir(:,step,1)
       my_velocities(:,step,2) = sampled_speeds(:,step)*dir(:,step,2)
       my_velocities(:,step,3) = sampled_speeds(:,step)*dir(:,step,3)
     end do
-    
-  end procedure
+  end subroutine  
   
-  module procedure do_concurrent_k
-  
-  integer i
+  pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
+    real(c_double), intent(in) :: speeds(:), vel(:)
+    integer(c_int), intent(out) :: k(:)  
+    integer i
   
     associate(nspeeds => size(speeds))
         do concurrent(i = 1:nspeeds)
           k(i) = findloc(speeds(i) >= vel, value=.false., dim=1)-1
         end do
     end associate
-  end procedure
+  end subroutine
   
-  module procedure do_concurrent_output_distribution
+  pure module subroutine do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) &
+    bind(C)
+    integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
+    real(c_double), intent(in) :: emp_distribution(:,:)
+    real(c_double), intent(out) :: output_distribution(:,:)
   
     integer i
     
@@ -55,11 +64,11 @@
     do concurrent(i = 1:size(output_distribution,1))
       output_distribution(i,freq) = count(k==i)
     end do
-    
-  end procedure
-  
-  module procedure do_concurrent_speeds
+  end subroutine
   
+  module subroutine do_concurrent_speeds(history, speeds) bind(C)
+    type(t_cell_collection_bind_C_t), intent(in) :: history(:)
+    real(c_double), intent(out) :: speeds(:)  
     integer i, j, k
     integer, parameter :: nspacedims=3
     
@@ -87,6 +96,6 @@
       end associate
     end associate
     
-  end procedure
+  end subroutine
   
-end submodule do_concurrent_s
+end submodule do_concurrent_s
\ No newline at end of file

From 14f7c6d5cb4c458edee5b7cfe448fd556d38f177 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 17:38:24 -0500
Subject: [PATCH 07/15] chore(do_concurrent_m): rm unused dummy argument

---
 src/matcha/do_concurrent_m.f90 | 4 ++--
 src/matcha/do_concurrent_s.f90 | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/matcha/do_concurrent_m.f90 b/src/matcha/do_concurrent_m.f90
index 21e1885..d28691c 100644
--- a/src/matcha/do_concurrent_m.f90
+++ b/src/matcha/do_concurrent_m.f90
@@ -27,10 +27,10 @@ pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
       integer(c_int), intent(out) :: k(:)
     end subroutine
     
-    pure module subroutine do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) &
+    pure module subroutine do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution) &
       bind(C)
       implicit none
-      integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
+      integer(c_int), intent(in) :: speed, freq, k(:)
       real(c_double), intent(in) :: emp_distribution(:,:)
       real(c_double), intent(out) :: output_distribution(:,:)
     end subroutine
diff --git a/src/matcha/do_concurrent_s.f90 b/src/matcha/do_concurrent_s.f90
index e8f1aac..a8d3a50 100644
--- a/src/matcha/do_concurrent_s.f90
+++ b/src/matcha/do_concurrent_s.f90
@@ -51,12 +51,10 @@ pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
     end associate
   end subroutine
   
-  pure module subroutine do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) &
-    bind(C)
-    integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
+  pure module subroutine do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution) bind(C)
+    integer(c_int), intent(in) :: speed, freq, k(:)
     real(c_double), intent(in) :: emp_distribution(:,:)
     real(c_double), intent(out) :: output_distribution(:,:)
-  
     integer i
     
     output_distribution(:,freq) = 0.d0

From 7e0190522f38d99c627b63cdb222e767cb770996 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 17:58:38 -0500
Subject: [PATCH 08/15] chore(output_s): rm unused c_loc use association

---
 src/matcha/output_s.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matcha/output_s.f90 b/src/matcha/output_s.f90
index 57318b1..6fda001 100644
--- a/src/matcha/output_s.f90
+++ b/src/matcha/output_s.f90
@@ -3,7 +3,7 @@
 submodule(output_m) output_s
   use do_concurrent_m, only : do_concurrent_k, do_concurrent_output_distribution, do_concurrent_speeds
   use t_cell_collection_m, only : t_cell_collection_bind_C_t
-  use iso_c_binding, only : c_loc, c_double
+  use iso_c_binding, only : c_double
   implicit none
   
 contains

From 70ab130a37b1e7879e3c50c5967d90eec3bdc090 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 20:30:34 -0500
Subject: [PATCH 09/15] chore: eliminate pre-processing (.F90 -> .f90)

Changing file extensions from .f90 to .F90 eliminates unnecessary
automatic C preprocessing with some compilers, including gfortran.
---
 src/matcha/{distribution_s.F90 => distribution_s.f90}           | 0
 src/matcha/{t_cell_collection_s.F90 => t_cell_collection_s.f90} | 0
 src/{matcha_s.F90 => matcha_s.f90}                              | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename src/matcha/{distribution_s.F90 => distribution_s.f90} (100%)
 rename src/matcha/{t_cell_collection_s.F90 => t_cell_collection_s.f90} (100%)
 rename src/{matcha_s.F90 => matcha_s.f90} (100%)

diff --git a/src/matcha/distribution_s.F90 b/src/matcha/distribution_s.f90
similarity index 100%
rename from src/matcha/distribution_s.F90
rename to src/matcha/distribution_s.f90
diff --git a/src/matcha/t_cell_collection_s.F90 b/src/matcha/t_cell_collection_s.f90
similarity index 100%
rename from src/matcha/t_cell_collection_s.F90
rename to src/matcha/t_cell_collection_s.f90
diff --git a/src/matcha_s.F90 b/src/matcha_s.f90
similarity index 100%
rename from src/matcha_s.F90
rename to src/matcha_s.f90

From 86593b29962176d15980a231ceb296bd85359607 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 20:51:42 -0500
Subject: [PATCH 10/15] chore(subdomain): rm unused variables

---
 src/matcha/subdomain_s.f90 | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index 8b4e953..eb30147 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -134,7 +134,6 @@ subroutine edge_points(self, ds)
 
     subroutine apply_boundary_condition(ds)
       real, intent(inout) :: ds(:,:,:)
-      integer i, j
 
       ds(:,1:ny:ny-1, :       ) = 0.
       ds(:, :       ,1:nz:nz-1) = 0.

From 6526377c46f9c2e71aaea803e6804a3e8d221b05 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Mon, 1 Jan 2024 20:58:17 -0500
Subject: [PATCH 11/15] WIP: nagfor/gfortran internal compiler error (ICE)

The following commands reproduce an internal compiler error (ICE)
with each of the named compilers and compiler versions:

`gfortran` (Homebrew-installed)
---------------------------------
* Version 13.2.: `fpm test`
* Version 12.3.0: `fpm test --compiler gfortran-12`

`nagfor` 7.1 (Build 7143)
-------------------------
* `fpm test --compiler nagfor --flag "-fpp -f2018"`
---
 src/matcha/do_concurrent_s.f90     | 20 +++----
 src/matcha/output_s.f90            | 15 +++--
 src/matcha/subdomain_m.f90         | 88 ++++--------------------------
 src/matcha/subdomain_s.f90         | 59 +++++++++++++++++++-
 src/matcha/t_cell_collection_m.f90 |  4 +-
 src/{matcha_s.f90 => matcha_s.F90} |  2 +
 6 files changed, 90 insertions(+), 98 deletions(-)
 rename src/{matcha_s.f90 => matcha_s.F90} (99%)

diff --git a/src/matcha/do_concurrent_s.f90 b/src/matcha/do_concurrent_s.f90
index a8d3a50..c4798db 100644
--- a/src/matcha/do_concurrent_s.f90
+++ b/src/matcha/do_concurrent_s.f90
@@ -30,7 +30,7 @@ pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds,
     
     call assert(all([size(my_velocities,1),size(sampled_speeds,2)] == shape(sampled_speeds)), &
       "do_concurrent_my_velocities: argument size match")
-    call assert(all(shape(my_velocities,1)==shape(dir)), "do_concurrent_my_velocities: argument shape match")
+    call assert(all(size(my_velocities,1)==shape(dir)), "do_concurrent_my_velocities: argument shape match")
 
     do concurrent(step=1:nsteps)
       my_velocities(:,step,1) = sampled_speeds(:,step)*dir(:,step,1)
@@ -82,16 +82,14 @@ module subroutine do_concurrent_speeds(history, speeds) bind(C)
          x(i,:,:) = positions
       end do
   
-      associate(t => history%time)
-        do concurrent(i = 1:npositions-1, j = 1:ncells)
-          associate( &
-            u => (x(i+1,j,:) - x(i,j,:))/(t(i+1) - t(i)), &
-            ij => i + (j-1)*(npositions-1) &
-           )   
-            speeds(ij) = sqrt(sum([(u(k)**2, k=1,nspacedims)]))
-          end associate
-        end do
-      end associate
+      do concurrent(i = 1:npositions-1, j = 1:ncells)
+        associate( &
+          u => (x(i+1,j,:) - x(i,j,:))/(history(i+1)%time - history(i)%time), &
+          ij => i + (j-1)*(npositions-1) &
+         )   
+          speeds(ij) = sqrt(sum([(u(k)**2, k=1,nspacedims)]))
+        end associate
+      end do
     end associate
     
   end subroutine
diff --git a/src/matcha/output_s.f90 b/src/matcha/output_s.f90
index 6fda001..ec621bf 100644
--- a/src/matcha/output_s.f90
+++ b/src/matcha/output_s.f90
@@ -24,23 +24,26 @@
     
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
 
-    associate(npositions => size(history), ncells => history(1)%positions_shape(1))
-      allocate(speeds(ncells*(npositions-1)))
+    associate(npositions => size(self%history_))
+      allocate(speeds(self%my_num_cells()*(npositions-1)))
     end associate
     call do_concurrent_speeds(t_cell_collection_bind_C_t(self%history_), speeds)
 
-    associate(emp_distribution => self%input_%sample_distribution())
+    block
+      real(c_double), allocatable :: emp_distribution(:,:)
+
+      emp_distribution = self%input_%sample_distribution()
       associate(nintervals => size(emp_distribution(:,1)), dvel_half => (emp_distribution(2,speed)-emp_distribution(1,speed))/2.d0)
         vel = [emp_distribution(1,speed) - dvel_half, [(emp_distribution(i,speed) + dvel_half, i=1,nintervals)]]
         if (allocated(k)) deallocate(k)
-        allocate(k(nspeeds))
+        allocate(k(size(speeds)))
         call do_concurrent_k(speeds, vel, k)
         if(allocated(output_distribution)) deallocate(output_distribution)
         allocate(output_distribution(nintervals,2))
-        call do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution)
+        call do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution)
         output_distribution(:,freq) = output_distribution(:,freq)/sum(output_distribution(:,freq))
       end associate
-    end associate
+    end block
 
   end procedure
 
diff --git a/src/matcha/subdomain_m.f90 b/src/matcha/subdomain_m.f90
index 605ead6..bf64959 100644
--- a/src/matcha/subdomain_m.f90
+++ b/src/matcha/subdomain_m.f90
@@ -1,42 +1,36 @@
 module subdomain_m
-  use assert_m, only : assert
   implicit none
 
   private
   public :: subdomain_t
-  public :: operator(.laplacian.)
-  public :: step
 
   type subdomain_t 
     private
     real, allocatable :: s_(:,:,:)
   contains
     procedure, pass(self) :: define
-    procedure, pass(rhs) :: multiply
-    generic :: operator(*) => multiply
-    generic :: operator(+) => add
-    generic :: assignment(=) => assign_
     procedure dx
     procedure dy
     procedure dz
     procedure values
+    generic :: operator(*) => multiply
+    generic :: operator(+) => add
+    generic :: operator(.laplacian.) => laplacian
+    generic :: assignment(=) => assign_
+    procedure, private, pass(rhs) :: multiply
+    procedure, private :: laplacian
     procedure, private :: add
     procedure, private :: assign_
   end type
 
-  interface operator(.laplacian.)
-
-    module procedure laplacian
-    !pure module function laplacian(rhs) result(laplacian_rhs)
-    !  implicit none
-    !  type(subdomain_t), intent(in) :: rhs[*]
-    !  type(subdomain_t) laplacian_rhs
-    !end function
-
-  end interface
-
   interface
 
+    pure module function laplacian(rhs) result(laplacian_rhs)
+      implicit none
+      class(subdomain_t), intent(in) :: rhs[*]
+      type(subdomain_t) laplacian_rhs
+    end function
+
     module subroutine define(side, boundary_val, internal_val, n, self)
       implicit none
       real, intent(in) :: side, boundary_val, internal_val
@@ -96,62 +90,4 @@ module subroutine assign_(lhs, rhs)
 
   end interface
 
-  real dx_, dy_, dz_
-  integer my_nx, nx, ny, nz, me, num_subdomains, my_internal_west, my_internal_east
-
-contains
-
-  pure module function laplacian(rhs) result(laplacian_rhs)
-    type(subdomain_t), intent(in) :: rhs[*]
-    type(subdomain_t) laplacian_rhs
-
-    integer i, j, k
-    real, allocatable :: halo_west(:,:), halo_east(:,:)
-
-    call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
-
-    allocate(laplacian_rhs%s_, mold=rhs%s_)
-
-    if (me==1) then
-      halo_west = rhs%s_(1,:,:)
-    else
-      halo_west = rhs[me-1]%s_(ubound(rhs[me-1]%s_,1),:,:)
-    end if
-    i = my_internal_west
-    call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
-    do concurrent(j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = ( halo_west(j,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    do concurrent(i=my_internal_west+1:my_internal_east-1, j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    if (me==1) then
-      halo_east = rhs%s_(1,:,:)
-    else
-      halo_east = rhs[me+1]%s_(lbound(rhs[me+1]%s_,1),:,:)
-    end if
-    i = my_internal_east
-    call assert(i-1>0,"laplacian: easternmost subdomain too small")
-    do concurrent(j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) +  halo_east(j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    laplacian_rhs%s_(:, 1,:) = 0.
-    laplacian_rhs%s_(:,ny,:) = 0.
-    laplacian_rhs%s_(:,:, 1) = 0.
-    laplacian_rhs%s_(:,:,nz) = 0.
-    if (me==1) laplacian_rhs%s_(1,:,:) = 0.
-    if (me==num_subdomains) laplacian_rhs%s_(my_nx,:,:) = 0.
-
-  end function
-
-
 end module
diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index eb30147..5cebb3d 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -1,10 +1,11 @@
 submodule(subdomain_m) subdomain_s
+  use assert_m, only : assert, intrinsic_array_t
   use sourcery_m, only : data_partition_t
-  use intrinsic_array_m, only : intrinsic_array_t
   implicit none
 
   type(data_partition_t) data_partition
-
+  real dx_, dy_, dz_
+  integer my_nx, nx, ny, nz, me, num_subdomains, my_internal_west, my_internal_east
   real, allocatable :: increment(:,:,:)
 
 contains
@@ -144,4 +145,56 @@ subroutine apply_boundary_condition(ds)
 
   end procedure
 
-end submodule subdomain_s
\ No newline at end of file
+  pure module function laplacian(rhs) result(laplacian_rhs)
+    class(subdomain_t), intent(in) :: rhs[*]
+    type(subdomain_t) laplacian_rhs
+
+    integer i, j, k
+    real, allocatable :: halo_west(:,:), halo_east(:,:)
+
+    call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
+
+    allocate(laplacian_rhs%s_, mold=rhs%s_)
+
+    if (me==1) then
+      halo_west = rhs%s_(1,:,:)
+    else
+      halo_west = rhs[me-1]%s_(ubound(rhs[me-1]%s_,1),:,:)
+    end if
+    i = my_internal_west
+    call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
+    do concurrent(j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = ( halo_west(j,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    do concurrent(i=my_internal_west+1:my_internal_east-1, j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    if (me==1) then
+      halo_east = rhs%s_(1,:,:)
+    else
+      halo_east = rhs[me+1]%s_(lbound(rhs[me+1]%s_,1),:,:)
+    end if
+    i = my_internal_east
+    call assert(i-1>0,"laplacian: easternmost subdomain too small")
+    do concurrent(j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) +  halo_east(j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    laplacian_rhs%s_(:, 1,:) = 0.
+    laplacian_rhs%s_(:,ny,:) = 0.
+    laplacian_rhs%s_(:,:, 1) = 0.
+    laplacian_rhs%s_(:,:,nz) = 0.
+    if (me==1) laplacian_rhs%s_(1,:,:) = 0.
+    if (me==num_subdomains) laplacian_rhs%s_(my_nx,:,:) = 0.
+
+  end function
+
+end submodule subdomain_s
diff --git a/src/matcha/t_cell_collection_m.f90 b/src/matcha/t_cell_collection_m.f90
index 323d47b..03789ac 100644
--- a/src/matcha/t_cell_collection_m.f90
+++ b/src/matcha/t_cell_collection_m.f90
@@ -42,8 +42,9 @@ pure module function construct(positions, time) result(t_cell_collection)
     
   interface t_cell_collection_bind_C_t
     
-    elemental module function construct_bind_C(t_cell_collection) result(t_cell_collection_bind_C)
+    impure elemental module function construct_bind_C(t_cell_collection) result(t_cell_collection_bind_C)
       !! Result is bind(C) representation of the data inside a t_cell_collection_t object
+      !! This function is impure because it invokes c_loc. Fortran 2023 compliance will allow this function to be pure.
       implicit none
       type(t_cell_collection_t), intent(in), target :: t_cell_collection
       type(t_cell_collection_bind_C_t) t_cell_collection_bind_C
@@ -60,7 +61,6 @@ pure module function positions(self) result(my_positions)
       double precision, allocatable :: my_positions(:,:)
     end function
     
-    
     elemental module function time(self) result(my_time)
       !! Return the t_cell_collection_t object's time stamp
       implicit none
diff --git a/src/matcha_s.f90 b/src/matcha_s.F90
similarity index 99%
rename from src/matcha_s.f90
rename to src/matcha_s.F90
index 4735efc..0cc9081 100644
--- a/src/matcha_s.f90
+++ b/src/matcha_s.F90
@@ -32,7 +32,9 @@
         associate(me => this_image())
           associate(my_num_cells => data_partition%last(me) - data_partition%first(me) + 1)
           
+#ifndef NAGFOR
             call random_init(repeatable=.true., image_distinct=.true.)
+#endif
             
             allocate(random_positions(my_num_cells,ndim))
             call random_number(random_positions)  

From d19482d76dc144776cbeb2db30401da297a28330 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Sat, 3 Feb 2024 14:02:55 -0800
Subject: [PATCH 12/15] test: work around nagfor 7.1 Build 7145 bug

---
 app/main.f90           |  6 +++---
 test/matcha_test_m.f90 | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/app/main.f90 b/app/main.f90
index 99ca6ce..67fb9b9 100644
--- a/app/main.f90
+++ b/app/main.f90
@@ -9,14 +9,14 @@ program main
   associate(input => input_t())
     output = output_t(input, matcha(input))
     block
-      double precision, allocatable :: simulated_distribution(:,:)
+      double precision, allocatable :: simulated_distribution(:,:), frequency_distribution(:)
       integer, parameter :: freq=2
       integer num_cells
 
       num_cells = output%my_num_cells()
       simulated_distribution = output%simulated_distribution()
-      simulated_distribution(:,freq) = num_cells*simulated_distribution(:,freq)
-      call co_sum(simulated_distribution(:,freq), result_image=1)
+      frequency_distribution =  num_cells*simulated_distribution(:,freq) ! copy to work around nagfor bug
+      call co_sum(frequency_distribution, result_image=1)
       call co_sum(num_cells, result_image=1)
       if (this_image()==1) simulated_distribution(:,freq) = simulated_distribution(:,freq)/dble(num_cells)
     end block
diff --git a/test/matcha_test_m.f90 b/test/matcha_test_m.f90
index f75e164..8a2c64e 100644
--- a/test/matcha_test_m.f90
+++ b/test/matcha_test_m.f90
@@ -67,7 +67,7 @@ function compare_image_distributions() result(test_passes)
   function compare_global_distributions() result(test_passes)
     logical test_passes
     type(output_t) output
-    double precision, allocatable :: simulated_distribution(:,:)
+    double precision, allocatable :: simulated_distribution(:,:), frequency(:)
     integer num_cells
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
     real, parameter :: tolerance = 1.D-02
@@ -77,16 +77,16 @@ function compare_global_distributions() result(test_passes)
       associate(empirical_distribution => input%sample_distribution())
         simulated_distribution = output%simulated_distribution()  
         num_cells = output%my_num_cells()
-        simulated_distribution(:,freq) = num_cells*simulated_distribution(:,freq)
-        call co_sum(simulated_distribution(:,freq), result_image=1)
+        frequency_distribution = num_cells*simulated_distribution(:,freq) ! copy to work around nagfor 7.1 Build 7145 compiler bug
+        call co_sum(frequency_distribution, result_image=1)
         call co_sum(num_cells, result_image=1)
         if (this_image()/=1) then
           test_passes = .true.
         else
-          simulated_distribution(:,freq) = simulated_distribution(:,freq)/dble(num_cells)
+          frequency_distribution = frequency_distribution/dble(num_cells)
           associate( &
             diffmax_speeds=> maxval(abs(empirical_distribution(:,speed)-simulated_distribution(:,speed))), &
-            diffmax_freqs => maxval(abs(empirical_distribution(:,freq)-simulated_distribution(:,freq))) &
+            diffmax_freqs => maxval(abs(empirical_distribution(:,freq)-frequency_distribution)) &
           )
             test_passes = (diffmax_freqs < tolerance) .and. (diffmax_speeds < tolerance)
           end associate

From e561f4228d50b11072589071c586d76634b57a2b Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Sat, 3 Feb 2024 14:03:57 -0800
Subject: [PATCH 13/15] fix(subdomain): eliminate ambigous var/proc names

---
 example/time-paradigm.f90  | 4 ++--
 src/matcha/subdomain_m.f90 | 3 ++-
 src/matcha/subdomain_s.f90 | 2 +-
 test/subdomain_test_m.f90  | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/example/time-paradigm.f90 b/example/time-paradigm.f90
index 662be3b..a77008e 100644
--- a/example/time-paradigm.f90
+++ b/example/time-paradigm.f90
@@ -2,7 +2,7 @@
 ! Terms of use are as specified in LICENSE.txt
 program time_paradigm_m
   !! Time various alternative programming paradigms
-  use subdomain_m, only : subdomain_t, operator(.laplacian.), step
+  use subdomain_m, only : subdomain_t, march
   use assert_m, only : assert
   use sourcery_m, only : string_t, file_t, command_line_t, bin_t, csv 
   use iso_fortran_env, only : int64
@@ -90,7 +90,7 @@ function procedural_programming_time() result(system_time)
       procedural_programming: &
       do step = 1, steps
         sync all
-        call step(alpha*dt, T)
+        call march(alpha*dt, T)
       end do procedural_programming
 
       call system_clock(t_end_procedural, clock_rate)
diff --git a/src/matcha/subdomain_m.f90 b/src/matcha/subdomain_m.f90
index bf64959..f2a22d0 100644
--- a/src/matcha/subdomain_m.f90
+++ b/src/matcha/subdomain_m.f90
@@ -3,6 +3,7 @@ module subdomain_m
 
   private
   public :: subdomain_t
+  public :: march
 
   type subdomain_t 
     private
@@ -38,7 +39,7 @@ module subroutine define(side, boundary_val, internal_val, n, self)
       class(subdomain_t), intent(out) :: self
     end subroutine
 
-    module subroutine step(alpha_dt, self)
+    module subroutine march(alpha_dt, self)
       implicit none
       real, intent(in) :: alpha_dt
       type(subdomain_t), intent(inout) :: self[*]
diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index 5cebb3d..ac68f62 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -69,7 +69,7 @@
     my_values =  self%s_
   end procedure
 
-  module procedure step
+  module procedure march
 
     call assert(allocated(self%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
     call assert(my_internal_west+1<=my_nx,"laplacian: westernmost subdomain too small")
diff --git a/test/subdomain_test_m.f90 b/test/subdomain_test_m.f90
index 64c8ef9..5b506e3 100644
--- a/test/subdomain_test_m.f90
+++ b/test/subdomain_test_m.f90
@@ -3,7 +3,7 @@
 module subdomain_test_m
   !! Define subdomain tests and procedures required for reporting results
   use sourcery_m, only : test_t, test_result_t
-  use subdomain_m, only : subdomain_t, operator(.laplacian.), step
+  use subdomain_m, only : subdomain_t, march
   use assert_m, only : assert
   implicit none
 
@@ -217,7 +217,7 @@ function T_procedural()
       associate(dt => T%dx()*T%dy()/(4*alpha))
         do step = 1, steps
           sync all
-          call step(alpha*dt, T)
+          call march(alpha*dt, T)
         end do
       end associate
 

From 6ad5e1435dd3d7e9219f0a30bed492d05651bfdd Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Sat, 3 Feb 2024 14:36:42 -0800
Subject: [PATCH 14/15] test(matcha): fix test declaration

---
 example/heat-equation.f90 | 12 +++++++-----
 test/matcha_test_m.f90    |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/example/heat-equation.f90 b/example/heat-equation.f90
index 3c3cb5c..2e7624c 100644
--- a/example/heat-equation.f90
+++ b/example/heat-equation.f90
@@ -76,13 +76,14 @@ module subroutine exchange_halo(self)
 
   end interface
 
+  real, allocatable :: halo_x(:,:)[:]
+
 end module
 
 submodule(subdomain_2D_m) subdomain_2D_s
   use assertions_m, only : assert
   implicit none
 
-  real, allocatable :: halo_x(:,:)[:]
   real dx_, dy_
   integer, parameter :: west=1, east=2
   integer my_nx, nx, ny, me, num_subdomains, my_internal_west, my_internal_east
@@ -114,7 +115,8 @@ module subroutine exchange_halo(self)
     self%s_(my_nx, 2:ny-1) = merge(boundary_val, internal_val, me==num_subdomains) ! east subdomain boundary
 
     if (allocated(halo_x)) deallocate(halo_x)
-    allocate(halo_x(west:east, ny)[*])
+    !allocate(halo_x(west:east, ny)[*])
+    allocate(halo_x(2, ny)[*])
     call self%exchange_halo
   end procedure
 
@@ -131,7 +133,7 @@ module subroutine exchange_halo(self)
     integer i, j
     real, allocatable :: halo_west(:), halo_east(:)
 
-    call assert(allocated(rhs%s_), "subdomain_2D_t%laplacian: allocated(rhs%s_)")
+    !call assert(allocated(rhs%s_), "subdomain_2D_t%laplacian: allocated(rhs%s_)")
     call assert(allocated(halo_x), "subdomain_2D_t%laplacian: allocated(halo_x)")
 
     allocate(laplacian_rhs(my_nx, ny))
@@ -168,8 +170,8 @@ module subroutine exchange_halo(self)
   end procedure
    
   module procedure exchange_halo
-    if (me>1) halo_x(east,:)[me-1] = self%s_(1,:)
-    if (me<num_subdomains) halo_x(west,:)[me+1] = self%s_(my_nx,:)
+    !if (me>1) halo_x(east,:)[me-1] = self%s_(1,:)
+    !if (me<num_subdomains) halo_x(west,:)[me+1] = self%s_(my_nx,:)
   end procedure
 
   module procedure values
diff --git a/test/matcha_test_m.f90 b/test/matcha_test_m.f90
index 8a2c64e..20ffe4c 100644
--- a/test/matcha_test_m.f90
+++ b/test/matcha_test_m.f90
@@ -67,7 +67,7 @@ function compare_image_distributions() result(test_passes)
   function compare_global_distributions() result(test_passes)
     logical test_passes
     type(output_t) output
-    double precision, allocatable :: simulated_distribution(:,:), frequency(:)
+    double precision, allocatable :: simulated_distribution(:,:), frequency_distribution(:)
     integer num_cells
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
     real, parameter :: tolerance = 1.D-02

From 59bbf8bf91d5afccf98f99e187d51062f568fad0 Mon Sep 17 00:00:00 2001
From: Damian Rouson <rouson@lbl.gov>
Date: Sat, 3 Feb 2024 20:56:05 -0800
Subject: [PATCH 15/15] fix(subdomain): add missing type-bound operators

---
 src/matcha/subdomain_s.f90 | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index ac68f62..f5bfa0c 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -59,6 +59,16 @@
     my_dz = dz_
   end procedure
 
+  module procedure add
+    call assert(allocated(lhs%s_) .and. allocated(rhs%s_), "subdomain_t%add: allocated(rhs%s_)")
+    total%s_ = lhs%s_ +  rhs%s_
+  end procedure
+
+  module procedure multiply
+    call assert(allocated(rhs%s_), "subdomain_t%multiply: allocated(rhs%s_)")
+    product%s_ = lhs +  rhs%s_
+  end procedure
+
   module procedure assign_
     call assert(allocated(rhs%s_), "subdomain_t%assign_: allocated(rhs%s_)")
     lhs%s_ =  rhs%s_
@@ -142,7 +152,6 @@ subroutine apply_boundary_condition(ds)
       if (me==num_subdomains) ds(my_nx,:,:) = 0.
     end subroutine
 
-
   end procedure
 
   pure module function laplacian(rhs) result(laplacian_rhs)