From 14e419182cc8be2a664bda79ac1b90f5720a1817 Mon Sep 17 00:00:00 2001
From: Matthew Smith <mjsmith6@illinois.edu>
Date: Tue, 13 May 2025 10:08:37 -0500
Subject: [PATCH 1/2] optimize generate_box_mesh (#458, part 1)

* optimize generate_box_mesh

* optimize boundary processing loop

* fix type annotation

* parameterize dtypes

* simplify definition of a0/a1 for 1D

* use np.column_stack() instead of np.array([...]).T

* avoid potentially creating temporary arrays in vertex assignment

* *_crit -> *_face
---
 meshmode/mesh/generation.py | 341 +++++++++++++++++++++++-------------
 1 file changed, 219 insertions(+), 122 deletions(-)

diff --git a/meshmode/mesh/generation.py b/meshmode/mesh/generation.py
index adc6dcf6..76e44efb 100644
--- a/meshmode/mesh/generation.py
+++ b/meshmode/mesh/generation.py
@@ -1149,7 +1149,10 @@ def generate_box_mesh(
         axis_coords: tuple[np.ndarray, ...],
         order: int = 1, *,
         coord_dtype: Any = np.float64,
-        periodic: bool | None = None,
+        vertex_id_dtype: Any = np.int32,
+        element_id_dtype: Any = np.int32,
+        face_id_dtype: Any = np.int8,
+        periodic: tuple[bool, ...] | None = None,
         group_cls: type[MeshElementGroup] | None = None,
         boundary_tag_to_face: dict[Any, str] | None = None,
         mesh_type: str | None = None,
@@ -1232,7 +1235,7 @@ def generate_box_mesh(
     from pytools import product
     nvertices = product(shape)
 
-    vertex_indices = np.arange(nvertices).reshape(*shape)
+    vertex_indices = np.arange(nvertices, dtype=vertex_id_dtype).reshape(*shape)
 
     vertices = np.empty((dim, *shape), dtype=coord_dtype)
     for idim in range(dim):
@@ -1254,21 +1257,32 @@ def generate_box_mesh(
 
     shape_m1 = tuple(max(si-1, 0) for si in shape)
 
+    box_faces = [
+        side + str(axis)
+        for axis in range(dim)
+        for side in ["-", "+"]]
+
     if dim == 1:
         if mesh_type is not None:
             raise ValueError(f"unsupported mesh type: '{mesh_type}'")
 
         nelements = shape_m1[0]
         nvertices_per_element = 2
-        el_vertices = np.empty((nelements, nvertices_per_element), dtype=np.int32)
 
-        for i in range(shape_m1[0]):
-            # a--b
+        if nelements > 0:
+            a0 = vertex_indices[:shape_m1[0]]
+            a1 = vertex_indices[1:shape[0]]
 
-            a = vertex_indices[i]
-            b = vertex_indices[i+1]
+            el_vertices = np.column_stack([a0, a1])
+            box_face_to_elements = {
+                "-0": np.array([0], dtype=element_id_dtype),
+                "+0": np.array([shape_m1[0]-1], dtype=element_id_dtype)}
 
-            el_vertices[i, :] = (a, b)
+        else:
+            el_vertices = np.empty((0, nvertices_per_element), dtype=vertex_id_dtype)
+            box_face_to_elements = {
+                box_face: np.array([], dtype=element_id_dtype)
+                for box_face in box_faces}
 
     elif dim == 2:
         if is_tp:
@@ -1294,6 +1308,7 @@ def generate_box_mesh(
 
             midpoints = midpoints.reshape((dim, nmidpoints), order="F")
             vertices = np.concatenate((vertices, midpoints), axis=1)
+            nvertices += nmidpoints
 
             nsubelements = 4
             nvertices_per_element = 3
@@ -1306,38 +1321,86 @@ def generate_box_mesh(
             raise ValueError(f"unsupported mesh type: '{mesh_type}'")
 
         nelements = nsubelements * product(shape_m1)
-        el_vertices = np.empty((nelements, nvertices_per_element), dtype=np.int32)
-
-        iel = 0
-        for i in range(shape_m1[0]):
-            for j in range(shape_m1[1]):
-
-                # c--d
-                # |  |
-                # a--b
-
-                a = vertex_indices[i, j]
-                b = vertex_indices[i+1, j]
-                c = vertex_indices[i, j+1]
-                d = vertex_indices[i+1, j+1]
-
-                if is_tp:
-                    el_vertices[iel, :] = (a, b, c, d)
-
-                elif mesh_type == "X":
-                    m = midpoint_indices[i, j]
-                    el_vertices[iel:iel+4, :] = [
-                        (a, b, m),
-                        (b, d, m),
-                        (d, c, m),
-                        (c, a, m)]
 
-                else:
-                    el_vertices[iel:iel+2, :] = [
-                        (a, b, c),
-                        (d, c, b)]
-
-                iel += nsubelements
+        if nelements > 0:
+            base_idx_tuples = np.array(list(np.ndindex(shape_m1)))
+
+            i0 = base_idx_tuples[:, 0]
+            j0 = base_idx_tuples[:, 1]
+
+            a00 = vertex_indices[i0, j0]
+            a01 = vertex_indices[i0, j0+1]
+            a10 = vertex_indices[i0+1, j0]
+            a11 = vertex_indices[i0+1, j0+1]
+
+            if is_tp:
+                el_vertices = np.column_stack([a00, a10, a01, a11])
+
+                box_face_to_elements = {}
+                for box_face in box_faces:
+                    side, axis = box_face
+                    axis = int(axis)
+                    idx_face = 0 if side == "-" else shape_m1[axis] - 1
+                    elements, = np.where(base_idx_tuples[:, axis] == idx_face)
+                    box_face_to_elements[box_face] = elements
+
+            elif mesh_type == "X":
+                m = midpoint_indices[i0, j0]
+                subelement_to_vertices = [
+                    [a00, a10, m],
+                    [a10, a11, m],
+                    [a11, a01, m],
+                    [a01, a00, m]]
+
+                el_vertices = np.empty(
+                    (nelements, nvertices_per_element), dtype=vertex_id_dtype)
+                for isubelement, verts in enumerate(subelement_to_vertices):
+                    for ivert, vert in enumerate(verts):
+                        el_vertices[isubelement::nsubelements, ivert] = vert
+
+                box_face_to_subelement_offset = {
+                    "-0": 3,
+                    "+0": 1,
+                    "-1": 0,
+                    "+1": 2}
+
+                box_face_to_elements = {}
+                for box_face in box_faces:
+                    side, axis = box_face
+                    axis = int(axis)
+                    idx_face = 0 if side == "-" else shape_m1[axis] - 1
+                    box_elements, = np.where(base_idx_tuples[:, axis] == idx_face)
+                    box_face_to_elements[box_face] = \
+                        4*box_elements + box_face_to_subelement_offset[box_face]
+
+            else:
+                subelement_to_vertices = [
+                    [a00, a10, a01],
+                    [a11, a01, a10]]
+
+                el_vertices = np.empty(
+                    (nelements, nvertices_per_element), dtype=vertex_id_dtype)
+                for isubelement, verts in enumerate(subelement_to_vertices):
+                    for ivert, vert in enumerate(verts):
+                        el_vertices[isubelement::nsubelements, ivert] = vert
+
+                side_to_subelement_offset = {
+                    "-": 0,
+                    "+": 1}
+
+                box_face_to_elements = {}
+                for box_face in box_faces:
+                    side, axis = box_face
+                    axis = int(axis)
+                    idx_face = 0 if side == "-" else shape_m1[axis] - 1
+                    box_elements, = np.where(base_idx_tuples[:, axis] == idx_face)
+                    box_face_to_elements[box_face] = \
+                        2*box_elements + side_to_subelement_offset[side]
+        else:
+            el_vertices = np.empty((0, nvertices_per_element), dtype=vertex_id_dtype)
+            box_face_to_elements = {
+                box_face: np.array([], dtype=element_id_dtype)
+                for box_face in box_faces}
 
     elif dim == 3:
         if is_tp:
@@ -1355,38 +1418,73 @@ def generate_box_mesh(
             raise ValueError(f"unsupported mesh type: '{mesh_type}'")
 
         nelements = nsubelements * product(shape_m1)
-        el_vertices = np.empty((nelements, nvertices_per_element), dtype=np.int32)
-
-        iel = 0
-        for i in range(shape_m1[0]):
-            for j in range(shape_m1[1]):
-                for k in range(shape_m1[2]):
-
-                    a000 = vertex_indices[i, j, k]
-                    a001 = vertex_indices[i, j, k+1]
-                    a010 = vertex_indices[i, j+1, k]
-                    a011 = vertex_indices[i, j+1, k+1]
-
-                    a100 = vertex_indices[i+1, j, k]
-                    a101 = vertex_indices[i+1, j, k+1]
-                    a110 = vertex_indices[i+1, j+1, k]
-                    a111 = vertex_indices[i+1, j+1, k+1]
-
-                    if is_tp:
-                        el_vertices[iel, :] = (
-                            a000, a100, a010, a110,
-                            a001, a101, a011, a111)
-
-                    else:
-                        el_vertices[iel:iel+6, :] = [
-                            (a000, a100, a010, a001),
-                            (a101, a100, a001, a010),
-                            (a101, a011, a010, a001),
-                            (a100, a010, a101, a110),
-                            (a011, a010, a110, a101),
-                            (a011, a111, a101, a110)]
-
-                    iel += nsubelements
+
+        if nelements > 0:
+            base_idx_tuples = np.array(list(np.ndindex(shape_m1)))
+
+            i0 = base_idx_tuples[:, 0]
+            j0 = base_idx_tuples[:, 1]
+            k0 = base_idx_tuples[:, 2]
+
+            a000 = vertex_indices[i0, j0, k0]
+            a001 = vertex_indices[i0, j0, k0+1]
+            a010 = vertex_indices[i0, j0+1, k0]
+            a011 = vertex_indices[i0, j0+1, k0+1]
+            a100 = vertex_indices[i0+1, j0, k0]
+            a101 = vertex_indices[i0+1, j0, k0+1]
+            a110 = vertex_indices[i0+1, j0+1, k0]
+            a111 = vertex_indices[i0+1, j0+1, k0+1]
+
+            if is_tp:
+                el_vertices = np.column_stack([
+                    a000, a100, a010, a110,
+                    a001, a101, a011, a111])
+
+                box_face_to_elements = {}
+                for box_face in box_faces:
+                    side, axis = box_face
+                    axis = int(axis)
+                    idx_face = 0 if side == "-" else shape_m1[axis] - 1
+                    elements, = np.where(base_idx_tuples[:, axis] == idx_face)
+                    box_face_to_elements[box_face] = elements
+
+            else:
+                subelement_to_vertices = [
+                    [a000, a100, a010, a001],
+                    [a101, a100, a001, a010],
+                    [a101, a011, a010, a001],
+                    [a100, a010, a101, a110],
+                    [a011, a010, a110, a101],
+                    [a011, a111, a101, a110]]
+
+                el_vertices = np.empty(
+                    (nelements, nvertices_per_element), dtype=vertex_id_dtype)
+                for isubelement, verts in enumerate(subelement_to_vertices):
+                    for ivert, vert in enumerate(verts):
+                        el_vertices[isubelement::nsubelements, ivert] = vert
+
+                box_face_to_subelement_offsets = {
+                    "-0": (0, 2),
+                    "+0": (3, 5),
+                    "-1": (0, 1),
+                    "+1": (4, 5),
+                    "-2": (0, 3),
+                    "+2": (2, 5)}
+
+                box_face_to_elements = {}
+                for box_face in box_faces:
+                    side, axis = box_face
+                    axis = int(axis)
+                    idx_face = 0 if side == "-" else shape_m1[axis] - 1
+                    box_elements, = np.where(base_idx_tuples[:, axis] == idx_face)
+                    box_face_to_elements[box_face] = np.concatenate([
+                        6*box_elements + offset
+                        for offset in box_face_to_subelement_offsets[box_face]])
+        else:
+            el_vertices = np.empty((0, nvertices_per_element), dtype=vertex_id_dtype)
+            box_face_to_elements = {
+                box_face: np.array([], dtype=element_id_dtype)
+                for box_face in box_faces}
 
     else:
         raise NotImplementedError("box meshes of dimension %d" % dim)
@@ -1408,61 +1506,57 @@ def generate_box_mesh(
 
     facial_adjacency_groups = None
     face_vertex_indices_to_tags = {}
-    boundary_tags = list(boundary_tag_to_face.keys())
-    nbnd_tags = len(boundary_tags)
 
-    if nbnd_tags > 0:
-        vert_index_to_tuple = {
-                vertex_indices[itup]: itup
-                for itup in np.ndindex(shape)}
+    if boundary_tag_to_face:
+        box_face_to_tags = {}
+        for tag, tagged_box_faces in boundary_tag_to_face.items():
+            for box_face in tagged_box_faces:
+                if len(box_face) != 2:
+                    raise ValueError(
+                        f"face identifier '{box_face}' does not "
+                        "consist of exactly two characters")
+                side, axis = box_face
+                try:
+                    axis = axes.index(axis)
+                except ValueError as exc:
+                    raise ValueError(
+                        "unrecognized axis in face identifier "
+                        f"'{box_face}'") from exc
+                if axis >= dim:
+                    raise ValueError(f"axis in face identifier '{box_face}' "
+                                     f"does not exist in {dim}D")
+                if side not in ("-", "+"):
+                    raise ValueError("first character of face identifier"
+                                     f" '{box_face}' is not '+' or '-'")
+                box_face_to_tags.setdefault(f"{side}{axis}", []).append(tag)
+
+        # Extra vertices added beyond the box vertices don't have corresponding
+        # tuples, so assign them tuple(-1, ...)
+        vert_index_to_tuple = np.full((nvertices, dim), -1)
+        for itup in np.ndindex(shape):
+            vert_index_to_tuple[vertex_indices[itup], :] = itup
+
+        for box_face, elements in box_face_to_elements.items():
+            try:
+                tags = box_face_to_tags[box_face]
+            except KeyError:
+                continue
+            side, axis = box_face
+            axis = int(axis)
+            vert_face = 0 if side == "-" else shape[axis] - 1
 
-        for ielem in range(0, grp.nelements):
             for ref_fvi in grp.face_vertex_indices():
-                fvi = grp.vertex_indices[ielem, ref_fvi]
-                try:
-                    fvi_tuples = [vert_index_to_tuple[i] for i in fvi]
-                except KeyError:
-                    # Happens for interior faces of "X" meshes because
-                    # midpoints aren't in vert_index_to_tuple. We don't
-                    # care about them.
-                    continue
-
-                for tag in boundary_tags:
-                    # Need to map the correct face vertices to the boundary tags
-                    for face in boundary_tag_to_face[tag]:
-                        if len(face) != 2:
-                            raise ValueError(
-                                "face identifier '%s' does not "
-                                "consist of exactly two characters" % face)
-
-                        side, axis = face
-                        try:
-                            axis = axes.index(axis)
-                        except ValueError as exc:
-                            raise ValueError(
-                                "unrecognized axis in face identifier "
-                                f"'{face}'") from exc
-                        if axis >= dim:
-                            raise ValueError("axis in face identifier '%s' "
-                                             "does not exist in %dD" % (face, dim))
-
-                        if side == "-":
-                            vert_crit = 0
-                        elif side == "+":
-                            vert_crit = shape[axis] - 1
-                        else:
-                            raise ValueError("first character of face identifier"
-                                             " '%s' is not '+' or '-'" % face)
-
-                        if all(fvi_tuple[axis] == vert_crit
-                               for fvi_tuple in fvi_tuples):
-                            key = frozenset(fvi)
-                            face_vertex_indices_to_tags.setdefault(key,
-                                                                   []).append(tag)
+                fvis = grp.vertex_indices[elements[:, np.newaxis], ref_fvi]
+                fvi_tuples = vert_index_to_tuple[fvis, :]
+                face_fvis = fvis[
+                    np.all(fvi_tuples[:, :, axis] == vert_face, axis=1), :]
+                for iface in range(face_fvis.shape[0]):
+                    fvi = face_fvis[iface, :]
+                    face_vertex_indices_to_tags[frozenset(fvi)] = tags
 
         from meshmode.mesh import _compute_facial_adjacency_from_vertices
         facial_adjacency_groups = _compute_facial_adjacency_from_vertices(
-                [grp], np.int32, np.int8, face_vertex_indices_to_tags)
+                [grp], element_id_dtype, face_id_dtype, face_vertex_indices_to_tags)
     else:
         facial_adjacency_groups = None
 
@@ -1470,7 +1564,10 @@ def generate_box_mesh(
 
     mesh = make_mesh(vertices, [grp],
             facial_adjacency_groups=facial_adjacency_groups,
-            is_conforming=True)
+            is_conforming=True,
+            vertex_id_dtype=vertex_id_dtype,
+            element_id_dtype=element_id_dtype,
+            face_id_dtype=face_id_dtype)
 
     if any(periodic):
         from meshmode import AffineMap
@@ -1478,7 +1575,7 @@ def generate_box_mesh(
         bdry_pair_mappings_and_tols = []
         for idim in range(dim):
             if periodic[idim]:
-                offset = np.zeros(dim, dtype=np.float64)
+                offset = np.zeros(dim, dtype=coord_dtype)
                 offset[idim] = axis_coords[idim][-1] - axis_coords[idim][0]
                 bdry_pair_mappings_and_tols.append((
                     BoundaryPairMapping(

From 9f9016faf81a4f798219577d419d793bdb13d01f Mon Sep 17 00:00:00 2001
From: Matthew Smith <mjsmith6@illinois.edu>
Date: Tue, 13 May 2025 10:15:43 -0500
Subject: [PATCH 2/2] combine/optimize _match_vertices and
 find_point_permutation implementations (#458, part 2)

* optimize vertex matching in periodicity processing

* use np.median instead of approximating with np.histogram

    turns out to be just as fast

* combine _match_vertices and mesh.tools.find_point_permutation into find_point_to_point_mapping

* add tests for find_point_to_point_mapping

* use find_point_to_point_mapping instead of find_point_permutation
---
 meshmode/discretization/connection/direct.py |  13 +-
 meshmode/mesh/processing.py                  | 135 ++++-------------
 meshmode/mesh/tools.py                       | 147 ++++++++++++++++---
 test/test_mesh.py                            | 124 ++++++++++++++++
 4 files changed, 291 insertions(+), 128 deletions(-)

diff --git a/meshmode/discretization/connection/direct.py b/meshmode/discretization/connection/direct.py
index a0763ccd..a6f03280 100644
--- a/meshmode/discretization/connection/direct.py
+++ b/meshmode/discretization/connection/direct.py
@@ -422,12 +422,17 @@ def _resample_point_pick_indices(self, to_group_index: int, ibatch_index: int,
         ibatch = self.groups[to_group_index].batches[ibatch_index]
         from_grp = self.from_discr.groups[ibatch.from_group_index]
 
-        from meshmode.mesh.tools import find_point_permutation
-        return find_point_permutation(
-                    targets=ibatch.result_unit_nodes,
-                    permutees=from_grp.unit_nodes,
+        from meshmode.mesh.tools import find_point_to_point_mapping
+        src_idx_to_tgt_idx = find_point_to_point_mapping(
+                    src_points=ibatch.result_unit_nodes,
+                    tgt_points=from_grp.unit_nodes,
                     tol_multiplier=tol_multiplier)
 
+        return (
+            src_idx_to_tgt_idx
+            if np.all(src_idx_to_tgt_idx >= 0)
+            else None)
+
     @keyed_memoize_method(lambda actx, to_group_index, ibatch_index,
             tol_multiplier=None: (to_group_index, ibatch_index, tol_multiplier))
     def _frozen_resample_point_pick_indices(self, actx: ArrayContext,
diff --git a/meshmode/mesh/processing.py b/meshmode/mesh/processing.py
index f99e0a0a..1698665b 100644
--- a/meshmode/mesh/processing.py
+++ b/meshmode/mesh/processing.py
@@ -25,7 +25,8 @@
 from collections.abc import Callable, Mapping, Sequence
 from dataclasses import dataclass, replace
 from functools import reduce
-from typing import Any, Literal
+from typing import Literal
+from warnings import warn
 
 import numpy as np
 import numpy.linalg as la
@@ -45,7 +46,7 @@
     _FaceIDs,
     make_mesh,
 )
-from meshmode.mesh.tools import AffineMap, find_point_permutation
+from meshmode.mesh.tools import AffineMap, find_point_to_point_mapping
 
 
 __doc__ = """
@@ -588,18 +589,18 @@ def evec(i: int) -> np.ndarray:
         result[i] = 1
         return result
 
-    def unpack_single(ary: np.ndarray | None) -> np.ndarray:
+    def unpack_single(ary: np.ndarray | None) -> int:
         assert ary is not None
         item, = ary
         return item
 
-    base_vertex_index = unpack_single(find_point_permutation(
-             targets=-np.ones(grp.dim),
-             permutees=grp.vertex_unit_coordinates().T))
+    base_vertex_index = unpack_single(find_point_to_point_mapping(
+             src_points=-np.ones(grp.dim).reshape(-1, 1),
+             tgt_points=grp.vertex_unit_coordinates().T))
     spanning_vertex_indices = [
-        unpack_single(find_point_permutation(
-                     targets=-np.ones(grp.dim) + 2 * evec(i),
-                     permutees=grp.vertex_unit_coordinates().T))
+        unpack_single(find_point_to_point_mapping(
+                     src_points=(-np.ones(grp.dim) + 2 * evec(i)).reshape(-1, 1),
+                     tgt_points=grp.vertex_unit_coordinates().T))
         for i in range(grp.dim)
     ]
 
@@ -747,11 +748,10 @@ def _get_tensor_product_element_flip_matrix_and_vertex_permutation(
                  unit_flip_matrix,
                  grp.vertex_unit_coordinates().T)
 
-    vertex_permutation_to = find_point_permutation(
-        targets=flipped_vertices,
-        permutees=grp.vertex_unit_coordinates().T,
-    )
-    if vertex_permutation_to is None:
+    vertex_permutation_to = find_point_to_point_mapping(
+        src_points=flipped_vertices,
+        tgt_points=grp.vertex_unit_coordinates().T)
+    if np.any(vertex_permutation_to < 0):
         raise RuntimeError("flip permutation was not found")
 
     flipped_unit_nodes = np.einsum("ij,jn->in", unit_flip_matrix, grp.unit_nodes)
@@ -1020,85 +1020,6 @@ def split_mesh_groups(
 # }}}
 
 
-# {{{ vertex matching
-
-def _match_vertices(
-        mesh: Mesh,
-        src_vertex_indices: np.ndarray,
-        tgt_vertex_indices: np.ndarray, *,
-        aff_map: AffineMap | None = None,
-        tol: float = 1e-12,
-        use_tree: bool | None = None) -> np.ndarray:
-    if mesh.vertices is None:
-        raise ValueError("Mesh must have vertices")
-
-    if aff_map is None:
-        aff_map = AffineMap()
-
-    if use_tree is None:
-        # Empirically, the tree version becomes faster at 2**13.
-        # The temporary (displacements) below at that size requires
-        # 1.6GB, which seems like a lot. Capping at 2**11 instead,
-        # which requires a more reasonable 100M.
-        use_tree = len(tgt_vertex_indices) >= 2**11
-
-    src_vertices = mesh.vertices[:, src_vertex_indices]
-    tgt_vertices = mesh.vertices[:, tgt_vertex_indices]
-
-    mapped_src_vertices = aff_map(src_vertices)
-
-    if use_tree:
-        tgt_vertex_bboxes = np.stack((
-            tgt_vertices - tol,
-            tgt_vertices + tol))
-
-        from pytools.spatial_btree import SpatialBinaryTreeBucket
-        tree = SpatialBinaryTreeBucket(
-            np.min(tgt_vertex_bboxes[0], axis=1),
-            np.max(tgt_vertex_bboxes[1], axis=1))
-        for ivertex in range(len(tgt_vertex_indices)):
-            tree.insert(ivertex, tgt_vertex_bboxes[:, :, ivertex])
-
-        matched_tgt_vertices: np.ndarray[tuple[int, ...], np.dtype[Any]] \
-            = np.full(len(src_vertex_indices), -1)
-        for ivertex in range(len(src_vertex_indices)):
-            mapped_src_vertex = mapped_src_vertices[:, ivertex]
-            matches = np.array(list(tree.generate_matches(mapped_src_vertex)))
-            match_bboxes = tgt_vertex_bboxes[:, :, matches]
-            in_bbox = np.all(
-                (mapped_src_vertex[:, np.newaxis] >= match_bboxes[0, :, :])
-                & (mapped_src_vertex[:, np.newaxis] <= match_bboxes[1, :, :]),
-                axis=0)
-            candidate_indices = matches[in_bbox]
-            if len(candidate_indices) == 0:
-                continue
-            displacements = (
-                mapped_src_vertex.reshape(-1, 1)
-                - tgt_vertices[:, candidate_indices])
-            distances_sq = np.sum(displacements**2, axis=0)
-            matched_tgt_vertices[ivertex] = (
-                tgt_vertex_indices[candidate_indices[np.argmin(distances_sq)]])
-
-    else:
-        displacements = (
-            mapped_src_vertices.reshape(mesh.dim, -1, 1)
-            - tgt_vertices.reshape(mesh.dim, 1, -1))
-        distances_sq = np.sum(displacements**2, axis=0)
-
-        vertex_indices, = np.indices((len(src_vertex_indices),))
-        min_distance_sq_indices = np.argmin(distances_sq, axis=1)
-        min_distances_sq = distances_sq[vertex_indices, min_distance_sq_indices]
-
-        matched_tgt_vertices = np.where(
-            min_distances_sq < tol**2,
-            tgt_vertex_indices[min_distance_sq_indices],
-            -1)
-
-    return matched_tgt_vertices
-
-# }}}
-
-
 # {{{ boundary face matching
 
 @dataclass(frozen=True)
@@ -1170,8 +1091,8 @@ def _get_face_vertex_indices(mesh: Mesh, face_ids: _FaceIDs) -> np.ndarray:
 
 
 def _match_boundary_faces(
-        mesh: Mesh, bdry_pair_mapping: BoundaryPairMapping, tol: float, *,
-        use_tree: bool | None = None) -> tuple[_FaceIDs, _FaceIDs]:
+        mesh: Mesh, bdry_pair_mapping: BoundaryPairMapping, tol: float,
+        ) -> tuple[_FaceIDs, _FaceIDs]:
     """
     Given a :class:`BoundaryPairMapping` *bdry_pair_mapping*, return the
     correspondence between faces of the two boundaries (expressed as a pair of
@@ -1182,8 +1103,6 @@ def _match_boundary_faces(
         whose faces are to be matched.
     :arg tol: The allowed tolerance between the transformed vertex coordinates of
         the first boundary and the vertex coordinates of the second boundary.
-    :arg use_tree: Optional argument indicating whether to use a spatial binary
-        search tree or a (quadratic) numpy algorithm when matching vertices.
     :returns: A pair of :class:`meshmode.mesh._FaceIDs`, each having a number of
         entries equal to the number of faces in the boundary, that represents the
         correspondence between the two boundaries' faces. The first element in the
@@ -1217,12 +1136,15 @@ def _match_boundary_faces(
     bdry_n_vertex_indices = np.unique(bdry_n_face_vertex_indices)
     bdry_n_vertex_indices = bdry_n_vertex_indices[bdry_n_vertex_indices >= 0]
 
-    matched_bdry_n_vertex_indices = _match_vertices(
-        mesh, bdry_m_vertex_indices, bdry_n_vertex_indices,
-        aff_map=bdry_pair_mapping.aff_map, tol=tol, use_tree=use_tree)
+    bdry_m_vertices = mesh.vertices[:, bdry_m_vertex_indices]
+    bdry_n_vertices = mesh.vertices[:, bdry_n_vertex_indices]
+
+    m_idx_to_n_idx = find_point_to_point_mapping(
+        bdry_pair_mapping.aff_map(bdry_m_vertices),
+        bdry_n_vertices)
 
     unmatched_bdry_m_vertex_indices = bdry_m_vertex_indices[
-        np.where(matched_bdry_n_vertex_indices < 0)[0]]
+        np.where(m_idx_to_n_idx < 0)[0]]
     nunmatched = len(unmatched_bdry_m_vertex_indices)
     if nunmatched > 0:
         vertices = mesh.vertices[:, unmatched_bdry_m_vertex_indices]
@@ -1235,6 +1157,9 @@ def _match_boundary_faces(
                 for i in range(min(nunmatched, 10))])
             + f"\n...\n({nunmatched-10} more omitted.)" if nunmatched > 10 else "")
 
+    matched_bdry_n_vertex_indices = bdry_n_vertex_indices[
+        m_idx_to_n_idx]
+
     from meshmode.mesh import _concatenate_face_ids
     face_ids = _concatenate_face_ids([bdry_m_face_ids, bdry_n_face_ids])
 
@@ -1291,13 +1216,15 @@ def glue_mesh_boundaries(
         coordinates of the first boundary and the vertex coordinates of the second
         boundary when attempting to match the two. Pass at most one mapping for each
         unique (order-independent) pair of boundaries.
-    :arg use_tree: Optional argument indicating whether to use a spatial binary
-        search tree or a (quadratic) numpy algorithm when matching vertices.
     """
     if any(grp.vertex_indices is None for grp in mesh.groups):
         raise ValueError(
             "gluing mesh boundaries requires 'vertex_indices' in all groups")
 
+    if use_tree is not None:
+        warn("Passing 'use_tree' is deprecated and will be removed "
+             "in Q3 2025.", DeprecationWarning, stacklevel=2)
+
     glued_btags = {
         btag
         for mapping, _ in bdry_pair_mappings_and_tols
@@ -1320,7 +1247,7 @@ def glue_mesh_boundaries(
         glued_btag_pairs.add(btag_pair)
 
     face_id_pairs_for_mapping = [
-        _match_boundary_faces(mesh, mapping, tol, use_tree=use_tree)
+        _match_boundary_faces(mesh, mapping, tol)
         for mapping, tol in bdry_pair_mappings_and_tols]
 
     facial_adjacency_groups = []
diff --git a/meshmode/mesh/tools.py b/meshmode/mesh/tools.py
index 9a59873d..fdb16113 100644
--- a/meshmode/mesh/tools.py
+++ b/meshmode/mesh/tools.py
@@ -20,6 +20,8 @@
 THE SOFTWARE.
 """
 
+from warnings import warn
+
 import numpy as np
 import numpy.linalg as la
 
@@ -29,6 +31,8 @@
 
 
 __doc__ = """
+.. autofunction:: find_point_to_point_mapping
+
 .. currentmodule:: meshmode
 
 .. autoclass:: AffineMap
@@ -227,38 +231,141 @@ def find_point_permutation(
     :arg permutees: shaped ``(dim, npoints)``
     :returns: a "from"-style permutation, or None if none was found.
     """
+    warn("find_point_permutation is deprecated and will be removed in Q3 2025. Use "
+         "find_point_to_point_mapping instead.", DeprecationWarning, stacklevel=2)
 
-    if len(targets.shape) == 1:
+    single_target = len(targets.shape) == 1
+    if single_target:
         targets = targets.reshape(-1, 1)
 
-    if tol_multiplier is None:
-        tol_multiplier = 250
+    target_idx_to_permutee_idx = find_point_to_point_mapping(
+        src_points=targets,
+        tgt_points=permutees,
+        tol_multiplier=tol_multiplier)
 
-    tol = np.finfo(targets.dtype).eps * tol_multiplier
+    return (
+        target_idx_to_permutee_idx
+        if np.all(target_idx_to_permutee_idx >= 0)
+        else None)
 
-    dim, ntgt_nodes = targets.shape
-    if dim == 0:
-        assert ntgt_nodes == 1
-        return np.array([0], dtype=np.int16)
 
-    dist_vecs = (targets.reshape(dim, -1, 1)
-            - permutees.reshape(dim, 1, -1))
-    dists = la.norm(dist_vecs, axis=0, ord=2)
+def find_point_to_point_mapping(
+        src_points: np.ndarray,
+        tgt_points: np.ndarray, *,
+        tol: float | None = None,
+        tol_multiplier: float | None = None,
+        _max_leaf_points: int | None = None) -> np.ndarray:
+    """
+    Compute a mapping from indices of points in *src_points* to the matching
+    indices of points in *tgt_points*.
+
+    :arg src_points: shaped ``(dim, npoints)`` or just ``(dim,)`` if a single point
+    :arg tgt_points: shaped ``(dim, npoints)``
+    :arg tol: the maximum distance allowed between matching points.
+    :arg tol_multiplier: another way of specifying tolerance: as a multiplier of
+        machine epsilon. Must not specify both *tol* and *tol_multiplier*.
+
+    :returns: an array storing an index into *tgt_points* for each point in
+        *src_points* if a match exists or ``-1`` if not.
+    """
+
+    if tol is not None and tol_multiplier is not None:
+        raise ValueError("cannot specify both tol and tol_multiplier.")
+    elif tol is None:
+        if tol_multiplier is None:
+            tol_multiplier = 250
+        tol = np.finfo(src_points.dtype).eps * tol_multiplier
+
+    if _max_leaf_points is None:
+        _max_leaf_points = 2**8  # *shrug*
 
-    assert ntgt_nodes < 2**16
+    src_dim, n_src_points = src_points.shape
+    tgt_dim, n_tgt_points = tgt_points.shape
 
-    result: np.ndarray = np.zeros(ntgt_nodes, dtype=np.int16)
+    if tgt_dim != src_dim:
+        raise ValueError(
+            "source and target points must have the same ambient dimension.")
 
-    for irow in range(ntgt_nodes):
-        close_indices, = np.where(dists[irow] < tol)
+    dim = src_dim
+
+    if dim == 0:
+        assert n_tgt_points == 1
+        return np.array([0])
 
-        if len(close_indices) != 1:
-            return None
+    if n_src_points + n_tgt_points <= _max_leaf_points:
+        displacements = (
+            src_points.reshape(dim, -1, 1)
+            - tgt_points.reshape(dim, 1, -1))
+        distances_sq = np.sum(displacements**2, axis=0)
 
-        close_index, = close_indices
-        result[irow] = close_index
+        src_indices, = np.indices((n_src_points,))
+        min_distance_sq_indices = np.argmin(distances_sq, axis=1)
+        min_distances_sq = distances_sq[src_indices, min_distance_sq_indices]
 
-    return result
+        src_idx_to_tgt_idx = np.where(
+            min_distances_sq < tol**2,
+            min_distance_sq_indices,
+            -1)
+
+        return src_idx_to_tgt_idx
+
+    else:
+        both_points = np.concatenate((tgt_points, src_points), axis=1)
+
+        idim_largest = np.argmax(
+            np.max(both_points, axis=1)
+            - np.min(both_points, axis=1))
+
+        median_coord = np.median(both_points[idim_largest, :])
+
+        lower_src_idx_to_full_idx, = np.where(
+            src_points[idim_largest, :] <= median_coord + tol)
+        lower_tgt_idx_to_full_idx, = np.where(
+            tgt_points[idim_largest, :] <= median_coord + tol)
+
+        upper_src_idx_to_full_idx, = np.where(
+            src_points[idim_largest, :] >= median_coord - tol)
+        upper_tgt_idx_to_full_idx, = np.where(
+            tgt_points[idim_largest, :] >= median_coord - tol)
+
+        if __debug__:
+            n_lower_points = (
+                len(lower_src_idx_to_full_idx)
+                + len(lower_tgt_idx_to_full_idx))
+            n_upper_points = (
+                len(upper_src_idx_to_full_idx)
+                + len(upper_tgt_idx_to_full_idx))
+            if (
+                    # No theoretical justification for using 2/3, it just seems like
+                    # a reasonable number
+                    n_lower_points > 2*both_points.shape[1]/3
+                    or n_upper_points > 2*both_points.shape[1]/3):
+                warn(
+                    "bad partitioning of points, performance may be degraded.",
+                    stacklevel=2)
+
+        lower_src_idx_to_tgt_idx = find_point_to_point_mapping(
+            src_points[:, lower_src_idx_to_full_idx],
+            tgt_points[:, lower_tgt_idx_to_full_idx],
+            tol=tol,
+            _max_leaf_points=_max_leaf_points)
+
+        upper_src_idx_to_tgt_idx = find_point_to_point_mapping(
+            src_points[:, upper_src_idx_to_full_idx],
+            tgt_points[:, upper_tgt_idx_to_full_idx],
+            tol=tol,
+            _max_leaf_points=_max_leaf_points)
+
+        matched_lower_points, = np.where(lower_src_idx_to_tgt_idx >= 0)
+        matched_upper_points, = np.where(upper_src_idx_to_tgt_idx >= 0)
+
+        src_idx_to_tgt_idx = np.full(n_src_points, -1)
+        src_idx_to_tgt_idx[lower_src_idx_to_full_idx[matched_lower_points]] = \
+            lower_tgt_idx_to_full_idx[lower_src_idx_to_tgt_idx[matched_lower_points]]
+        src_idx_to_tgt_idx[upper_src_idx_to_full_idx[matched_upper_points]] = \
+            upper_tgt_idx_to_full_idx[upper_src_idx_to_tgt_idx[matched_upper_points]]
+
+        return src_idx_to_tgt_idx
 
 # }}}
 
diff --git a/test/test_mesh.py b/test/test_mesh.py
index 401a4842..8b672f2e 100644
--- a/test/test_mesh.py
+++ b/test/test_mesh.py
@@ -818,6 +818,130 @@ def test_lookup_tree(visualize=False):
 # }}}
 
 
+# {{{ test point matching
+
+def _find_point_to_point_mapping_brute_force(src_points, tgt_points, *,
+        tol, return_second_match=False):
+    d, n_src_points = src_points.shape
+
+    displacements = (
+        src_points.reshape(d, -1, 1)
+        - tgt_points.reshape(d, 1, -1))
+    distances_sq = np.sum(displacements**2, axis=0)
+
+    src_indices, = np.indices((n_src_points,))
+
+    min_distance_sq_indices = np.argmin(distances_sq, axis=1)
+    min_distances_sq = distances_sq[src_indices, min_distance_sq_indices]
+
+    src_idx_to_tgt_idx = np.where(
+        min_distances_sq < tol**2,
+        min_distance_sq_indices,
+        -1)
+
+    if return_second_match:
+        max_dist_sq = np.max(distances_sq)
+        distances_sq[src_indices, min_distance_sq_indices] = 2*max_dist_sq
+        min_distance_sq_indices = np.argmin(distances_sq, axis=1)
+        min_distances_sq = distances_sq[src_indices, min_distance_sq_indices]
+
+        src_idx_to_tgt_idx_2nd = np.where(
+            min_distances_sq < tol**2,
+            min_distance_sq_indices,
+            -1)
+
+        return src_idx_to_tgt_idx, src_idx_to_tgt_idx_2nd
+    else:
+        return src_idx_to_tgt_idx
+
+
+def test_point_matching():
+    from meshmode.mesh.tools import find_point_to_point_mapping
+
+    rng = np.random.default_rng(seed=42)
+
+    # {{{ Sanity check _find_point_to_point_mapping_brute_force
+
+    x, y = np.meshgrid(np.linspace(0., 1., 50), np.linspace(0., 1., 50))
+    points = np.stack((x.flatten(), y.flatten()))
+    permuted_points_to_points = rng.permutation(points.shape[1])
+    adjusted_points = points[:, permuted_points_to_points]
+    adjusted_points_to_points = _find_point_to_point_mapping_brute_force(
+        src_points=adjusted_points,
+        tgt_points=points,
+        tol=1e-10)
+    assert np.all(adjusted_points_to_points == permuted_points_to_points)
+
+    # }}}
+
+    # {{{ 0D
+
+    points = np.empty((0, 1))
+    points_to_points = find_point_to_point_mapping(
+        src_points=points,
+        tgt_points=points)
+    assert np.all(points_to_points == np.array([0]))
+
+    # }}}
+
+    # {{{ 1D through 3D
+
+    npoints = 1000
+    tol = 1e-10
+
+    for d in range(1, 4):
+        # Generate arrays of point data that don't contain points that are close
+        # to each other so that we can add perturbations without worrying about them
+        # getting too close to others
+        point_data = []
+        nattempts = 0
+        while len(point_data) < 10 and nattempts < 100:
+            points = npoints**(1/d) * rng.random(size=(d, npoints))
+
+            _, points_to_other = \
+                _find_point_to_point_mapping_brute_force(
+                    src_points=points,
+                    tgt_points=points,
+                    tol=100*tol,
+                    return_second_match=True)
+
+            if not np.any(points_to_other >= 0):
+                point_data.append(points)
+
+            nattempts += 1
+
+        if len(point_data) < 10:
+            pytest.fail("unable to generate enough input data.")
+
+        for points in point_data:
+            # Permuted and perturbed by a random amount in a random direction
+            permuted_points_to_points = rng.permutation(npoints)
+            adjusted_points = (
+                points[:, permuted_points_to_points]
+                # Can't quite go to 100% of tol due to cancellation error
+                + 0.9 * tol * (2 * rng.random(size=(d, npoints)) - 1)/np.sqrt(d))
+            adjusted_points_to_points = find_point_to_point_mapping(
+                src_points=adjusted_points,
+                tgt_points=points,
+                tol=tol)
+
+            assert np.all(adjusted_points_to_points >= 0)
+            assert np.all(adjusted_points_to_points == permuted_points_to_points)
+
+            # Perturbed too far
+            unmatchable_point = points[:, 0] + 2*tol
+            unmatchable_point_to_point = find_point_to_point_mapping(
+                src_points=unmatchable_point.reshape(-1, 1),
+                tgt_points=points,
+                tol=tol)
+
+            assert unmatchable_point_to_point == -1
+
+    # }}}
+
+# }}}
+
+
 # {{{ test boundary tags
 
 def test_boundary_tags():