From 3d0ef27c39e8b1e35f31a2b31a2ac0d97838cb7c Mon Sep 17 00:00:00 2001 From: ajpotts Date: Thu, 8 Jan 2026 20:11:44 -0500 Subject: [PATCH] Closes #5272: alignment tests for arkouda.numpy.pdarraysetops --- arkouda/numpy/pdarraysetops.py | 7 +- pytest.ini | 1 + .../pdarraysetops_alignment.py | 331 ++++++++++++++++++ 3 files changed, 336 insertions(+), 3 deletions(-) create mode 100644 tests/numpy/alignment_verification/pdarraysetops_alignment.py diff --git a/arkouda/numpy/pdarraysetops.py b/arkouda/numpy/pdarraysetops.py index db2f6f68c0b..3b0a137957c 100644 --- a/arkouda/numpy/pdarraysetops.py +++ b/arkouda/numpy/pdarraysetops.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: - from arkouda.numpy.pdarraycreation import array, zeros, zeros_like + from arkouda.numpy.pdarraycreation import array, zeros_like from arkouda.numpy.strings import Strings from arkouda.pandas.categorical import Categorical else: @@ -94,6 +94,7 @@ def _in1d_single( array([False True]) """ from arkouda.client import generic_msg + from arkouda.numpy.pdarraycreation import zeros as ak_zeros from arkouda.numpy.strings import Strings from arkouda.pandas.categorical import Categorical as Categorical_ @@ -101,10 +102,10 @@ def _in1d_single( # While isinstance(thing, type) can be called on a tuple of types, # this causes an issue with mypy for unknown reasons. if pda1.size == 0: - return zeros(0, dtype=akbool) + return ak_zeros(0, dtype=akbool) if isinstance(pda2, pdarray) or isinstance(pda2, Strings) or isinstance(pda2, Categorical_): if pda2.size == 0: - return zeros(pda1.size, dtype=akbool) + return ak_zeros(pda1.size, dtype=akbool) if hasattr(pda1, "categories"): x = cast(Categorical_, pda1).in1d(pda2) return x if not invert else ~x diff --git a/pytest.ini b/pytest.ini index 80850f23388..c56c8efe491 100644 --- a/pytest.ini +++ b/pytest.ini @@ -52,6 +52,7 @@ testpaths = tests/numpy/err_test.py tests/numpy/manipulation_functions_test.py tests/numpy/alignment_verification/operators_alignment.py + tests/numpy/alignment_verification/pdarraysetops_alignment.py tests/numpy/numeric_test.py tests/numpy/numpy_test.py tests/numpy/pdarrayclass_test.py diff --git a/tests/numpy/alignment_verification/pdarraysetops_alignment.py b/tests/numpy/alignment_verification/pdarraysetops_alignment.py new file mode 100644 index 00000000000..3274a37f730 --- /dev/null +++ b/tests/numpy/alignment_verification/pdarraysetops_alignment.py @@ -0,0 +1,331 @@ +import numpy as np +import pytest + +import arkouda as ak + + +def _as_np(a): + """Convert arkouda pdarray/Strings to numpy ndarray.""" + # pdarray.to_ndarray exists; Strings.to_ndarray exists too in arkouda + return a.to_ndarray() + + +def _np_struct_from_cols(cols: list[np.ndarray]) -> np.ndarray: + """ + Build a NumPy structured array representing "rows" from multiple 1D columns. + This lets us use np.union1d/intersect1d/setdiff1d/setxor1d on rows. + """ + assert len(cols) >= 1 + n = len(cols[0]) + for c in cols[1:]: + assert len(c) == n + + dtype = [(f"f{i}", cols[i].dtype) for i in range(len(cols))] + out = np.empty(n, dtype=dtype) + for i, c in enumerate(cols): + out[f"f{i}"] = c + return out + + +def _np_setop_rows(op, A_cols, B_cols): + """ + Compute numpy reference for multi-column setops by treating rows as structured scalars. + op: one of np.union1d, np.intersect1d, np.setdiff1d, np.setxor1d + Returns list of numpy arrays (one per column), sorted lexicographically by row. + """ + A_rows = _np_struct_from_cols(A_cols) + B_rows = _np_struct_from_cols(B_cols) + rows = op(A_rows, B_rows) + + # Sort rows to match arkouda's "sorted unique" intent (and stable comparisons) + rows = np.sort(rows) + + # De-structure back into columns + out_cols = [rows[f"f{i}"] for i in range(len(A_cols))] + return out_cols + + +@pytest.mark.requires_chapel_module("In1dMsg") +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +@pytest.mark.parametrize("n", [0, 1, 2, 10, 100]) +def test_in1d_matches_numpy(dtype, n): + rng = np.random.default_rng(12345) + a_np = rng.integers(0, 20, size=n, dtype=np.int64) + b_np = rng.integers(0, 20, size=max(n // 2, 1), dtype=np.int64) + + # Cast for uint64 cases + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + a = ak.array(a_np) + b = ak.array(b_np) + + got = ak.in1d(a, b) + exp = np.in1d(a_np, b_np, assume_unique=False, invert=False) + + assert np.array_equal(_as_np(got), exp) + + got_inv = ak.in1d(a, b, invert=True) + exp_inv = np.in1d(a_np, b_np, assume_unique=False, invert=True) + assert np.array_equal(_as_np(got_inv), exp_inv) + + +@pytest.mark.requires_chapel_module("In1dMsg") +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +def test_in1d_symmetric_matches_numpy(dtype): + rng = np.random.default_rng(2468) + a_np = rng.integers(0, 30, size=50, dtype=np.int64) + b_np = rng.integers(0, 30, size=40, dtype=np.int64) + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + a = ak.array(a_np) + b = ak.array(b_np) + + got_a, got_b = ak.in1d(a, b, symmetric=True) + exp_a = np.in1d(a_np, b_np) + exp_b = np.in1d(b_np, a_np) + assert np.array_equal(_as_np(got_a), exp_a) + assert np.array_equal(_as_np(got_b), exp_b) + + +@pytest.mark.requires_chapel_module("In1dMsg") +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +def test_in1d_assume_unique_raises_when_not_unique(dtype): + # Arkouda explicitly validates uniqueness when assume_unique=True for multi-array path, + # and raises NonUniqueError. This test targets that behavior. + from arkouda.numpy.alignment import NonUniqueError + + a_np = np.array([1, 1, 2, 3], dtype=np.int64) + b_np = np.array([1, 2, 4], dtype=np.int64) + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + a = ak.array(a_np) + b = ak.array(b_np) + + # For scalar pdarray path, arkouda routes through _in1d_single, + # which does not validate uniqueness; so this test uses multi-array + # mode (sequence-of-arrays) which does validate. + A = [a] + B = [b] + with pytest.raises(NonUniqueError): + ak.in1d(A, B, assume_unique=True) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +@pytest.mark.parametrize("n1,n2", [(0, 0), (0, 10), (10, 0), (10, 10), (50, 40)]) +def test_union1d_matches_numpy(dtype, n1, n2): + if (n1 == 0 and n2 > 0) or (n2 == 0 and n1 > 0): + pytest.xfail( + "Known bug: ak.union1d returns non-unique/unsorted when one input is empty; " + "should match np.union1d (sorted unique). Issue #5273." + ) + + rng = np.random.default_rng(999) + a_np = rng.integers(0, 25, size=n1, dtype=np.int64) + b_np = rng.integers(0, 25, size=n2, dtype=np.int64) + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + a = ak.array(a_np) + b = ak.array(b_np) + + got = ak.union1d(a, b) + exp = np.union1d(a_np, b_np) + assert np.array_equal(_as_np(got), exp) + + +@pytest.mark.xfail( + reason="Known bug: ak.union1d returns non-unique/unsorted when one input is empty; " + "should match np.union1d (sorted unique).. Issue #5273.", + strict=False, +) +def test_union1d_empty_left_matches_numpy(): + b_np = np.array([20, 19, 4, 4, 4, 17, 2, 18, 3, 4], dtype=np.int64) + got = ak.union1d(ak.array(np.array([], dtype=np.int64)), ak.array(b_np)) + assert np.array_equal(_as_np(got), np.union1d(np.array([], dtype=np.int64), b_np)) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +@pytest.mark.parametrize("assume_unique", [False, True]) +def test_intersect1d_matches_numpy(dtype, assume_unique): + rng = np.random.default_rng(2024) + a_np = rng.integers(0, 40, size=100, dtype=np.int64) + b_np = rng.integers(0, 40, size=80, dtype=np.int64) + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + if assume_unique: + a_ref = np.unique(a_np) + b_ref = np.unique(b_np) + + a_ak = ak.array(a_ref) + b_ak = ak.array(b_ref) + + got = ak.intersect1d(a_ak, b_ak, assume_unique=True) + exp = np.intersect1d(a_ref, b_ref, assume_unique=True) + else: + a_ak = ak.array(a_np) + b_ak = ak.array(b_np) + + got = ak.intersect1d(a_ak, b_ak, assume_unique=False) + exp = np.intersect1d(a_np, b_np, assume_unique=False) + + assert np.array_equal(_as_np(got), exp) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +@pytest.mark.parametrize("assume_unique", [False, True]) +def test_setdiff1d_matches_numpy(dtype, assume_unique): + rng = np.random.default_rng(777) + a_np = rng.integers(0, 50, size=120, dtype=np.int64) + b_np = rng.integers(0, 50, size=70, dtype=np.int64) + + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + if assume_unique: + a_ref = np.unique(a_np) + b_ref = np.unique(b_np) + + got = ak.setdiff1d(ak.array(a_ref), ak.array(b_ref), assume_unique=True) + exp = np.setdiff1d(a_ref, b_ref, assume_unique=True) + else: + got = ak.setdiff1d(ak.array(a_np), ak.array(b_np), assume_unique=False) + exp = np.setdiff1d(a_np, b_np, assume_unique=False) + + assert np.array_equal(_as_np(got), exp) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +@pytest.mark.parametrize("assume_unique", [False, True]) +def test_setxor1d_matches_numpy(dtype, assume_unique): + rng = np.random.default_rng(31415) + a_np = rng.integers(0, 60, size=100, dtype=np.int64) + b_np = rng.integers(0, 60, size=90, dtype=np.int64) + if dtype == ak.uint64: + a_np = a_np.astype(np.uint64, copy=False) + b_np = b_np.astype(np.uint64, copy=False) + + if assume_unique: + a_ref = np.unique(a_np) + b_ref = np.unique(b_np) + + got = ak.setxor1d(ak.array(a_ref), ak.array(b_ref), assume_unique=True) + exp = np.setxor1d(a_ref, b_ref, assume_unique=True) + else: + got = ak.setxor1d(ak.array(a_np), ak.array(b_np), assume_unique=False) + exp = np.setxor1d(a_np, b_np, assume_unique=False) + + assert np.array_equal(_as_np(got), exp) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +def test_concatenate_ordered_matches_numpy(dtype): + rng = np.random.default_rng(123) + parts = [rng.integers(0, 100, size=s, dtype=np.int64) for s in [0, 5, 1, 10]] + if dtype == ak.uint64: + parts = [p.astype(np.uint64, copy=False) for p in parts] + + ak_parts = [ak.array(p) for p in parts] + got = ak.concatenate(ak_parts, ordered=True) + exp = np.concatenate(parts, axis=0) + + assert np.array_equal(_as_np(got), exp) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +def test_concatenate_unordered_is_multiset_equal(dtype): + rng = np.random.default_rng(456) + parts = [rng.integers(0, 50, size=s, dtype=np.int64) for s in [3, 7, 0, 9]] + if dtype == ak.uint64: + parts = [p.astype(np.uint64, copy=False) for p in parts] + + ak_parts = [ak.array(p) for p in parts] + got = ak.concatenate(ak_parts, ordered=False) + exp = np.concatenate(parts, axis=0) + + # unordered concatenate may interleave; compare as multisets + assert np.array_equal(np.sort(_as_np(got)), np.sort(exp)) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +def test_multiarray_union_intersect_setdiff_setxor_align(dtype): + rng = np.random.default_rng(8888) + + # 2-column "rows" + n1, n2 = 60, 55 + a1 = rng.integers(0, 20, size=n1, dtype=np.int64) + a2 = rng.integers(0, 20, size=n1, dtype=np.int64) + b1 = rng.integers(0, 20, size=n2, dtype=np.int64) + b2 = rng.integers(0, 20, size=n2, dtype=np.int64) + + if dtype == ak.uint64: + a1, a2, b1, b2 = [x.astype(np.uint64, copy=False) for x in (a1, a2, b1, b2)] + + A = [ak.array(a1), ak.array(a2)] + B = [ak.array(b1), ak.array(b2)] + + # union1d (multi) + got_u = ak.union1d(A, B) + exp_u = _np_setop_rows(np.union1d, [a1, a2], [b1, b2]) + assert np.array_equal(_as_np(got_u[0]), exp_u[0]) + assert np.array_equal(_as_np(got_u[1]), exp_u[1]) + + # intersect1d (multi) + got_i = ak.intersect1d(A, B, assume_unique=False) + exp_i = _np_setop_rows(np.intersect1d, [a1, a2], [b1, b2]) + assert np.array_equal(_as_np(got_i[0]), exp_i[0]) + assert np.array_equal(_as_np(got_i[1]), exp_i[1]) + + # setdiff1d (multi): A - B + got_d = ak.setdiff1d(A, B, assume_unique=False) + exp_d = _np_setop_rows(np.setdiff1d, [a1, a2], [b1, b2]) + assert np.array_equal(_as_np(got_d[0]), exp_d[0]) + assert np.array_equal(_as_np(got_d[1]), exp_d[1]) + + # setxor1d (multi) + got_x = ak.setxor1d(A, B, assume_unique=False) + exp_x = _np_setop_rows(np.setxor1d, [a1, a2], [b1, b2]) + assert np.array_equal(_as_np(got_x[0]), exp_x[0]) + assert np.array_equal(_as_np(got_x[1]), exp_x[1]) + + +@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64]) +def test_indexof1d_all_occurrences_remove_missing(dtype): + rng = np.random.default_rng(13579) + space_np = rng.integers(0, 10, size=50, dtype=np.int64) + query_np = rng.integers(0, 10, size=20, dtype=np.int64) + + # Force some missing values by shifting query range + query_np = (query_np + 50).astype(np.int64) + + # Insert some present values as well + query_np[:5] = space_np[:5] + + if dtype == ak.uint64: + space_np = space_np.astype(np.uint64, copy=False) + query_np = query_np.astype(np.uint64, copy=False) + + space = ak.array(space_np) + query = ak.array(query_np) + + got = ak.indexof1d(query, space) + got_np = _as_np(got) + + # Reference: for each query value, emit indices of all matches in space; skip if none. + exp_list = [] + for q in query_np: + hits = np.nonzero(space_np == q)[0] + if hits.size: + exp_list.extend(hits.tolist()) + exp = np.array(exp_list, dtype=np.int64) + + assert np.array_equal(got_np, exp)