From 7ca62f68387978eb2601e2ed1238839ee455b6b9 Mon Sep 17 00:00:00 2001
From: Ryan Keck
Date: Mon, 26 Jan 2026 11:43:07 -0500
Subject: [PATCH] Closes #5175: overloads for in1d

---
 arkouda/numpy/pdarraysetops.py | 56 +++++++++++++++-----
 arkouda/numpy/util.py          | 93 ++++++++++++++++++++++++++++++----
 arkouda/pandas/groupbyclass.py |  1 +
 arkouda/pandas/join.py         |  6 +--
 arkouda/pandas/series.py       | 41 +++++++++++++--
 tests/numpy/setops_test.py     | 37 ++++++++++++++
 tests/pandas/series_test.py    | 46 +++++++++++++++++
 7 files changed, 251 insertions(+), 29 deletions(-)

diff --git a/arkouda/numpy/pdarraysetops.py b/arkouda/numpy/pdarraysetops.py
index 0f522d885c4..30d4510fd3f 100644
--- a/arkouda/numpy/pdarraysetops.py
+++ b/arkouda/numpy/pdarraysetops.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Literal, Sequence, Tuple, TypeVar, Union, cast, overload
 
 import numpy as np
 
@@ -19,7 +19,7 @@
 if TYPE_CHECKING:
-    from arkouda.numpy.pdarraycreation import array, zeros, zeros_like
+    from arkouda.numpy.pdarraycreation import array, zeros_like
     from arkouda.numpy.strings import Strings
     from arkouda.pandas.categorical import Categorical
 else:
@@ -94,6 +94,7 @@ def _in1d_single(
     array([False True])
     """
     from arkouda.client import generic_msg
+    from arkouda.numpy.pdarraycreation import zeros
     from arkouda.numpy.strings import Strings
     from arkouda.pandas.categorical import Categorical as Categorical_
 
@@ -138,6 +139,26 @@ def _in1d_single(
         raise TypeError("Both pda1 and pda2 must be pdarray, Strings, or Categorical")
 
 
+@overload
+def in1d(
+    A: groupable,
+    B: groupable,
+    assume_unique: bool = ...,
+    symmetric: Literal[False] = ...,
+    invert: bool = ...,
+) -> pdarray: ...
+
+
+# Keyword-only, no default: if both overloads gave ``symmetric`` a default, a
+# call omitting it would match both signatures with incompatible return types
+# (mypy's overlapping-overload error).
+@overload
+def in1d(
+    A: groupable,
+    B: groupable,
+    assume_unique: bool = ...,
+    *,
+    symmetric: Literal[True],
+    invert: bool = ...,
+) -> Tuple[pdarray, pdarray]: ...
+
+
 @typechecked
 def in1d(
     A: groupable,
@@ -145,15 +166,19 @@ def in1d(
     assume_unique: bool = False,
     symmetric: bool = False,
     invert: bool = False,
-) -> groupable:
+) -> Union[pdarray, Tuple[pdarray, pdarray]]:
     """
     Test whether each element of a 1-D array is also present in a second array.
 
-    Returns a boolean array the same length as `A` that is True
-    where an element of `A` is in `B` and False otherwise.
+    If ``symmetric=False`` (default), returns a boolean pdarray of the same
+    shape as ``A`` indicating whether each element of ``A`` is in ``B``.
+
+    If ``symmetric=True``, returns a tuple ``(maskA, maskB)`` where:
+
+    * ``maskA[i]`` is True iff ``A[i]`` is in ``B``
+    * ``maskB[j]`` is True iff ``B[j]`` is in ``A``
 
-    Supports multi-level, i.e. test if rows of a are in the set of rows of b.
-    But note that multi-dimensional pdarrays are not supported.
+    If ``invert=True``, the returned mask(s) are logically inverted.
+
+    Multi-level inputs (tuples of arrays) are still supported: rows of ``A``
+    are tested against the set of rows of ``B``. Multi-dimensional pdarrays
+    are not supported.
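+
+    For example, with ``A = [1, 2, 3]`` and ``B = [2, 4]`` (values follow
+    directly from the contract above):
+
+    * ``in1d(A, B)`` returns ``[False, True, False]``
+    * ``in1d(A, B, symmetric=True)`` returns ``([False, True, False], [True, False])``
+    * ``in1d(A, B, invert=True)`` returns ``[True, False, True]``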
Parameters ---------- @@ -223,7 +248,7 @@ def in1d( raise TypeError("If A is pdarray, B must also be pdarray") elif isinstance(B, (pdarray, Strings, Categorical_)): if symmetric: - return _in1d_single(A, B), _in1d_single(B, A, invert) + return _in1d_single(A, B, invert), _in1d_single(B, A, invert) return _in1d_single(A, B, invert) else: raise TypeError( @@ -260,18 +285,25 @@ def in1d( if assume_unique: # Deinterleave truth into a and b domains if symmetric: - return truth[isa], truth[~isa] if not invert else ~truth[isa], ~truth[~isa] + aout = truth[isa] + bout = truth[~isa] + if invert: + return ~aout, ~bout + return aout, bout else: - return truth[isa] if not invert else ~truth[isa] + aout = truth[isa] + return ~aout if invert else aout else: # If didn't start unique, first need to deinterleave into ua domain, # then broadcast to a domain atruth = ag.broadcast(truth[isa], permute=True) if symmetric: btruth = bg.broadcast(truth[~isa], permute=True) - return atruth, btruth if not invert else ~atruth, ~btruth + if invert: + return ~atruth, ~btruth + return atruth, btruth else: - return atruth if not invert else ~atruth + return ~atruth if invert else atruth def in1dmulti(a, b, assume_unique=False, symmetric=False): diff --git a/arkouda/numpy/util.py b/arkouda/numpy/util.py index 11b8d2ceaf4..0fadca4cc4f 100644 --- a/arkouda/numpy/util.py +++ b/arkouda/numpy/util.py @@ -991,6 +991,9 @@ def map( TypeError If `mapping` is not of type `dict` or `Series`. If `values` is not of type `pdarray`, `Categorical`, or `Strings`. + ValueError + If a mapping with tuple keys has inconsistent lengths, or if a MultiIndex + mapping has a different number of levels than the GroupBy keys. Examples -------- @@ -1012,29 +1015,97 @@ def map( from arkouda.numpy.pdarraysetops import in1d from arkouda.numpy.strings import Strings from arkouda.pandas.categorical import Categorical + from arkouda.pandas.index import MultiIndex keys = values gb = GroupBy(keys, dropna=False) gb_keys = gb.unique_keys + # helper: number of unique keys (works for single key or tuple-of-keys) + nuniq = gb_keys[0].size if isinstance(gb_keys, tuple) else gb_keys.size + + # Fast-path: empty mapping => everything is missing + if (isinstance(mapping, dict) and len(mapping) == 0) or ( + isinstance(mapping, Series) and len(mapping.index) == 0 + ): + if not isinstance(values, (Strings, Categorical)): + fillvals = full(nuniq, np.nan, values.dtype) + else: + fillvals = full(nuniq, "null") + return broadcast(gb.segments, fillvals, permutation=gb.permutation) + if isinstance(mapping, dict): - mapping = Series([array(list(mapping.keys())), array(list(mapping.values()))]) + # Build mapping as a Series with an Index/MultiIndex (avoid rank>1 arrays) + m_keys = list(mapping.keys()) + m_vals = list(mapping.values()) + + k0 = m_keys[0] + if isinstance(k0, tuple): + # validate tuple keys + if not all(isinstance(k, tuple) for k in m_keys): + raise TypeError("Mixed key types in mapping dict (tuple and non-tuple).") + n = len(k0) + if not all(len(k) == n for k in m_keys): + raise ValueError("All tuple keys in mapping dict must have the same length.") + + cols = list(zip(*m_keys)) # transpose list[tuple] -> list[level] + idx = MultiIndex([array(col) for col in cols]) + mapping = Series(array(m_vals), index=idx) + else: + mapping = Series(array(m_vals), index=array(m_keys)) if isinstance(mapping, Series): - xtra_keys = gb_keys[in1d(gb_keys, mapping.index.values, invert=True)] + # Normalize mapping index keys into a "groupable" (single array OR tuple-of-arrays) + 
mindex = mapping.index + if isinstance(mindex, MultiIndex): + mkeys = tuple(mindex.index) + else: + mkeys = mindex.values - if xtra_keys.size > 0: - if not isinstance(mapping.values, (Strings, Categorical)): - nans = full(xtra_keys.size, np.nan, mapping.values.dtype) - else: - nans = full(xtra_keys.size, "null") + if isinstance(gb_keys, tuple) and isinstance(mkeys, tuple): + if len(gb_keys) != len(mkeys): + raise ValueError( + f"Mapping MultiIndex has {len(mkeys)} levels but GroupBy has {len(gb_keys)} keys" + ) + + mask = in1d(gb_keys, mkeys, invert=True) + + # Compute extra keys + extra size without mixing tuple/non-tuple assignments + if isinstance(gb_keys, tuple): + xtra_keys_t = tuple(k[mask] for k in gb_keys) + xtra_size = xtra_keys_t[0].size if len(xtra_keys_t) > 0 else 0 + + if xtra_size > 0: + if not isinstance(mapping.values, (Strings, Categorical)): + nans = full(xtra_size, np.nan, mapping.values.dtype) + else: + nans = full(xtra_size, "null") + + # Convert any categorical levels to strings, level-by-level + xtra_keys_t = tuple( + k.to_strings() if isinstance(k, Categorical) else k for k in xtra_keys_t + ) + + xtra_series = Series(nans, index=MultiIndex(list(xtra_keys_t))) + mapping = Series.concat([mapping, xtra_series]) + + else: + xtra_keys_s = gb_keys[mask] + xtra_size = xtra_keys_s.size + + if xtra_size > 0: + if not isinstance(mapping.values, (Strings, Categorical)): + nans = full(xtra_size, np.nan, mapping.values.dtype) + else: + nans = full(xtra_size, "null") - if isinstance(xtra_keys, Categorical): - xtra_keys = xtra_keys.to_strings() + if isinstance(xtra_keys_s, Categorical): + xtra_keys_s = xtra_keys_s.to_strings() - xtra_series = Series(nans, index=xtra_keys) - mapping = Series.concat([mapping, xtra_series]) + xtra_series = Series(nans, index=xtra_keys_s) + mapping = Series.concat([mapping, xtra_series]) + # Align mapping to gb_keys if isinstance(gb_keys, Categorical): mapping = mapping[gb_keys.to_strings()] else: diff --git a/arkouda/pandas/groupbyclass.py b/arkouda/pandas/groupbyclass.py index 7fd93e70682..a0672fc0c53 100644 --- a/arkouda/pandas/groupbyclass.py +++ b/arkouda/pandas/groupbyclass.py @@ -86,6 +86,7 @@ groupable_element_type = Union[pdarray, Strings, "Categorical"] groupable = Union[groupable_element_type, Sequence[groupable_element_type]] + # Note: we won't be typechecking GroupBy until we can figure out a way to handle # the circular import with Categorical diff --git a/arkouda/pandas/join.py b/arkouda/pandas/join.py index 08e7b793139..3c0dd3f9d25 100644 --- a/arkouda/pandas/join.py +++ b/arkouda/pandas/join.py @@ -18,7 +18,7 @@ from arkouda.numpy.pdarrayclass import create_pdarray, pdarray from arkouda.numpy.pdarraysetops import concatenate, in1d from arkouda.pandas.categorical import Categorical -from arkouda.pandas.groupbyclass import GroupBy, broadcast +from arkouda.pandas.groupbyclass import GroupBy, broadcast, groupable_element_type if TYPE_CHECKING: @@ -198,8 +198,8 @@ def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]: ua, asize = bya.size() byb = GroupBy(b) ub, bsize = byb.size() - afact = asize[in1d(ua, ub)] - bfact = bsize[in1d(ub, ua)] + afact = asize[in1d(cast(groupable_element_type, ua), cast(groupable_element_type, ub))] + bfact = bsize[in1d(cast(groupable_element_type, ub), cast(groupable_element_type, ua))] nelem = (afact * bfact).sum() nbytes = 3 * 8 * nelem return nelem, nbytes diff --git a/arkouda/pandas/series.py b/arkouda/pandas/series.py index ac493fcf803..b8039a065d4 100644 --- a/arkouda/pandas/series.py +++ 
b/arkouda/pandas/series.py
@@ -19,7 +19,7 @@
 from arkouda.numpy.pdarrayclass import RegistrationError, any, argmaxk, create_pdarray, pdarray
 from arkouda.numpy.pdarraysetops import argsort, concatenate, in1d, indexof1d
 from arkouda.numpy.util import get_callback, is_float
-from arkouda.pandas.groupbyclass import GroupBy, groupable_element_type
+from arkouda.pandas.groupbyclass import GroupBy, groupable, groupable_element_type
 from arkouda.pandas.index import Index, MultiIndex
 
 
@@ -429,6 +429,7 @@
         """
         from arkouda.numpy.pdarraycreation import array
         from arkouda.numpy.strings import Strings
+        from arkouda.pandas.categorical import Categorical
 
         val = self.validate_val(val)
         key = self.validate_key(key)
@@ -440,7 +441,23 @@
         if is_supported_scalar(key):
             indices = self.index == key
         else:
-            indices = in1d(self.index.values, key)
+            # mypy: key may be scalar/SegArray/etc, but in1d only accepts groupables
+            if not isinstance(key, (pdarray, Strings, Categorical, list, tuple)):
+                raise TypeError(f"Unsupported key type for membership test: {type(key)}")
+
+            # validate_key usually converts Python lists/tuples already; convert
+            # any that slip through here, since in1d rejects raw lists/tuples.
+            if (
+                isinstance(self.index, MultiIndex)
+                and isinstance(key, tuple)
+                and len(key) == self.index.nlevels
+            ):
+                indices = self.index.lookup(key)  # returns boolean mask
+            else:
+                if isinstance(key, (list, tuple)):
+                    key = array(key)
+                indices = in1d(self.index.values, cast(groupable, key))
+
         tf, counts = GroupBy(indices).size()
         update_count = counts[1] if len(counts) == 2 else 0
         if update_count == 0:
@@ -614,10 +631,28 @@ def isin(self, lst: Union[pdarray, Strings, List]) -> Series:
         and False otherwise.
 
         """
+        from arkouda.numpy.pdarraycreation import array
+        from arkouda.numpy.strings import Strings
+        from arkouda.pandas.categorical import Categorical
+
         if isinstance(lst, list):
             lst = array(lst)
 
-        boolean = in1d(self.values, lst)
+        # mypy: lst/self.values can be a wider union (SegArray/Any) at type level.
+        # At runtime, in1d only supports pdarray/Strings/Categorical (or sequences of those).
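+        # For illustration, mirroring the new cases in tests/pandas/series_test.py:
+        #   ak.Series(ak.array([1, 2, 3, 2, 1])).isin([2, 99])   -> [False, True, False, True, False]
+        #   ak.Series(ak.array(["a", "b", "c", "a"])).isin(("b",)) -> [False, True, False, False]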
+ if not isinstance(self.values, (pdarray, Strings, Categorical)): + raise TypeError(f"in1d not supported for Series values type: {type(self.values)}") + + if not isinstance(lst, (pdarray, Strings, Categorical, list, tuple)): + raise TypeError(f"in1d not supported for list type: {type(lst)}") + + if isinstance(lst, (list, tuple)): + lst = array(lst) + + boolean = in1d( + cast(groupable_element_type, self.values), + cast(groupable_element_type, lst), + ) return Series(data=boolean, index=self.index) @typechecked diff --git a/tests/numpy/setops_test.py b/tests/numpy/setops_test.py index 4ddc52cb2ec..9a0ace55e7c 100644 --- a/tests/numpy/setops_test.py +++ b/tests/numpy/setops_test.py @@ -238,6 +238,43 @@ def test_in1d_multiarray_categorical(self, size): stringsTwo = ak.Categorical(ak.array(["String {}".format(i % 2) for i in range(10)])) assert [(x % 3) < 2 for x in range(10)] == ak.in1d(stringsOne, stringsTwo).tolist() + @pytest.mark.requires_chapel_module("In1dMsg") + def test_in1d_symmetric(self): + # Duplicates to exercise assume_unique=False (GroupBy/broadcast path) + a = ak.array([1, 2, 2, 3, 4]) + b = ak.array([2, 4, 4, 5]) + + def exp_in(x, y): + yset = set(y) + return [xi in yset for xi in x] + + a_list = a.to_ndarray().tolist() + b_list = b.to_ndarray().tolist() + + # assume_unique=False path (duplicates allowed; should match membership semantics) + am2, bm2 = ak.in1d(a, b, assume_unique=False, symmetric=True, invert=False) + assert am2.tolist() == exp_in(a_list, b_list) + assert bm2.tolist() == exp_in(b_list, a_list) + + am2_i, bm2_i = ak.in1d(a, b, assume_unique=False, symmetric=True, invert=True) + assert am2_i.tolist() == [not v for v in exp_in(a_list, b_list)] + assert bm2_i.tolist() == [not v for v in exp_in(b_list, a_list)] + + # assume_unique=True path (inputs must be unique for this branch to be valid) + au = ak.array([1, 2, 3, 4]) + bu = ak.array([2, 4, 5]) + + au_list = au.to_ndarray().tolist() + bu_list = bu.to_ndarray().tolist() + + am, bm = ak.in1d(au, bu, assume_unique=True, symmetric=True, invert=False) + assert am.tolist() == exp_in(au_list, bu_list) + assert bm.tolist() == exp_in(bu_list, au_list) + + am_i, bm_i = ak.in1d(au, bu, assume_unique=True, symmetric=True, invert=True) + assert am_i.tolist() == [not v for v in exp_in(au_list, bu_list)] + assert bm_i.tolist() == [not v for v in exp_in(bu_list, au_list)] + @pytest.mark.parametrize("size", pytest.prob_size) @pytest.mark.parametrize("dtype", INTEGRAL_TYPES) def test_intersect1d_multiarray_numeric_types(self, size, dtype): diff --git a/tests/pandas/series_test.py b/tests/pandas/series_test.py index 1ac6dace3d9..2b94dfeed06 100644 --- a/tests/pandas/series_test.py +++ b/tests/pandas/series_test.py @@ -802,3 +802,49 @@ def test_iloc(self): _s1.iloc[[True, False, True]] with pytest.raises(IndexError): s1.iloc[[True, False, True]] + + @pytest.mark.requires_chapel_module("In1dMsg") + def test_series_isin_accepts_list_and_tuple_for_supported_value_types(self): + # pdarray values + s_int = ak.Series(ak.array([1, 2, 3, 2, 1])) + assert s_int.isin([2, 99]).tolist() == [False, True, False, True, False] + assert s_int.isin((1, 3)).tolist() == [True, False, True, False, True] + + # Strings values + s_str = ak.Series(ak.array(["a", "b", "c", "a"])) + assert s_str.isin(["a", "z"]).tolist() == [True, False, False, True] + assert s_str.isin(("b",)).tolist() == [False, True, False, False] + + # Categorical values + s_cat = ak.Series(ak.Categorical(ak.array(["red", "blue", "red", "green"]))) + assert s_cat.isin(["red"]).tolist() 
== [True, False, True, False] + assert s_cat.isin(("blue", "green")).tolist() == [False, True, False, True] + + @pytest.mark.requires_chapel_module("In1dMsg") + def test_series_map_multikey_missing_keys_fills_nans_and_nulls(self): + # MultiIndex with 2 keys (ensure map works with MultiIndex-backed Series) + k1 = ak.array([1, 1, 2, 2, 3]) + k2 = ak.array(["x", "y", "x", "y", "x"]) + mi = ak.MultiIndex([k1, k2], names=["k1", "k2"]) + + base = ak.Series(ak.array([10, 20, 30, 40, 50]), index=mi) + + # --- Numeric mapping (missing values should become NaN) --- + # Map only two of the Series *values*; others should be NaN + num_map = { + 10: 100.0, + 40: 200.0, + } + out_num = base.map(num_map) + + out_num_list = out_num.values.to_ndarray().tolist() + expected_num = [100.0, np.nan, np.nan, 200.0, np.nan] + assert np.allclose(out_num_list, expected_num, equal_nan=True) + + # --- String mapping (missing values should become "null") --- + str_map = { + 20: "A", + 50: "B", + } + out_str = base.map(str_map) + assert out_str.values.tolist() == ["null", "A", "null", "null", "B"]
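+
+        # Fill semantics (see the arkouda/numpy/util.py map() hunk above): values
+        # absent from a numeric mapping become NaN, while values absent from a
+        # Strings or Categorical mapping become the string "null".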