56 changes: 44 additions & 12 deletions arkouda/numpy/pdarraysetops.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Sequence, TypeVar, Union, cast
from typing import TYPE_CHECKING, Literal, Sequence, Tuple, TypeVar, Union, cast, overload

import numpy as np

@@ -19,7 +19,7 @@


if TYPE_CHECKING:
from arkouda.numpy.pdarraycreation import array, zeros, zeros_like
from arkouda.numpy.pdarraycreation import array, zeros_like
from arkouda.numpy.strings import Strings
from arkouda.pandas.categorical import Categorical
else:
@@ -94,6 +94,7 @@ def _in1d_single(
array([False True])
"""
from arkouda.client import generic_msg
from arkouda.numpy.pdarraycreation import zeros
from arkouda.numpy.strings import Strings
from arkouda.pandas.categorical import Categorical as Categorical_

@@ -138,22 +139,46 @@ def _in1d_single(
raise TypeError("Both pda1 and pda2 must be pdarray, Strings, or Categorical")


@overload
def in1d(
A: groupable,
B: groupable,
assume_unique: bool = ...,
symmetric: Literal[False] = ...,
invert: bool = ...,
) -> pdarray: ...


@overload
def in1d(
A: groupable,
B: groupable,
assume_unique: bool = ...,
symmetric: Literal[True] = ...,
invert: bool = ...,
) -> Tuple[pdarray, pdarray]: ...


@typechecked
def in1d(
A: groupable,
B: groupable,
assume_unique: bool = False,
symmetric: bool = False,
invert: bool = False,
) -> groupable:
) -> Union[pdarray, Tuple[pdarray, pdarray]]:
"""
Test whether each element of a 1-D array is also present in a second array.

Returns a boolean array the same length as `A` that is True
where an element of `A` is in `B` and False otherwise.
If ``symmetric=False`` (default), returns a boolean pdarray of the same
shape as ``A`` indicating whether each element of ``A`` is in ``B``.

If ``symmetric=True``, returns a tuple ``(maskA, maskB)`` where:

* ``maskA[i]`` is True iff ``A[i]`` is in ``B``
* ``maskB[j]`` is True iff ``B[j]`` is in ``A``

Supports multi-level, i.e. test if rows of a are in the set of rows of b.
But note that multi-dimensional pdarrays are not supported.
If ``invert=True``, the returned mask(s) are logically inverted.

Parameters
----------
@@ -223,7 +248,7 @@ def in1d(
raise TypeError("If A is pdarray, B must also be pdarray")
elif isinstance(B, (pdarray, Strings, Categorical_)):
if symmetric:
return _in1d_single(A, B), _in1d_single(B, A, invert)
return _in1d_single(A, B, invert), _in1d_single(B, A, invert)
return _in1d_single(A, B, invert)
else:
raise TypeError(
@@ -260,18 +285,25 @@ def in1d(
if assume_unique:
# Deinterleave truth into a and b domains
if symmetric:
return truth[isa], truth[~isa] if not invert else ~truth[isa], ~truth[~isa]
aout = truth[isa]
bout = truth[~isa]
if invert:
return ~aout, ~bout
return aout, bout
else:
return truth[isa] if not invert else ~truth[isa]
aout = truth[isa]
return ~aout if invert else aout
else:
# If didn't start unique, first need to deinterleave into ua domain,
# then broadcast to a domain
atruth = ag.broadcast(truth[isa], permute=True)
if symmetric:
btruth = bg.broadcast(truth[~isa], permute=True)
return atruth, btruth if not invert else ~atruth, ~btruth
if invert:
return ~atruth, ~btruth
return atruth, btruth
else:
return atruth if not invert else ~atruth
return ~atruth if invert else atruth


def in1dmulti(a, b, assume_unique=False, symmetric=False):
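For reference, a minimal usage sketch of the overloads added above, mirroring the semantics exercised by the new symmetric test further down (assumes a running arkouda server reachable via ak.connect()):

import arkouda as ak

ak.connect()  # assumes a locally running arkouda server

a = ak.array([1, 2, 2, 3, 4])
b = ak.array([2, 4, 4, 5])

# Default (symmetric=False): a single boolean mask over A
mask = ak.in1d(a, b)  # [False, True, True, False, True]

# symmetric=True: a (maskA, maskB) tuple, one membership mask per input
mask_a, mask_b = ak.in1d(a, b, symmetric=True)

# invert=True now flips both masks in the symmetric case (the fix above)
not_a, not_b = ak.in1d(a, b, symmetric=True, invert=True)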
93 changes: 82 additions & 11 deletions arkouda/numpy/util.py
@@ -991,6 +991,9 @@ def map(
TypeError
If `mapping` is not of type `dict` or `Series`.
If `values` is not of type `pdarray`, `Categorical`, or `Strings`.
ValueError
If a mapping with tuple keys has inconsistent lengths, or if a MultiIndex
mapping has a different number of levels than the GroupBy keys.

Examples
--------
@@ -1012,29 +1015,97 @@
from arkouda.numpy.pdarraysetops import in1d
from arkouda.numpy.strings import Strings
from arkouda.pandas.categorical import Categorical
from arkouda.pandas.index import MultiIndex

keys = values
gb = GroupBy(keys, dropna=False)
gb_keys = gb.unique_keys

# helper: number of unique keys (works for single key or tuple-of-keys)
nuniq = gb_keys[0].size if isinstance(gb_keys, tuple) else gb_keys.size

# Fast-path: empty mapping => everything is missing
if (isinstance(mapping, dict) and len(mapping) == 0) or (
isinstance(mapping, Series) and len(mapping.index) == 0
):
if not isinstance(values, (Strings, Categorical)):
fillvals = full(nuniq, np.nan, values.dtype)
else:
fillvals = full(nuniq, "null")
return broadcast(gb.segments, fillvals, permutation=gb.permutation)

if isinstance(mapping, dict):
mapping = Series([array(list(mapping.keys())), array(list(mapping.values()))])
# Build mapping as a Series with an Index/MultiIndex (avoid rank>1 arrays)
m_keys = list(mapping.keys())
m_vals = list(mapping.values())

k0 = m_keys[0]
if isinstance(k0, tuple):
# validate tuple keys
if not all(isinstance(k, tuple) for k in m_keys):
raise TypeError("Mixed key types in mapping dict (tuple and non-tuple).")
n = len(k0)
if not all(len(k) == n for k in m_keys):
raise ValueError("All tuple keys in mapping dict must have the same length.")

cols = list(zip(*m_keys)) # transpose list[tuple] -> list[level]
idx = MultiIndex([array(col) for col in cols])
mapping = Series(array(m_vals), index=idx)
else:
mapping = Series(array(m_vals), index=array(m_keys))

if isinstance(mapping, Series):
xtra_keys = gb_keys[in1d(gb_keys, mapping.index.values, invert=True)]
# Normalize mapping index keys into a "groupable" (single array OR tuple-of-arrays)
mindex = mapping.index
if isinstance(mindex, MultiIndex):
mkeys = tuple(mindex.index)
else:
mkeys = mindex.values

if xtra_keys.size > 0:
if not isinstance(mapping.values, (Strings, Categorical)):
nans = full(xtra_keys.size, np.nan, mapping.values.dtype)
else:
nans = full(xtra_keys.size, "null")
if isinstance(gb_keys, tuple) and isinstance(mkeys, tuple):
if len(gb_keys) != len(mkeys):
raise ValueError(
f"Mapping MultiIndex has {len(mkeys)} levels but GroupBy has {len(gb_keys)} keys"
)

mask = in1d(gb_keys, mkeys, invert=True)

# Compute extra keys + extra size without mixing tuple/non-tuple assignments
if isinstance(gb_keys, tuple):
xtra_keys_t = tuple(k[mask] for k in gb_keys)
xtra_size = xtra_keys_t[0].size if len(xtra_keys_t) > 0 else 0

if xtra_size > 0:
if not isinstance(mapping.values, (Strings, Categorical)):
nans = full(xtra_size, np.nan, mapping.values.dtype)
else:
nans = full(xtra_size, "null")

# Convert any categorical levels to strings, level-by-level
xtra_keys_t = tuple(
k.to_strings() if isinstance(k, Categorical) else k for k in xtra_keys_t
)

xtra_series = Series(nans, index=MultiIndex(list(xtra_keys_t)))
mapping = Series.concat([mapping, xtra_series])

else:
xtra_keys_s = gb_keys[mask]
xtra_size = xtra_keys_s.size

if xtra_size > 0:
if not isinstance(mapping.values, (Strings, Categorical)):
nans = full(xtra_size, np.nan, mapping.values.dtype)
else:
nans = full(xtra_size, "null")

if isinstance(xtra_keys, Categorical):
xtra_keys = xtra_keys.to_strings()
if isinstance(xtra_keys_s, Categorical):
xtra_keys_s = xtra_keys_s.to_strings()

xtra_series = Series(nans, index=xtra_keys)
mapping = Series.concat([mapping, xtra_series])
xtra_series = Series(nans, index=xtra_keys_s)
mapping = Series.concat([mapping, xtra_series])

# Align mapping to gb_keys
if isinstance(gb_keys, Categorical):
mapping = mapping[gb_keys.to_strings()]
else:
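To illustrate the dict-normalization step above in isolation: tuple keys are transposed into per-level columns and wrapped in a MultiIndex-backed Series. A minimal sketch reusing only constructs that appear in this diff (assumes a connected arkouda client):

import arkouda as ak
from arkouda.pandas.index import MultiIndex

mapping = {("a", 1): 10.0, ("a", 2): 20.0, ("b", 1): 30.0}

m_keys = list(mapping.keys())    # [("a", 1), ("a", 2), ("b", 1)]
m_vals = list(mapping.values())  # [10.0, 20.0, 30.0]

# Transpose list-of-tuples into one column per level, as in the code above
cols = list(zip(*m_keys))        # [("a", "a", "b"), (1, 2, 1)]
idx = MultiIndex([ak.array(list(col)) for col in cols])
mapping_series = ak.Series(ak.array(m_vals), index=idx)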
1 change: 1 addition & 0 deletions arkouda/pandas/groupbyclass.py
@@ -86,6 +86,7 @@

groupable_element_type = Union[pdarray, Strings, "Categorical"]
groupable = Union[groupable_element_type, Sequence[groupable_element_type]]

# Note: we won't be typechecking GroupBy until we can figure out a way to handle
# the circular import with Categorical

6 changes: 3 additions & 3 deletions arkouda/pandas/join.py
@@ -18,7 +18,7 @@
from arkouda.numpy.pdarrayclass import create_pdarray, pdarray
from arkouda.numpy.pdarraysetops import concatenate, in1d
from arkouda.pandas.categorical import Categorical
from arkouda.pandas.groupbyclass import GroupBy, broadcast
from arkouda.pandas.groupbyclass import GroupBy, broadcast, groupable_element_type


if TYPE_CHECKING:
@@ -198,8 +198,8 @@ def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]:
ua, asize = bya.size()
byb = GroupBy(b)
ub, bsize = byb.size()
afact = asize[in1d(ua, ub)]
bfact = bsize[in1d(ub, ua)]
afact = asize[in1d(cast(groupable_element_type, ua), cast(groupable_element_type, ub))]
bfact = bsize[in1d(cast(groupable_element_type, ub), cast(groupable_element_type, ua))]
nelem = (afact * bfact).sum()
nbytes = 3 * 8 * nelem
return nelem, nbytes
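As a worked illustration of the size estimate computed here (a pure-Python stand-in for the GroupBy/in1d logic, not arkouda code; the 3 * 8 bytes-per-row factor is taken directly from the function above):

from collections import Counter

a = [1, 2, 2, 3]
b = [2, 2, 2, 4]

ca, cb = Counter(a), Counter(b)
shared = set(ca) & set(cb)                  # keys present in both arrays: {2}

nelem = sum(ca[k] * cb[k] for k in shared)  # 2 * 3 = 6 joined rows for key 2
nbytes = 3 * 8 * nelem                      # 144, matching 3 * 8 * nelem above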
41 changes: 38 additions & 3 deletions arkouda/pandas/series.py
@@ -19,7 +19,7 @@
from arkouda.numpy.pdarrayclass import RegistrationError, any, argmaxk, create_pdarray, pdarray
from arkouda.numpy.pdarraysetops import argsort, concatenate, in1d, indexof1d
from arkouda.numpy.util import get_callback, is_float
from arkouda.pandas.groupbyclass import GroupBy, groupable_element_type
from arkouda.pandas.groupbyclass import GroupBy, groupable, groupable_element_type
from arkouda.pandas.index import Index, MultiIndex


@@ -429,6 +429,7 @@ def __setitem__(
"""
from arkouda.numpy.pdarraycreation import array
from arkouda.numpy.strings import Strings
from arkouda.pandas.categorical import Categorical

val = self.validate_val(val)
key = self.validate_key(key)
@@ -440,7 +441,23 @@ def __setitem__(
if is_supported_scalar(key):
indices = self.index == key
else:
indices = in1d(self.index.values, key)
# mypy: key may be scalar/SegArray/etc, but in1d only accepts groupables
if not isinstance(key, (pdarray, Strings, Categorical, list, tuple)):
raise TypeError(f"Unsupported key type for membership test: {type(key)}")

# If key is a python list/tuple, it will be validated/converted by validate_key in many paths
# but if it slips through, convert here.
if (
isinstance(self.index, MultiIndex)
and isinstance(key, tuple)
and len(key) == self.index.nlevels
):
indices = self.index.lookup(key) # returns boolean mask
else:
if isinstance(key, list):
key = array(key)
indices = in1d(self.index.values, cast(groupable, key))

tf, counts = GroupBy(indices).size()
update_count = counts[1] if len(counts) == 2 else 0
if update_count == 0:
@@ -614,10 +631,28 @@ def isin(self, lst: Union[pdarray, Strings, List]) -> Series:
and False otherwise.

"""
from arkouda.numpy.pdarraycreation import array
from arkouda.numpy.strings import Strings
from arkouda.pandas.categorical import Categorical

if isinstance(lst, list):
lst = array(lst)

boolean = in1d(self.values, lst)
# mypy: lst/self.values can be a wider union (SegArray/Any) at type level.
# At runtime, in1d only supports pdarray/Strings/Categorical (or sequences of those).
if not isinstance(self.values, (pdarray, Strings, Categorical)):
raise TypeError(f"in1d not supported for Series values type: {type(self.values)}")

if not isinstance(lst, (pdarray, Strings, Categorical, list, tuple)):
raise TypeError(f"in1d not supported for list type: {type(lst)}")

if isinstance(lst, (list, tuple)):
lst = array(lst)

boolean = in1d(
cast(groupable_element_type, self.values),
cast(groupable_element_type, lst),
)
return Series(data=boolean, index=self.index)

@typechecked
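A small usage sketch of the isin path touched above (assumes a connected arkouda client; list inputs are converted with ak.array before the in1d membership test, as in the code):

import arkouda as ak

s = ak.Series(ak.array([3, 1, 4, 1, 5]))

mask = s.isin([1, 5])             # boolean Series aligned with s.index
print(mask.values.tolist())       # [False, True, False, True, True]

mask2 = s.isin(ak.array([3]))     # pdarray input takes the same path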
37 changes: 37 additions & 0 deletions tests/numpy/setops_test.py
@@ -238,6 +238,43 @@ def test_in1d_multiarray_categorical(self, size):
stringsTwo = ak.Categorical(ak.array(["String {}".format(i % 2) for i in range(10)]))
assert [(x % 3) < 2 for x in range(10)] == ak.in1d(stringsOne, stringsTwo).tolist()

@pytest.mark.requires_chapel_module("In1dMsg")
def test_in1d_symmetric(self):
# Duplicates to exercise assume_unique=False (GroupBy/broadcast path)
a = ak.array([1, 2, 2, 3, 4])
b = ak.array([2, 4, 4, 5])

def exp_in(x, y):
yset = set(y)
return [xi in yset for xi in x]

a_list = a.to_ndarray().tolist()
b_list = b.to_ndarray().tolist()

# assume_unique=False path (duplicates allowed; should match membership semantics)
am2, bm2 = ak.in1d(a, b, assume_unique=False, symmetric=True, invert=False)
assert am2.tolist() == exp_in(a_list, b_list)
assert bm2.tolist() == exp_in(b_list, a_list)

am2_i, bm2_i = ak.in1d(a, b, assume_unique=False, symmetric=True, invert=True)
assert am2_i.tolist() == [not v for v in exp_in(a_list, b_list)]
assert bm2_i.tolist() == [not v for v in exp_in(b_list, a_list)]

# assume_unique=True path (inputs must be unique for this branch to be valid)
au = ak.array([1, 2, 3, 4])
bu = ak.array([2, 4, 5])

au_list = au.to_ndarray().tolist()
bu_list = bu.to_ndarray().tolist()

am, bm = ak.in1d(au, bu, assume_unique=True, symmetric=True, invert=False)
assert am.tolist() == exp_in(au_list, bu_list)
assert bm.tolist() == exp_in(bu_list, au_list)

am_i, bm_i = ak.in1d(au, bu, assume_unique=True, symmetric=True, invert=True)
assert am_i.tolist() == [not v for v in exp_in(au_list, bu_list)]
assert bm_i.tolist() == [not v for v in exp_in(bu_list, au_list)]

@pytest.mark.parametrize("size", pytest.prob_size)
@pytest.mark.parametrize("dtype", INTEGRAL_TYPES)
def test_intersect1d_multiarray_numeric_types(self, size, dtype):