def value_counts(self, dropna: bool = True) -> pd.Series:
    """
    Count occurrences of each distinct value and return them as a pandas Series.

    Counting is delegated to the ``value_counts`` method of the wrapped
    Arkouda array. The unique values become the index of the result and the
    counts become its data.

    Parameters
    ----------
    dropna : bool
        If True (the default) and the data is ``float64``, ``NaN`` entries
        are removed before counting. For every other dtype the flag has no
        effect. If False, no filtering is applied and the data is counted
        as-is.

    Returns
    -------
    pd.Series
        Counts indexed by unique value. For non-empty input both the index
        and the values are built with ``ArkoudaArray._from_sequence``. When
        nothing is left to count, an empty ``int64`` Series is returned.

    Notes
    -----
    - The pandas options ``normalize``, ``sort``, and ``bins`` are not
      implemented.
    - NOTE(review): how ``NaN`` is grouped when ``dropna=False`` is decided
      by the underlying Arkouda ``value_counts`` call; confirm before
      relying on it.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaArray
    >>>
    >>> a = ArkoudaArray(ak.array([1, 2, 1, 3, 2, 1]))
    >>> a.value_counts()
    1    3
    2    2
    3    1
    dtype: int64

    Floating-point data with NaN values:

    >>> b = ArkoudaArray(ak.array([1.0, 2.0, float("nan"), 1.0]))
    >>> b.value_counts()
    1.0    2
    2.0    1
    dtype: int64
    """
    from arkouda.numpy.numeric import isnan as ak_isnan

    values = self._data

    # Missing values exist only for floats, so that is the only dtype filtered.
    if dropna and values.dtype == "float64":
        values = values[~ak_isnan(values)]

    # Guard clause: nothing to count (empty input or everything was NaN).
    if values.size == 0:
        return pd.Series(dtype="int64")

    uniques, frequencies = values.value_counts()

    return pd.Series(
        index=ArkoudaArray._from_sequence(uniques),
        data=ArkoudaArray._from_sequence(frequencies),
    )
def value_counts(self, dropna: bool = True) -> pd.Series:
    """
    Return counts of categories as a pandas Series.

    The category codes of the underlying Arkouda ``Categorical`` are grouped
    with ``GroupBy(...).size()``. Each code that occurs is mapped back to its
    label, so the index of the result holds category labels and the values
    hold the corresponding counts.

    Parameters
    ----------
    dropna : bool
        Whether to drop missing values from the result. When truthy, the
        entry whose label equals the categorical's ``na_value`` is filtered
        out. When falsy, every category produced by the grouping is kept.
        Default is True.

    Returns
    -------
    pd.Series
        A Series containing category counts. For non-empty results the
        index is an ``ArkoudaStringArray`` of category labels and the values
        are an ``ArkoudaArray`` of counts. When there is nothing to report,
        an empty ``int64`` Series is returned.

    Notes
    -----
    - This method does not yet support pandas options such as ``normalize``,
      ``sort``, or ``bins``.
    - The handling of missing values depends on the Arkouda ``Categorical``
      definition of ``na_value``.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaCategoricalArray
    >>>
    >>> a = ArkoudaCategoricalArray(["a", "b", "a", "c", "b", "a"])
    >>> a.value_counts()
    a    3
    b    2
    c    1
    dtype: int64
    """
    # Imported here rather than at module level, as in the original code.
    from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray
    from arkouda.pandas.groupbyclass import GroupBy

    cat = self._data
    codes = cat.codes

    # Nothing to group: return an empty integer Series.
    if codes.size == 0:
        return pd.Series(dtype="int64")

    grouped_codes, counts = GroupBy(codes).size()
    categories = cat.categories[grouped_codes]

    # Truthiness test rather than ``is True``: an identity check would
    # silently skip filtering for truthy non-``bool`` flags such as
    # ``numpy.bool_(True)`` or ``1``.
    if dropna:
        mask = categories != cat.na_value
        categories = categories[mask]
        counts = counts[mask]

    # Filtering may have removed every entry (all values were NA).
    if categories.size == 0:
        return pd.Series(dtype="int64")

    return pd.Series(
        ArkoudaArray._from_sequence(counts),
        index=ArkoudaStringArray._from_sequence(categories),
    )
def value_counts(self, dropna: bool = True) -> pd.Series:
    """
    Return counts of unique strings as a pandas Series.

    The underlying Arkouda ``Strings`` object is grouped with
    ``GroupBy(...).size()``. The unique strings become the index of the
    result and their counts become its data.

    Parameters
    ----------
    dropna : bool
        Accepted for pandas compatibility. Missing-value handling for
        Arkouda string arrays is not yet implemented, so this parameter
        currently has no effect. Default is True.

    Returns
    -------
    pd.Series
        A Series containing the counts of unique string values. For
        non-empty input the index is an ``ArkoudaStringArray`` of unique
        values and the values are an ``ArkoudaArray`` of counts. For empty
        input an empty ``int64`` Series is returned.

    Notes
    -----
    - The following pandas options are not yet implemented:
      ``normalize``, ``sort``, and ``bins``.

    Examples
    --------
    Basic usage:

    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaStringArray
    >>>
    >>> s = ArkoudaStringArray(["red", "blue", "red", "green", "blue", "red"])
    >>> s.value_counts()
    red      3
    blue     2
    green    1
    dtype: int64

    Empty input:

    >>> empty = ArkoudaStringArray([])
    >>> empty.value_counts()
    Series([], dtype: int64)
    """
    # Imported here rather than at module level, as in the original code.
    from arkouda.numpy.strings import Strings
    from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray
    from arkouda.pandas.groupbyclass import GroupBy

    s = self._data

    # Empty input: return an empty integer Series without grouping.
    if s.size == 0:
        return pd.Series(dtype="int64")

    values, counts = GroupBy(s).size()

    # Narrow the type for static checkers; the keys are expected to be Strings.
    assert isinstance(values, Strings)

    # No second emptiness check: grouping a non-empty array yields at least
    # one unique key, so ``values`` cannot be empty here.
    return pd.Series(
        ArkoudaArray._from_sequence(counts),
        index=ArkoudaStringArray._from_sequence(values),
    )
class TestArkoudaArrayValueCounts:
    """Tests for ``ArkoudaArray.value_counts`` on numeric and boolean data."""

    def _series_to_pycounts(self, s: pd.Series) -> dict:
        """
        Flatten a value-counts Series into a ``{value: count}`` dict.

        Comparing dicts keeps the tests independent of result ordering and
        of whether the Series is Arkouda-backed or NumPy-backed.
        """
        # ``to_numpy`` pulls both the index and the values client-side.
        labels = s.index.to_numpy()
        counts = s.to_numpy()
        return {label: int(count) for label, count in zip(labels, counts)}

    def test_value_counts_int64_basic(self):
        arr = ArkoudaArray(ak.array([1, 2, 1, 3, 2, 1], dtype="int64"))

        counts = self._series_to_pycounts(arr.value_counts())

        assert counts == {1: 3, 2: 2, 3: 1}

    def test_value_counts_uint64_basic(self):
        arr = ArkoudaArray(ak.array([1, 2, 1, 3, 2, 1], dtype="uint64"))

        counts = self._series_to_pycounts(arr.value_counts())

        assert counts == {1: 3, 2: 2, 3: 1}

    def test_value_counts_bool_basic(self):
        arr = ArkoudaArray(ak.array([True, False, True, True], dtype="bool"))

        counts = self._series_to_pycounts(arr.value_counts())

        assert counts == {True: 3, False: 1}

    def test_value_counts_float64_dropna_true_excludes_nan(self):
        arr = ArkoudaArray(ak.array([1.0, 2.0, float("nan"), 1.0], dtype="float64"))

        counts = self._series_to_pycounts(arr.value_counts(dropna=True))

        # The finite values keep their counts ...
        assert counts.get(1.0) == 2
        assert counts.get(2.0) == 1
        # ... and no key may be NaN when dropna=True.
        assert all(not (isinstance(key, float) and math.isnan(key)) for key in counts)

    def test_value_counts_empty_returns_empty_series(self):
        arr = ArkoudaArray(ak.array([], dtype="int64"))

        result = arr.value_counts()

        assert isinstance(result, pd.Series)
        assert len(result) == 0

    def test_value_counts_matches_pandas_counts_as_multiset(self):
        """Cross-check the counts against pandas, ignoring ordering."""
        data = [3, 1, 2, 3, 3, 2, 1, 4, 4, 4, 4]
        arr = ArkoudaArray(ak.array(data, dtype="int64"))

        counts = self._series_to_pycounts(arr.value_counts())
        reference = pd.Series(data).value_counts(dropna=True).to_dict()

        # Normalize the pandas result to plain python ints before comparing.
        assert counts == {int(key): int(value) for key, value in reference.items()}

    def test_arkoudaarray_value_counts_dropna_true_excludes_nan(self):
        # float64 data containing NaNs, built through the pandas dtype registry
        arr = pd.array([1.0, np.nan, 2.0, np.nan, 2.0], dtype="ak_float64")

        vc = arr.value_counts(dropna=True)

        # If NaN were still counted, there would be a third entry.
        assert len(vc) == 2

        # NaN must not show up as an index entry, whatever the index container.
        assert all(not pd.isna(label) for label in vc.index.to_numpy())

        # The remaining counts are correct, independent of order.
        counts = {label: count for label, count in zip(vc.index.to_numpy(), vc.to_numpy())}
        assert counts == {1.0: 1, 2.0: 2}
class TestArkoudaCategoricalValueCounts:
    """Tests for ``ArkoudaCategoricalArray.value_counts``."""

    def _series_to_pycounts(self, s: pd.Series) -> dict:
        """
        Flatten a value-counts Series into a ``{value: count}`` dict.

        Works the same for Arkouda-backed and NumPy-backed index / values.
        """
        labels = s.index.to_numpy()
        counts = s.to_numpy()
        return {label: int(count) for label, count in zip(labels, counts)}

    def test_categorical_value_counts_basic(self):
        arr = ArkoudaCategoricalArray(["a", "b", "a", "c", "b", "a"])

        counts = self._series_to_pycounts(arr.value_counts())

        assert counts == {"a": 3, "b": 2, "c": 1}

    def test_categorical_value_counts_single_category(self):
        arr = ArkoudaCategoricalArray(["x", "x", "x"])

        counts = self._series_to_pycounts(arr.value_counts())

        assert counts == {"x": 3}

    def test_categorical_value_counts_empty(self):
        arr = ArkoudaCategoricalArray(ak.array([], dtype="str_"))

        result = arr.value_counts()

        assert isinstance(result, pd.Series)
        assert len(result) == 0

    def test_categorical_value_counts_matches_pandas_as_multiset(self):
        """Cross-check the counts against pandas, ignoring ordering."""
        data = ["blue", "red", "blue", "green", "blue", "red"]
        arr = ArkoudaCategoricalArray(data)

        counts = self._series_to_pycounts(arr.value_counts())
        reference = pd.Series(pd.Categorical(data)).value_counts(dropna=True).to_dict()

        # pandas reports numpy ints; normalize to python str / int.
        assert counts == {str(key): int(value) for key, value in reference.items()}

    def test_categorical_value_counts_dropna_true_drops_na_value(self):
        """With dropna=True the result holds only categories != ``na_value``."""
        arr = ArkoudaCategoricalArray(["a", "b", "a"])

        counts = self._series_to_pycounts(arr.value_counts(dropna=True))

        # The NA sentinel must not be among the reported categories.
        na = arr._data.na_value
        assert na not in counts

    def test_categorical_value_counts_dropna_false_includes_non_na_categories(self):
        """dropna=False skips the na_value filter, so normal categories appear."""
        arr = ArkoudaCategoricalArray(["a", "b", "a"])

        counts = self._series_to_pycounts(arr.value_counts(dropna=False))

        assert counts.get("a", 0) == 2
        assert counts.get("b", 0) == 1

    def test_categorical_value_counts_dropna(self):
        # Borrow the NA sentinel from an empty Categorical to build the input.
        sentinel_source = Categorical([])
        arr = ArkoudaCategoricalArray(["x", "y", "x", sentinel_source.na_value])
        na = arr._data.na_value

        dropped = arr.value_counts(dropna=True)
        assert na not in set(dropped.index)

        kept = arr.value_counts(dropna=False)
        assert na in set(kept.index)
class TestArkoudaStringArrayValueCounts:
    """Tests for ``ArkoudaStringArray.value_counts``."""

    def _series_to_pycounts(self, s: pd.Series) -> dict[str, int]:
        """
        Turn Series(index=unique values, values=counts) into a Python dict.

        Comparing dicts keeps the tests independent of result ordering.
        """
        labels = s.index.to_numpy()
        counts = s.to_numpy()
        return {str(label): int(count) for label, count in zip(labels, counts)}

    def test_string_value_counts_basic(self):
        arr = ArkoudaStringArray(["red", "blue", "red", "green", "blue", "red"])

        counts = self._series_to_pycounts(arr.value_counts())

        assert counts == {"red": 3, "blue": 2, "green": 1}

    def test_string_value_counts_index_is_unique_values_not_original(self):
        """
        Regression test: the index must be the unique values, not the original
        array, which would cause a length mismatch or wrong results.
        """
        arr = ArkoudaStringArray(["a", "b", "a", "c"])

        result = arr.value_counts()

        # One row per unique value, not one per input element.
        assert len(result.index) == 3
        assert len(result) == 3

        counts = self._series_to_pycounts(result)
        assert counts == {"a": 2, "b": 1, "c": 1}

    def test_string_value_counts_empty_returns_empty_series(self):
        arr = ArkoudaStringArray([])

        result = arr.value_counts()

        assert isinstance(result, pd.Series)
        assert len(result) == 0

    def test_string_value_counts_dropna_parameter_is_accepted(self):
        """
        ``dropna`` is currently inert for ArkoudaStringArray but must be
        accepted for pandas compatibility.
        """
        arr = ArkoudaStringArray(["x", "y", "x"])

        with_dropna = self._series_to_pycounts(arr.value_counts(dropna=True))
        without_dropna = self._series_to_pycounts(arr.value_counts(dropna=False))

        # Both settings must agree with each other and with the expected counts.
        assert with_dropna == without_dropna
        assert without_dropna == {"x": 2, "y": 1}

    def test_string_value_counts_matches_pandas_as_multiset(self):
        """Cross-check the counts against pandas, ignoring ordering."""
        data = ["blue", "red", "blue", "green", "blue", "red"]
        arr = ArkoudaStringArray(data)

        counts = self._series_to_pycounts(arr.value_counts())
        reference = pd.Series(data).value_counts(dropna=True).to_dict()

        assert counts == {str(key): int(value) for key, value in reference.items()}