Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions arkouda/pandas/extension/_arkouda_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import cast as type_cast

import numpy as np
import pandas as pd

from numpy import ndarray
from numpy.typing import NDArray
Expand Down Expand Up @@ -446,3 +447,73 @@ def any(self, axis=0, skipna=True, **kwargs):
boolean-reduction calls.
"""
return bool(self._data.any())

def value_counts(self, dropna: bool = True) -> pd.Series:
    """
    Count how often each distinct value occurs in the array.

    The unique values and their frequencies come from the wrapped Arkouda
    array's own ``value_counts`` and are packaged as a pandas ``Series``
    whose index holds the unique values and whose data holds the counts.

    Parameters
    ----------
    dropna : bool
        When True (the default) and the data is ``float64``, ``NaN``
        entries are removed before counting. For any other dtype, or when
        ``dropna`` is False, no filtering is applied and the data is
        counted as-is.

    Returns
    -------
    pd.Series
        Counts indexed by unique value. For a non-empty result both the
        index and the values are created through
        ``ArkoudaArray._from_sequence``. When nothing is left to count,
        an empty ``int64`` Series is returned instead.

    Notes
    -----
    - The pandas options ``normalize``, ``sort`` and ``bins`` are not
      implemented.
    - The counting itself is delegated to the underlying Arkouda array;
      only the unique values and their counts end up in the result.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaArray
    >>>
    >>> a = ArkoudaArray(ak.array([1, 2, 1, 3, 2, 1]))
    >>> a.value_counts()
    1    3
    2    2
    3    1
    dtype: int64

    ``NaN`` entries in floating-point data are dropped by default:

    >>> b = ArkoudaArray(ak.array([1.0, 2.0, float("nan"), 1.0]))
    >>> b.value_counts()
    1.0    2
    2.0    1
    dtype: int64
    """
    from arkouda.numpy.numeric import isnan as ak_isnan

    values = self._data

    # Missing-value filtering only applies to float64 data, where NaN is
    # the missing marker; every other dtype is counted untouched.
    strip_nans = dropna and values.dtype == "float64"
    if strip_nans:
        values = values[~ak_isnan(values)]

    # Nothing to count (empty input, or all entries were NaN).
    if values.size == 0:
        return pd.Series(dtype="int64")

    uniques, frequencies = values.value_counts()

    index_array = ArkoudaArray._from_sequence(uniques)
    count_array = ArkoudaArray._from_sequence(frequencies)
    return pd.Series(count_array, index=index_array)
79 changes: 75 additions & 4 deletions arkouda/pandas/extension/_arkouda_categorical_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from typing import TYPE_CHECKING, Any, Sequence, TypeVar

import numpy as np # new
import numpy as np
import pandas as pd

from numpy import ndarray
from pandas.api.extensions import ExtensionArray
Expand Down Expand Up @@ -131,6 +132,79 @@ def __eq__(self, other):
def __repr__(self):
    """Return a debug string that embeds the wrapped ``_data`` object."""
    rendered = f"ArkoudaCategoricalArray({self._data})"
    return rendered

def value_counts(self, dropna: bool = True) -> pd.Series:
    """
    Return counts of categories as a pandas Series.

    The category codes of the underlying Arkouda ``Categorical`` are
    grouped and counted, the codes are mapped back to their category
    labels, and the result is returned as a pandas ``Series`` with the
    labels as the index and the counts as the data.

    Parameters
    ----------
    dropna : bool
        Whether to drop missing values from the result. Any truthy value
        enables filtering: entries whose label equals the categorical's
        ``na_value`` are removed. When falsy, every category that occurs
        in the data is included. Default is True.

    Returns
    -------
    pd.Series
        A Series containing category counts. For a non-empty result the
        index is built with ``ArkoudaStringArray._from_sequence`` from the
        category labels and the values with ``ArkoudaArray._from_sequence``
        from the counts. An empty ``int64`` Series is returned when the
        input is empty or when filtering leaves nothing to report.

    Notes
    -----
    - The grouping and counting are delegated to Arkouda's ``GroupBy``;
      only the categories and counts are placed in the pandas ``Series``.
    - This method does not yet support pandas options such as
      ``normalize``, ``sort``, or ``bins``.
    - The handling of missing values depends on the Arkouda
      ``Categorical`` definition of ``na_value``.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaCategoricalArray
    >>>
    >>> a = ArkoudaCategoricalArray(["a", "b", "a", "c", "b", "a"])
    >>> a.value_counts()
    a    3
    b    2
    c    1
    dtype: int64
    """
    # NOTE(review): these are imported inside the function, presumably to
    # avoid a circular import with the extension package — confirm.
    from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray
    from arkouda.pandas.groupbyclass import GroupBy

    cat = self._data
    codes = cat.codes

    if codes.size == 0:
        return pd.Series(dtype="int64")

    # Count occurrences per code, then translate codes back to labels.
    grouped_codes, counts = GroupBy(codes).size()
    categories = cat.categories[grouped_codes]

    # Truthiness (not ``is True``) so that values such as ``np.True_`` or
    # ``1`` also enable NA filtering instead of being silently ignored.
    if dropna:
        mask = categories != cat.na_value
        categories = categories[mask]
        counts = counts[mask]

    # Filtering may have removed every category (all values were NA).
    if categories.size == 0:
        return pd.Series(dtype="int64")

    return pd.Series(
        ArkoudaArray._from_sequence(counts),
        index=ArkoudaStringArray._from_sequence(categories),
    )

# ------------------------------------------------------------------
# pandas.Categorical-specific API that is not yet implemented
# ------------------------------------------------------------------
Expand Down Expand Up @@ -252,6 +326,3 @@ def max(self, *args, **kwargs):

def min(self, *args, **kwargs):
    """
    Placeholder for ``min`` on categorical arrays.

    All positional and keyword arguments are accepted and ignored; the
    call is forwarded to ``self._categorical_not_implemented`` with the
    method name (that helper is not visible here; it presumably raises
    ``NotImplementedError``).
    """
    method_name = "min"
    self._categorical_not_implemented(method_name)

def value_counts(self, *args, **kwargs):
    """
    Placeholder that reports ``value_counts`` as unsupported.

    All positional and keyword arguments are accepted and ignored; the
    call is forwarded to ``self._categorical_not_implemented`` with the
    method name (that helper is not visible here; it presumably raises
    ``NotImplementedError``).
    """
    # NOTE(review): a full ``value_counts(self, dropna=True)`` implementation
    # appears earlier in this listing. If both definitions live in the same
    # class body, this later stub is the one that takes effect — confirm
    # which of the two is intended to remain.
    method_name = "value_counts"
    self._categorical_not_implemented(method_name)
3 changes: 0 additions & 3 deletions arkouda/pandas/extension/_arkouda_extension_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -827,9 +827,6 @@ def skew(self, *args, **kwargs):
def swapaxes(self, *args, **kwargs):
    """
    Placeholder for ``swapaxes`` on Arkouda-backed extension arrays.

    All positional and keyword arguments are accepted and ignored; the
    call is forwarded to ``self._reduction_not_implemented`` with the
    method name (that helper is not visible here; it presumably raises
    ``NotImplementedError``).
    """
    method_name = "swapaxes"
    self._reduction_not_implemented(method_name)

def value_counts(self, *args, **kwargs):
    """
    Placeholder that reports ``value_counts`` as unsupported.

    All positional and keyword arguments are accepted and ignored; the
    call is forwarded to ``self._reduction_not_implemented`` with the
    method name (that helper is not visible here; it presumably raises
    ``NotImplementedError``).
    """
    method_name = "value_counts"
    self._reduction_not_implemented(method_name)

# ------------------------------------------------------------------
# String-like methods
# ------------------------------------------------------------------
Expand Down
76 changes: 76 additions & 0 deletions arkouda/pandas/extension/_arkouda_string_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import cast as type_cast

import numpy as np
import pandas as pd

from numpy import ndarray
from pandas.api.extensions import ExtensionArray
Expand Down Expand Up @@ -136,6 +137,81 @@ def __eq__(self, other):
def __repr__(self):
    """Return a debug string that embeds the wrapped ``_data`` object."""
    rendered = f"ArkoudaStringArray({self._data})"
    return rendered

def value_counts(self, dropna: bool = True) -> pd.Series:
    """
    Return counts of unique strings as a pandas Series.

    The distinct string values of the underlying Arkouda ``Strings``
    object are grouped and counted, and the result is returned as a
    pandas ``Series`` with the unique strings as the index and their
    counts as the data.

    Parameters
    ----------
    dropna : bool
        Accepted for pandas compatibility only. Missing-value handling
        for Arkouda string arrays is not yet implemented, so this
        parameter currently has no effect. Default is True.

    Returns
    -------
    pd.Series
        A Series containing the counts of unique string values. For a
        non-empty input the index is built with
        ``ArkoudaStringArray._from_sequence`` from the unique values and
        the data with ``ArkoudaArray._from_sequence`` from the counts.
        An empty input yields an empty ``int64`` Series.

    Notes
    -----
    - The following pandas options are not yet implemented:
      ``normalize``, ``sort``, and ``bins``.
    - The grouping and counting are delegated to Arkouda's ``GroupBy``;
      only the unique values and counts are placed in the result.

    Examples
    --------
    Basic usage:

    >>> import arkouda as ak
    >>> from arkouda.pandas.extension import ArkoudaStringArray
    >>>
    >>> s = ArkoudaStringArray(["red", "blue", "red", "green", "blue", "red"])
    >>> s.value_counts()
    red      3
    blue     2
    green    1
    dtype: int64

    Empty input:

    >>> empty = ArkoudaStringArray([])
    >>> empty.value_counts()
    Series([], dtype: int64)
    """
    # NOTE(review): these are imported inside the function, presumably to
    # avoid a circular import with the extension package — confirm.
    from arkouda.numpy.strings import Strings
    from arkouda.pandas.extension import ArkoudaArray, ArkoudaStringArray
    from arkouda.pandas.groupbyclass import GroupBy

    s = self._data

    if s.size == 0:
        return pd.Series(dtype="int64")

    # A non-empty input always produces at least one group, so no second
    # emptiness check is needed after grouping.
    values, counts = GroupBy(s).size()

    # For type checking: narrows the grouped keys to ``Strings``.
    assert isinstance(values, Strings)

    return pd.Series(
        ArkoudaArray._from_sequence(counts),
        index=ArkoudaStringArray._from_sequence(values),
    )

def _not_implemented(self, name: str):
    """
    Raise ``NotImplementedError`` for an unsupported operation.

    Parameters
    ----------
    name : str
        Name of the operation, embedded in the error message.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = f"`{name}` is not implemented for Arkouda-backed arrays yet."
    raise NotImplementedError(message)

Expand Down
2 changes: 1 addition & 1 deletion make/Dev.mk
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ chplcheck:
# Minimum total coverage (percent) handed to --cov-fail-under below.
# `?=` only assigns when COV_MIN is not already defined, so it can be
# overridden by the caller, e.g. `make coverage COV_MIN=90`.
COV_MIN ?= 100
# `coverage` names a command, not a file to build.
.PHONY: coverage
# Run the pytest suite with coverage measurement: missing lines are printed
# to the terminal, an XML report is written to coverage.xml, and the run
# fails if total coverage is below COV_MIN.
# NOTE(review): the two recipe lines below differ only in the --cov target
# ($(ARKOUDA_PROJECT_DIR)/arkouda vs. arkouda); they look like the before/after
# variants of a single command — confirm only one is meant to remain.
coverage:
python3 -m pytest -c pytest.ini --cov=$(ARKOUDA_PROJECT_DIR)/arkouda --cov-report=term-missing --cov-report=xml:coverage.xml --cov-fail-under=$(COV_MIN) --size=$(size) $(ARKOUDA_PYTEST_OPTIONS) --skip_doctest="True"
python3 -m pytest -c pytest.ini --cov=arkouda --cov-report=term-missing --cov-report=xml:coverage.xml --cov-fail-under=$(COV_MIN) --size=$(size) $(ARKOUDA_PYTEST_OPTIONS) --skip_doctest="True"
84 changes: 84 additions & 0 deletions tests/pandas/extension/arkouda_array_extension.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import math

import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -815,3 +817,85 @@ def test_reduce_accepts_skipna_flag(self, skipna):
arr = ArkoudaArray(ak.array([1.0, np.nan, 2.0]))
# whichever semantics you currently implement, it should not error
_ = arr._reduce("sum", skipna=skipna)


class TestArkoudaArrayValueCounts:
    """Tests for ``ArkoudaArray.value_counts`` across the supported dtypes."""

    def _series_to_pycounts(self, s: pd.Series) -> dict:
        """
        Convert the returned Series to a plain Python {value: count} mapping.

        This avoids relying on ordering and avoids depending on whether the
        Series holds Arkouda-backed values vs NumPy-backed values.
        """
        # Index and values may be Arkouda-backed; go through NumPy and pair
        # them up positionally instead of indexing by range(len(...)).
        keys = s.index.to_numpy()
        counts = s.to_numpy()
        return {key: int(count) for key, count in zip(keys, counts)}

    def test_value_counts_int64_basic(self):
        """int64 data: each distinct value is reported with its frequency."""
        a = ArkoudaArray(ak.array([1, 2, 1, 3, 2, 1], dtype="int64"))
        out = a.value_counts()

        got = self._series_to_pycounts(out)
        assert got == {1: 3, 2: 2, 3: 1}

    def test_value_counts_uint64_basic(self):
        """uint64 data behaves the same as int64."""
        a = ArkoudaArray(ak.array([1, 2, 1, 3, 2, 1], dtype="uint64"))
        out = a.value_counts()

        got = self._series_to_pycounts(out)
        assert got == {1: 3, 2: 2, 3: 1}

    def test_value_counts_bool_basic(self):
        """bool data: True and False are counted separately."""
        a = ArkoudaArray(ak.array([True, False, True, True], dtype="bool"))
        out = a.value_counts()

        got = self._series_to_pycounts(out)
        assert got == {True: 3, False: 1}

    def test_value_counts_float64_dropna_true_excludes_nan(self):
        """float64 data with dropna=True: NaN must not show up as a key."""
        a = ArkoudaArray(ak.array([1.0, 2.0, float("nan"), 1.0], dtype="float64"))
        out = a.value_counts(dropna=True)

        got = self._series_to_pycounts(out)

        # NaN should not appear when dropna=True
        assert 1.0 in got and got[1.0] == 2
        assert 2.0 in got and got[2.0] == 1
        assert not any(isinstance(k, float) and math.isnan(k) for k in got)

    def test_value_counts_empty_returns_empty_series(self):
        """Empty input yields an empty pandas Series rather than an error."""
        a = ArkoudaArray(ak.array([], dtype="int64"))
        out = a.value_counts()

        assert isinstance(out, pd.Series)
        assert len(out) == 0

    def test_value_counts_matches_pandas_counts_as_multiset(self):
        """Cross-check correctness against pandas value_counts, ignoring ordering."""
        data = [3, 1, 2, 3, 3, 2, 1, 4, 4, 4, 4]
        a = ArkoudaArray(ak.array(data, dtype="int64"))
        out = a.value_counts()

        got = self._series_to_pycounts(out)
        expected = pd.Series(data).value_counts(dropna=True).to_dict()

        # pandas dict keys are python ints; compare directly
        assert got == {int(k): int(v) for k, v in expected.items()}

    def test_arkoudaarray_value_counts_dropna_true_excludes_nan(self):
        """Same NaN check, but constructing the array through ``pd.array``."""
        # float64 with NaNs present
        arr = pd.array([1.0, np.nan, 2.0, np.nan, 2.0], dtype="ak_float64")

        vc = arr.value_counts(dropna=True)

        # Regression guard: if NaN were still counted it would add a third
        # entry and this length check would fail.
        assert len(vc) == 2

        # Ensure NaN isn't present as an index entry
        # (robust across different index container types)
        assert not any(pd.isna(x) for x in vc.index.to_numpy())

        # And the numeric counts are correct (order-independent)
        got = dict(zip(vc.index.to_numpy(), vc.to_numpy()))
        assert got == {1.0: 1, 2.0: 2}
Loading