From 5caa0ba6a490b99eee612686fdcec705b8892401 Mon Sep 17 00:00:00 2001 From: Vassily Litvinov Date: Tue, 14 Jan 2025 19:14:08 -0800 Subject: [PATCH] Add 'range' argument to histogram*() Signed-off-by: Vassily Litvinov --- Makefile | 56 ++++++++-------- arkouda/numpy/numeric.py | 98 ++++++++++++++++++++++++--- src/Histogram.chpl | 108 ++++++++++++++++-------------- src/HistogramMsg.chpl | 130 +++++++++++++++--------------------- tests/numpy/numeric_test.py | 29 ++++++-- 5 files changed, 249 insertions(+), 172 deletions(-) diff --git a/Makefile b/Makefile index 569e8baa666..c97af3edccf 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # Makefile for Arkouda ARKOUDA_PROJECT_DIR := $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) +ARKOUDA_PROJECT_DIR := $(patsubst %/,%,$(ARKOUDA_PROJECT_DIR)) PROJECT_NAME := arkouda ARKOUDA_SOURCE_DIR := $(ARKOUDA_PROJECT_DIR)/src @@ -114,7 +115,7 @@ deps-download-source: zmq-download-source hdf5-download-source arrow-download-so DEP_DIR := dep DEP_INSTALL_DIR := $(ARKOUDA_PROJECT_DIR)/$(DEP_DIR) -DEP_BUILD_DIR := $(ARKOUDA_PROJECT_DIR)$(DEP_DIR)/build +DEP_BUILD_DIR := $(ARKOUDA_PROJECT_DIR)/$(DEP_DIR)/build ZMQ_VER := 4.3.5 ZMQ_NAME_VER := zeromq-$(ZMQ_VER) @@ -128,8 +129,8 @@ zmq-download-source: ifeq (,$(wildcard ${ZMQ_BUILD_DIR}*/.*)) # If the tar.gz not found, download it ifeq (,$(wildcard ${DEP_BUILD_DIR}/${ZMQ_NAME_VER}*.tar.gz)) - cd $(DEP_BUILD_DIR) && curl -sL $(ZMQ_LINK) | tar xz - # Otherwise just unzip it + cd $(DEP_BUILD_DIR) && curl -sL $(ZMQ_LINK) | tar xz + # Otherwise just unzip it else cd $(DEP_BUILD_DIR) && tar -xzf $(ZMQ_NAME_VER)*.tar.gz endif @@ -166,18 +167,18 @@ hdf5-download-source: ifeq (,$(wildcard ${HDF5_BUILD_DIR}*/.*)) # If the tar.gz not found, download it ifeq (,$(wildcard ${DEP_BUILD_DIR}/$(HDF5_NAME_VER)*tar.gz)) - cd $(DEP_BUILD_DIR) && curl -sL $(HDF5_LINK) | tar xz - # Otherwise just unzip it + cd $(DEP_BUILD_DIR) && curl -sL $(HDF5_LINK) | tar xz + # Otherwise just unzip it else cd $(DEP_BUILD_DIR) && tar -xzf $(HDF5_NAME_VER)*.tar.gz endif - endif + endif install-hdf5: hdf5-download-source @echo "Installing HDF5" rm -rf $(HDF5_INSTALL_DIR) mkdir -p $(DEP_INSTALL_DIR) $(DEP_BUILD_DIR) - + cd $(HDF5_BUILD_DIR)* && ./configure --prefix=$(HDF5_INSTALL_DIR) --enable-optimization=high --enable-hl && make && make install echo '$$(eval $$(call add-path,$(HDF5_INSTALL_DIR)))' >> Makefile.paths @@ -218,9 +219,8 @@ arrow-download-source: mkdir -p $(ARROW_DEP_DIR) cd $(ARROW_BUILD_DIR)/cpp/thirdparty/ && ./download_dependencies.sh $(ARROW_DEP_DIR) > $(DEP_BUILD_DIR)/arrow_exports.sh endif - + rm -fr $(ARROW_BUILD_DIR) - install-arrow: arrow-download-source @echo "Installing Apache Arrow/Parquet" @@ -230,16 +230,16 @@ install-arrow: arrow-download-source cd $(DEP_BUILD_DIR) && tar -xvf $(ARROW_NAME_VER).tar.gz mkdir -p $(ARROW_BUILD_DIR)/cpp/build-release - + cd $(DEP_BUILD_DIR) && . ./arrow_exports.sh && cd $(ARROW_BUILD_DIR)/cpp/build-release && cmake -S $(ARROW_BUILD_DIR)/cpp .. -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(ARROW_INSTALL_DIR) -DCMAKE_BUILD_TYPE=Release -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_LZ4=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON -DARROW_DEPENDENCY_SOURCE=$(ARROW_DEPENDENCY_SOURCE) $(ARROW_OPTIONS) && make -j$(NUM_CORES) cd $(ARROW_BUILD_DIR)/cpp/build-release && make install - - echo '$$(eval $$(call add-path,$(ARROW_INSTALL_DIR)))' >> Makefile.paths + + echo '$$(eval $$(call add-path,$(ARROW_INSTALL_DIR)))' >> Makefile.paths arrow-clean: rm -rf $(DEP_BUILD_DIR)/apache-arrow* - rm -rf $(DEP_BUILD_DIR)/arrow-apache-arrow* + rm -rf $(DEP_BUILD_DIR)/arrow-apache-arrow* rm -rf $(ARROW_DEP_DIR) rm -fr $(DEP_BUILD_DIR)/arrow_exports.sh @@ -268,23 +268,23 @@ ICONV_LINK := https://ftp.gnu.org/pub/gnu/libiconv/libiconv-$(ICONV_VER).tar.gz iconv-download-source: mkdir -p $(DEP_BUILD_DIR) - + #If the build directory does not exist, create it ifeq (,$(wildcard ${ICONV_BUILD_DIR}*/.*)) # If the tar.gz not found, download it ifeq (,$(wildcard ${DEP_BUILD_DIR}/libiconv-${ICONV_VER}.tar.gz)) - cd $(DEP_BUILD_DIR) && curl -sL $(ICONV_LINK) | tar xz - # Otherwise just unzip it + cd $(DEP_BUILD_DIR) && curl -sL $(ICONV_LINK) | tar xz + # Otherwise just unzip it else cd $(DEP_BUILD_DIR) && tar -xzf libiconv-$(ICONV_VER).tar.gz endif - endif - + endif + install-iconv: iconv-download-source @echo "Installing iconv" rm -rf $(ICONV_INSTALL_DIR) mkdir -p $(DEP_INSTALL_DIR) $(DEP_BUILD_DIR) - + cd $(ICONV_BUILD_DIR) && ./configure --prefix=$(ICONV_INSTALL_DIR) && make && make install echo '$$(eval $$(call add-path,$(ICONV_INSTALL_DIR)))' >> Makefile.paths @@ -299,23 +299,23 @@ LIBIDN_LINK := https://ftp.gnu.org/gnu/libidn/libidn2-$(LIBIDN_VER).tar.gz idn2-download-source: mkdir -p $(DEP_BUILD_DIR) - + #If the build directory does not exist, create it ifeq (,$(wildcard $(LIBIDN_BUILD_DIR)*/.*)) - # If the tar.gz is not found, download it + # If the tar.gz is not found, download it ifeq (,$(wildcard ${DEP_BUILD_DIR}/libidn2-$(LIBIDN_VER)*.tar.gz)) - cd $(DEP_BUILD_DIR) && curl -sL $(LIBIDN_LINK) | tar xz - # Otherwise just unzip it + cd $(DEP_BUILD_DIR) && curl -sL $(LIBIDN_LINK) | tar xz + # Otherwise just unzip it else cd $(DEP_BUILD_DIR) && tar -xzf libidn2-$(LIBIDN_VER)*.tar.gz endif - endif + endif install-idn2: idn2-download-source @echo "Installing libidn2" rm -rf $(LIBIDN_INSTALL_DIR) - mkdir -p $(DEP_INSTALL_DIR) $(DEP_BUILD_DIR) - + mkdir -p $(DEP_INSTALL_DIR) $(DEP_BUILD_DIR) + cd $(LIBIDN_BUILD_DIR) && ./configure --prefix=$(LIBIDN_INSTALL_DIR) && make && make install echo '$$(eval $$(call add-path,$(LIBIDN_INSTALL_DIR)))' >> Makefile.paths @@ -327,7 +327,7 @@ BLOSC_INSTALL_DIR := $(DEP_INSTALL_DIR)/c-blosc-install blosc-download-source: mkdir -p $(DEP_BUILD_DIR) - + #If the build directory does not exist, create it ifeq (,$(wildcard $(BLOSC_BUILD_DIR)/.*)) cd $(DEP_BUILD_DIR) && git clone https://github.com/Blosc/c-blosc2.git @@ -760,8 +760,6 @@ benchmark: python3 -m pytest -c benchmark.ini --benchmark-autosave --benchmark-storage=file://benchmark_v2/.benchmarks --size=$(size_bm) --benchmark-json=$(out) python3 benchmark_v2/reformat_benchmark_results.py --benchmark-data $(out) - - version: @echo $(VERSION); diff --git a/arkouda/numpy/numeric.py b/arkouda/numpy/numeric.py index 233d5a59bbf..9d7e40f8682 100644 --- a/arkouda/numpy/numeric.py +++ b/arkouda/numpy/numeric.py @@ -1772,8 +1772,27 @@ def where( return create_pdarray(type_cast(str, repMsg)) +# histogram helper +def _pyrange(count): + """Simply makes a range(count). For use in histogram* functions + that, like in numpy, have a 'range' parameter.""" + return range(count) + + +# histogram helper, to avoid typechecker errors +def _conv_dim(sampleDim, rangeDim): + if rangeDim: + return (rangeDim[0], rangeDim[1]) + else: + return (sampleDim.min(), sampleDim.max()) + + @typechecked -def histogram(pda: pdarray, bins: int_scalars = 10) -> Tuple[pdarray, pdarray]: +def histogram( + pda: pdarray, + bins: int_scalars = 10, + range: Optional[Tuple[numeric_scalars, numeric_scalars]] = None, +) -> Tuple[pdarray, pdarray]: """ Compute a histogram of evenly spaced bins over the range of an array. @@ -1785,6 +1804,11 @@ def histogram(pda: pdarray, bins: int_scalars = 10) -> Tuple[pdarray, pdarray]: bins : int_scalars, default=10 The number of equal-size bins to use (default: 10) + range : (minVal, maxVal), optional + The range of the values to count. + Values outside of this range are dropped. + By default, all values are counted. + Returns ------- (pdarray, Union[pdarray, int64 or float64]) @@ -1807,6 +1831,7 @@ def histogram(pda: pdarray, bins: int_scalars = 10) -> Tuple[pdarray, pdarray]: Notes ----- The bins are evenly spaced in the interval [pda.min(), pda.max()]. + If range parameter is provided, the interval is [range[0], range[1]]. Examples -------- @@ -1828,15 +1853,25 @@ def histogram(pda: pdarray, bins: int_scalars = 10) -> Tuple[pdarray, pdarray]: """ if bins < 1: raise ValueError("bins must be 1 or greater") - b = linspace(pda.min(), pda.max(), bins + 1) - repMsg = generic_msg(cmd="histogram", args={"array": pda, "bins": bins}) + + minVal, maxVal = _conv_dim(pda, range) + + b = linspace(minVal, maxVal, bins + 1) + repMsg = generic_msg( + cmd="histogram", args={"array": pda, "bins": bins, "minVal": minVal, "maxVal": maxVal} + ) return create_pdarray(type_cast(str, repMsg)), b # Typechecking removed due to circular dependencies with arrayview # @typechecked def histogram2d( - x: pdarray, y: pdarray, bins: Union[int_scalars, Sequence[int_scalars]] = 10 + x: pdarray, + y: pdarray, + bins: Union[int_scalars, Sequence[int_scalars]] = 10, + range: Optional[ + Tuple[Tuple[numeric_scalars, numeric_scalars], Tuple[numeric_scalars, numeric_scalars]] + ] = None, ) -> Tuple[pdarray, pdarray, pdarray]: """ Compute the bi-dimensional histogram of two data samples with evenly spaced bins @@ -1855,6 +1890,11 @@ def histogram2d( If [int, int], the number of bins in each dimension (nx, ny = bins). Defaults to 10 + range : ((xMin, xMax), (yMin, yMax)), optional + The ranges of the values in x and y to count. + Values outside of these ranges are dropped. + By default, all values are counted. + Returns ------- hist : pdarray @@ -1887,6 +1927,8 @@ def histogram2d( ----- The x bins are evenly spaced in the interval [x.min(), x.max()] and y bins are evenly spaced in the interval [y.min(), y.max()]. + If range parameter is provided, the intervals are given + by range[0] for x and range[1] for y.. Examples -------- @@ -1909,11 +1951,28 @@ def histogram2d( if len(bins) != 2: raise ValueError("Sequences of bins must contain two elements (num_x_bins, num_y_bins)") x_bins, y_bins = bins + x_bins, y_bins = int(x_bins), int(y_bins) if x_bins < 1 or y_bins < 1: raise ValueError("bins must be 1 or greater") - x_bin_boundaries = linspace(x.min(), x.max(), x_bins + 1) - y_bin_boundaries = linspace(y.min(), y.max(), y_bins + 1) - repMsg = generic_msg(cmd="histogram2D", args={"x": x, "y": y, "xBins": x_bins, "yBins": y_bins}) + + xMin, xMax = _conv_dim(x, range[0] if range else None) + yMin, yMax = _conv_dim(y, range[1] if range else None) + + x_bin_boundaries = linspace(xMin, xMax, x_bins + 1) + y_bin_boundaries = linspace(yMin, yMax, y_bins + 1) + repMsg = generic_msg( + cmd="histogram2D", + args={ + "x": x, + "y": y, + "xBins": x_bins, + "yBins": y_bins, + "xMin": xMin, + "xMax": xMax, + "yMin": yMin, + "yMax": yMax, + }, + ) return ( create_pdarray(type_cast(str, repMsg)).reshape(x_bins, y_bins), x_bin_boundaries, @@ -1922,7 +1981,9 @@ def histogram2d( def histogramdd( - sample: Sequence[pdarray], bins: Union[int_scalars, Sequence[int_scalars]] = 10 + sample: Sequence[pdarray], + bins: Union[int_scalars, Sequence[int_scalars]] = 10, + range: Optional[Sequence[Optional[Tuple[numeric_scalars, numeric_scalars]]]] = None, ) -> Tuple[pdarray, Sequence[pdarray]]: """ Compute the multidimensional histogram of data in sample with evenly spaced bins. @@ -1938,6 +1999,11 @@ def histogramdd( If [int, int, ...], the number of bins in each dimension (nx, ny, ... = bins). Defaults to 10 + range : Sequence[optional (minVal, maxVal)], optional + The ranges of the values to count for each array in sample. + Values outside of these ranges are dropped. + By default, all values are counted. + Returns ------- hist : pdarray @@ -1964,6 +2030,7 @@ def histogramdd( Notes ----- The bins for each dimension, m, are evenly spaced in the interval [m.min(), m.max()] + or in the inverval determined by range[dimension], if provided. Examples -------- @@ -1996,17 +2063,26 @@ def histogramdd( if any(b < 1 for b in bins): raise ValueError("bins must be 1 or greater") + if not range: + range = [None for pda in sample] + elif len(range) != num_dims: + raise ValueError("The range sequence contains a different number of elements than the sample") + + range_list = [_conv_dim(sample[i], range[i]) for i in _pyrange(num_dims)] + bins = list(bins) if isinstance(bins, tuple) else bins sample = list(sample) if isinstance(sample, tuple) else sample - bin_boundaries = [linspace(a.min(), a.max(), b + 1) for a, b in zip(sample, bins)] - bins_pda = array(bins)[::-1] - dim_prod = (cumprod(bins_pda) // bins_pda)[::-1] + bin_boundaries = [linspace(r[0], r[1], b + 1) for r, b in zip(range_list, bins)] + d_curr, d_next = 1, 1 + dim_prod = [(d_curr := d_next, d_next := d_curr * int(v))[0] for v in bins[::-1]][::-1] # noqa: F841 repMsg = generic_msg( cmd="histogramdD", args={ "sample": sample, "num_dims": num_dims, "bins": bins, + "rangeMin": [r[0] for r in range_list], + "rangeMax": [r[1] for r in range_list], "dim_prod": dim_prod, "num_samples": sample[0].size, }, diff --git a/src/Histogram.chpl b/src/Histogram.chpl index 53870bc466c..1fcc4eb4465 100644 --- a/src/Histogram.chpl +++ b/src/Histogram.chpl @@ -2,9 +2,6 @@ module Histogram { use ServerConfig; - use Time; - use Math only; - use PrivateDist; use SymArrayDmap; use Logging; @@ -14,6 +11,27 @@ module Histogram private config const logChannel = ServerConfig.logChannel; const hgLogger = new Logger(logLevel, logChannel); + /* Helper: is this value within the requested range? */ + inline proc histValWithinRange(val, aMin, aMax): bool { + return aMin <= val && val <= aMax; + } + + /* Helper: computes the 1-d bin number for the value, assuming it is within the range. */ + inline proc histValToBin(val, aMin, aMax, numBins, binWidth: real): int { + return if val == aMax then numBins-1 else ((val - aMin) / binWidth): int; + } + + /* Helper: converts (x,y) into a bin number for a 2-d histogram. + Returns -1 if (x,y) is outside the requested range. */ + private proc valsToBin(xi, yi, xMin, xMax, yMin, yMax, + numXBins, numYBins, xBinWidth, yBinWidth): int { + if ! histValWithinRange(xi, xMin, xMax) then return -1; + if ! histValWithinRange(yi, yMin, yMax) then return -1; + const xiBin = histValToBin(xi, xMin, xMax, numXBins, xBinWidth); + const yiBin = histValToBin(yi, yMin, yMax, numYBins, yBinWidth); + return (xiBin * numYBins) + yiBin; + } + /* Takes the data in array a, creates an atomic histogram in parallel, and copies the result of the histogram operation into a distributed int array @@ -23,11 +41,9 @@ module Histogram :arg a: array of data to be histogrammed :type a: [] ?etype - :arg aMin: Min value in array a - :type aMin: etype + :arg aMin: Min value to count - :arg aMax: Max value in array a - :type aMax: etype + :arg aMax: Max value to count :arg bins: allocate size of the histogram's distributed domain :type bins: int @@ -37,19 +53,17 @@ module Histogram :returns: [] int */ - proc histogramGlobalAtomic(a: [?aD] ?etype, aMin: etype, aMax: etype, bins: int, binWidth: real) throws { + proc histogramGlobalAtomic(a: [?aD] ?etype, aMin, aMax, bins: int, binWidth: real) throws { var hD = makeDistDom(bins); var atomicHist: [hD] atomic int; // count into atomic histogram forall v in a { - var vBin = ((v - aMin) / binWidth):int; - if v == aMax {vBin = bins-1;} - if (vBin < 0) || (vBin > (bins-1)) { - try! hgLogger.error(getModuleName(),getRoutineName(),getLineNumber(),"OOB"); - } + if histValWithinRange(v, aMin, aMax) { + const vBin = histValToBin(v, aMin, aMax, bins, binWidth); atomicHist[vBin].add(1); + } } var hist = makeDistArray(bins,int); @@ -61,21 +75,17 @@ module Histogram return hist; } - proc histogramGlobalAtomic(x: [?aD] ?etype1, y: [aD] ?etype2, xMin: etype1, xMax: etype1, yMin: etype2, yMax: etype2, numXBins: int, numYBins: int, xBinWidth: real, yBinWidth: real) throws { + proc histogramGlobalAtomic(x: [?aD] ?etype1, y: [aD] ?etype2, xMin, xMax, yMin, yMax, numXBins: int, numYBins: int, xBinWidth: real, yBinWidth: real) throws { const totNumBins = numXBins * numYBins; var hD = makeDistDom(totNumBins); var atomicHist: [hD] atomic real; // count into atomic histogram forall (xi, yi) in zip(x, y) { - var xiBin = ((xi - xMin) / xBinWidth):int; - var yiBin = ((yi - yMin) / yBinWidth):int; - if xi == xMax {xiBin = numXBins-1;} - if yi == yMax {yiBin = numYBins-1;} - if xiBin < 0 || yiBin < 0 || (xiBin > (numXBins-1)) || (yiBin > (numYBins-1)) { - try! hgLogger.error(getModuleName(),getRoutineName(),getLineNumber(),"OOB"); - } - atomicHist[(xiBin * numYBins) + yiBin].add(1); + const bin = valsToBin(xi, yi, xMin, xMax, yMin, yMax, + numXBins, numYBins, xBinWidth, yBinWidth); + if bin < 0 then continue; + atomicHist[bin].add(1); } var hist = makeDistArray(totNumBins,real); @@ -96,11 +106,9 @@ module Histogram :arg a: array of data to be histogrammed :type a: [] ?etype - :arg aMin: Min value in array a - :type aMin: etype + :arg aMin: Min value to count - :arg aMax: Max value in array a - :type aMax: etype + :arg aMax: Max value to count :arg bins: allocate size of the histogram's distributed domain :type bins: int @@ -110,16 +118,17 @@ module Histogram :returns: [] int */ - proc histogramLocalAtomic(a: [?aD] ?etype, aMin: etype, aMax: etype, bins: int, binWidth: real) throws { + proc histogramLocalAtomic(a: [?aD] ?etype, aMin, aMax, bins: int, binWidth: real) throws { // allocate per-locale atomic histogram var atomicHist: [PrivateSpace] [0..#bins] atomic int; // count into per-locale private atomic histogram forall v in a { - var vBin = ((v - aMin) / binWidth):int; - if v == aMax {vBin = bins-1;} + if histValWithinRange(v, aMin, aMax) { + const vBin = histValToBin(v, aMin, aMax, bins, binWidth); atomicHist[here.id][vBin].add(1); + } } // +reduce across per-locale histograms to get counts @@ -127,23 +136,22 @@ module Histogram forall i in PrivateSpace with (+ reduce lHist) do lHist reduce= atomicHist[i].read(); - var hist = makeDistArray(bins,int); + var hist = makeDistArray(bins,int); hist = lHist; return hist; } - proc histogramLocalAtomic(x: [?aD] ?etype1, y: [aD] ?etype2, xMin: etype1, xMax: etype1, yMin: etype2, yMax: etype2, numXBins: int, numYBins: int, xBinWidth: real, yBinWidth: real) throws { + proc histogramLocalAtomic(x: [?aD] ?etype1, y: [aD] ?etype2, xMin, xMax, yMin, yMax, numXBins: int, numYBins: int, xBinWidth: real, yBinWidth: real) throws { const totNumBins = numXBins * numYBins; // allocate per-locale atomic histogram var atomicHist: [PrivateSpace] [0..#totNumBins] atomic real; // count into per-locale private atomic histogram forall (xi, yi) in zip(x, y) { - var xiBin = ((xi - xMin) / xBinWidth):int; - var yiBin = ((yi - yMin) / yBinWidth):int; - if xi == xMax {xiBin = numXBins-1;} - if yi == yMax {yiBin = numYBins-1;} - atomicHist[here.id][(xiBin * numYBins) + yiBin].add(1); + const bin = valsToBin(xi, yi, xMin, xMax, yMin, yMax, + numXBins, numYBins, xBinWidth, yBinWidth); + if bin < 0 then continue; + atomicHist[here.id][bin].add(1); } // +reduce across per-locale histograms to get counts @@ -165,11 +173,9 @@ module Histogram :arg a: array of data to be histogrammed :type a: [] ?etype - :arg aMin: Min value in array a - :type aMin: etype + :arg aMin: Min value to count - :arg aMax: Max value in array a - :type aMax: etype + :arg aMax: Max value to count :arg bins: allocate size of the histogram's distributed domain :type bins: int @@ -179,33 +185,33 @@ module Histogram :returns: [] int */ - proc histogramReduceIntent(a: [?aD] ?etype, aMin: etype, aMax: etype, bins: int, binWidth: real) throws { + proc histogramReduceIntent(a: [?aD] ?etype, aMin, aMax, bins: int, binWidth: real) throws { var gHist: [0..#bins] int; - + // count into per-task/per-locale histogram and then reduce as tasks complete forall v in a with (+ reduce gHist) { - var vBin = ((v - aMin) / binWidth):int; - if v == aMax {vBin = bins-1;} + if histValWithinRange(v, aMin, aMax) { + const vBin = histValToBin(v, aMin, aMax, bins, binWidth); gHist[vBin] += 1; + } } - var hist = makeDistArray(bins,int); + var hist = makeDistArray(bins,int); hist = gHist; return hist; } - proc histogramReduceIntent(x: [?aD] ?etype1, y: [aD] ?etype2, xMin: etype1, xMax: etype1, yMin: etype2, yMax: etype2, numXBins: int, numYBins: int, xBinWidth: real, yBinWidth: real) throws { + proc histogramReduceIntent(x: [?aD] ?etype1, y: [aD] ?etype2, xMin, xMax, yMin, yMax, numXBins: int, numYBins: int, xBinWidth: real, yBinWidth: real) throws { const totNumBins = numXBins * numYBins; var gHist: [0..#totNumBins] real; // count into per-task/per-locale histogram and then reduce as tasks complete forall (xi, yi) in zip(x, y) with (+ reduce gHist) { - var xiBin = ((xi - xMin) / xBinWidth):int; - var yiBin = ((yi - yMin) / yBinWidth):int; - if xi == xMax {xiBin = numXBins-1;} - if yi == yMax {yiBin = numYBins-1;} - gHist[(xiBin * numYBins) + yiBin] += 1; + const bin = valsToBin(xi, yi, xMin, xMax, yMin, yMax, + numXBins, numYBins, xBinWidth, yBinWidth); + if bin < 0 then continue; + gHist[bin] += 1; } var hist = makeDistArray(totNumBins,real); diff --git a/src/HistogramMsg.chpl b/src/HistogramMsg.chpl index 12f981a9e3f..46f0e20d73a 100644 --- a/src/HistogramMsg.chpl +++ b/src/HistogramMsg.chpl @@ -6,18 +6,18 @@ module HistogramMsg use ServerErrors; use Logging; use Message; - + use MultiTypeSymbolTable; use MultiTypeSymEntry; use ServerErrorStrings; use Histogram; use Message; - + private config const logLevel = ServerConfig.logLevel; private config const logChannel = ServerConfig.logChannel; const hgmLogger = new Logger(logLevel, logChannel); - + private config const sBound = 2**12; private config const mBound = 2**25; @@ -27,7 +27,7 @@ module HistogramMsg var repMsg: string; // response message const bins = msgArgs.get("bins").getIntValue(); const name = msgArgs.getValueOf("array"); - + // get next symbol name var rname = st.nextName(); hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), @@ -38,8 +38,8 @@ module HistogramMsg // helper nested procedure proc histogramHelper(type t) throws { var e = toSymEntry(gEnt,t); - var aMin = min reduce e.a; - var aMax = max reduce e.a; + var aMin = msgArgs.get("minVal").toScalar(real); + var aMax = msgArgs.get("maxVal").toScalar(real); var binWidth:real = (aMax - aMin):real / bins:real; hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "binWidth %r".format(binWidth)); @@ -70,11 +70,11 @@ module HistogramMsg when (DType.Float64) {histogramHelper(real);} otherwise { var errorMsg = notImplementedError(pn,gEnt.dtype); - hgmLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); - return new MsgTuple(errorMsg, MsgType.ERROR); + hgmLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); + return new MsgTuple(errorMsg, MsgType.ERROR); } } - + repMsg = "created " + st.attrib(rname); hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return new MsgTuple(repMsg, MsgType.NORMAL); @@ -101,11 +101,11 @@ module HistogramMsg // helper nested procedure proc histogramHelper(type t1, type t2) throws { var x = toSymEntry(xGenEnt,t1); - var y = toSymEntry(yGenEnt,t2); - var xMin = min reduce x.a; - var xMax = max reduce x.a; - var yMin = min reduce y.a; - var yMax = max reduce y.a; + var y = toSymEntry(yGenEnt,t2); + var xMin = msgArgs.get("xMin").toScalar(real); + var xMax = msgArgs.get("xMax").toScalar(real); + var yMin = msgArgs.get("yMin").toScalar(real); + var yMax = msgArgs.get("yMax").toScalar(real); var xBinWidth:real = (xMax - xMin):real / xBins:real; var yBinWidth:real = (yMax - yMin):real / yBins:real; hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), @@ -162,87 +162,80 @@ module HistogramMsg const binsStrs = msgArgs.get("bins").getList(numDims); const bins = try! [b in binsStrs] b:int; const names = msgArgs.get("sample").getList(numDims); - const dimProdName = msgArgs.getValueOf("dim_prod"); + const dimProd = msgArgs.get("dim_prod").toScalarArray(int, numDims); const totNumBins = * reduce bins; - + // get next symbol name var rname = st.nextName(); hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "cmd: %s name: %? bins: %? rname: %s".format(cmd, names, bins, rname)); var gEnts = try! [name in names] getGenericTypedArrayEntry(name, st); - var dimProdGenEnt = getGenericTypedArrayEntry(dimProdName, st); - var dimProd = toSymEntry(dimProdGenEnt,int); // helper nested procedure proc histogramHelper(type t) throws { + var rangeMin = msgArgs.get("rangeMin").toScalarArray(real, numDims); + var rangeMax = msgArgs.get("rangeMax").toScalarArray(real, numDims); var indices = makeDistArray(numSamples, int); - // 3 different implementations depending on size of histogram - // this is due to the time memory tradeoff between creating one/few atomic arrays - // or many non-atomic arrays and reducing them // compute index into flattened array: // for each dimension calculate which bin that value falls into in parallel // we can then scale by the product of the previous dimensions to get that dimensions impact on the flattened index // summing all these together gives the index into the flattened array + hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "compute indices"); + for (gEnt, stride, bin, aMin, aMax) in zip(gEnts, dimProd, bins, rangeMin, rangeMax) { + var e = toSymEntry(gEnt,t); + var binWidth:real = (aMax - aMin):real / bin:real; + forall (v, idx) in zip(e.a, indices) { + if idx < 0 { + // value for this index was out of bounds in a previous sample, skip + } else if ! histValWithinRange(v, aMin, aMax) { + idx = -1; + } else { + const vBin = histValToBin(v, aMin, aMax, bin, binWidth); + idx += (vBin * stride):int; + } + } + } + + // The result will be here. + const histSE = createSymEntry(totNumBins, real); + st.addEntry(rname, histSE); + ref hist = histSE.a; + + // 3 different implementations depending on size of histogram + // this is due to the time memory tradeoff between creating one/few atomic arrays + // or many non-atomic arrays and reducing them if (totNumBins <= sBound) { + // "histogramReduceIntent" // small number of buckets (so histogram is relatively small): // each task gets it's own copy of the histogram and they're reduced hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "%? <= %?".format(bins,sBound)); - for (gEnt, stride, bin) in zip(gEnts, dimProd.a, bins) { - var e = toSymEntry(gEnt,t); - var aMin = min reduce e.a; - var aMax = max reduce e.a; - var binWidth:real = (aMax - aMin):real / bin:real; - forall (v, idx) in zip(e.a, indices) { - var vBin = ((v - aMin) / binWidth):int; - if v == aMax {vBin = bin-1;} - idx += (vBin * stride):int; - } - } var gHist: [0..#totNumBins] int; - + // count into per-task/per-locale histogram and then reduce as tasks complete forall idx in indices with (+ reduce gHist) { + if idx<0 then continue; gHist[idx] += 1; } - var hist = makeDistArray(totNumBins,int); hist = gHist; - st.addEntry(rname, createSymEntry(hist)); } else if (totNumBins <= mBound) { + // "histogramLocalAtomic" // medium number of buckets: // each locale gets it's own atomic copy of the histogram and these are reduced together hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "%? <= %?".format(bins,mBound)); use PrivateDist; - var atomicIdx: [PrivateSpace] [0..#numSamples] atomic int; - - for (gEnt, stride, bin) in zip(gEnts, dimProd.a, bins) { - var e = toSymEntry(gEnt,t); - var aMin = min reduce e.a; - var aMax = max reduce e.a; - var binWidth:real = (aMax - aMin):real / bin:real; - forall (v, i) in zip(e.a, 0..) { - var vBin = ((v - aMin) / binWidth):int; - if v == aMax {vBin = bin-1;} - atomicIdx[here.id][i].add((vBin * stride):int); - } - } - - var lIdx: [0..#numSamples] int; - forall i in PrivateSpace with (+ reduce lIdx) { - lIdx reduce= atomicIdx[i].read(); - } - indices = lIdx; // allocate per-locale atomic histogram var atomicHist: [PrivateSpace] [0..#totNumBins] atomic int; // count into per-locale private atomic histogram forall idx in indices { + if idx<0 then continue; atomicHist[here.id][idx].add(1); } @@ -252,43 +245,26 @@ module HistogramMsg lHist reduce= atomicHist[i].read(); } - var hist = makeDistArray(totNumBins,int); hist = lHist; - st.addEntry(rname, createSymEntry(hist)); } else { + + // "histogramGlobalAtomic" // large number of buckets: // one global atomic histogram hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "%? > %?".format(bins,mBound)); - use PrivateDist; - var atomicIdx: [makeDistDom(numSamples)] atomic int; - - for (gEnt, stride, bin) in zip(gEnts, dimProd.a, bins) { - var e = toSymEntry(gEnt,t); - var aMin = min reduce e.a; - var aMax = max reduce e.a; - var binWidth:real = (aMax - aMin):real / bin:real; - forall (v, i) in zip(e.a, 0..) { - var vBin = ((v - aMin) / binWidth):int; - if v == aMax {vBin = bin-1;} - atomicIdx[i].add((vBin * stride):int); - } - } - - [(i,ai) in zip(indices, atomicIdx)] i = ai.read(); // allocate single global atomic histogram var atomicHist: [makeDistDom(totNumBins)] atomic int; // count into atomic histogram forall idx in indices { + if idx<0 then continue; atomicHist[idx].add(1); } - var hist = makeDistArray(totNumBins,int); // copy from atomic histogram to normal histogram [(e,ae) in zip(hist, atomicHist)] e = ae.read(); - st.addEntry(rname, createSymEntry(hist)); } } @@ -298,11 +274,11 @@ module HistogramMsg when (DType.Float64) {histogramHelper(real);} otherwise { var errorMsg = notImplementedError(pn,gEnts[0].dtype); - hgmLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); - return new MsgTuple(errorMsg, MsgType.ERROR); + hgmLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); + return new MsgTuple(errorMsg, MsgType.ERROR); } } - + repMsg = "created " + st.attrib(rname); hgmLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return new MsgTuple(repMsg, MsgType.NORMAL); diff --git a/tests/numpy/numeric_test.py b/tests/numpy/numeric_test.py index c119a15be10..88e3596bf32 100644 --- a/tests/numpy/numeric_test.py +++ b/tests/numpy/numeric_test.py @@ -295,6 +295,12 @@ def test_histogram(self, num_type): with pytest.raises(TypeError): ak.histogram(np.array([range(0, 10)]).astype(num_type), bins="1") + # add 'range' + ak_result, ak_bins = ak.histogram(pda, bins=20, range=(15, 20)) + np_result, np_bins = np.histogram(np.array(pda.to_list()), bins=20, range=(15, 20)) + assert np.allclose(ak_result.to_list(), np_result.tolist()) + assert np.allclose(ak_bins.to_list(), np_bins.tolist()) + # log and exp tests were identical, and so have been combined. @pytest.mark.skipif(pytest.host == "horizon", reason="Fails on horizon") @@ -304,11 +310,10 @@ def test_histogram(self, num_type): def test_histogram_multidim(self, num_type1, num_type2): # test 2d histogram seed = 1 - ak_x, ak_y = ( - ak.randint(1, 100, 1000, seed=seed, dtype=num_type1), - ak.randint(1, 100, 1000, seed=seed + 1, dtype=num_type2), - ) + ak_x = ak.randint(1, 100, 1000, seed=seed, dtype=num_type1) + ak_y = ak.randint(1, 100, 1000, seed=seed + 1, dtype=num_type2) np_x, np_y = ak_x.to_ndarray(), ak_y.to_ndarray() + np_hist, np_x_edges, np_y_edges = np.histogram2d(np_x, np_y) ak_hist, ak_x_edges, ak_y_edges = ak.histogram2d(ak_x, ak_y) assert np.allclose(np_hist.tolist(), ak_hist.to_list()) @@ -321,6 +326,13 @@ def test_histogram_multidim(self, num_type1, num_type2): assert np.allclose(np_x_edges.tolist(), ak_x_edges.to_list()) assert np.allclose(np_y_edges.tolist(), ak_y_edges.to_list()) + # add 'range' + np_hist, np_x_edges, np_y_edges = np.histogram2d(np_x, np_y, range=((25, 92), (15, 86))) + ak_hist, ak_x_edges, ak_y_edges = ak.histogram2d(ak_x, ak_y, range=((25, 92), (15, 86))) + assert np.allclose(np_hist.tolist(), ak_hist.to_list()) + assert np.allclose(np_x_edges.tolist(), ak_x_edges.to_list()) + assert np.allclose(np_y_edges.tolist(), ak_y_edges.to_list()) + # test arbitrary dimensional histogram dim_list = [3, 4, 5] bin_list = [[2, 4, 5], [2, 4, 5, 2], [2, 4, 5, 2, 3]] @@ -328,12 +340,21 @@ def test_histogram_multidim(self, num_type1, num_type2): if dim <= get_max_array_rank(): np_arrs = [np.random.randint(1, 100, 1000) for _ in range(dim)] ak_arrs = [ak.array(a) for a in np_arrs] + np_hist, np_bin_edges = np.histogramdd(np_arrs, bins=bins) ak_hist, ak_bin_edges = ak.histogramdd(ak_arrs, bins=bins) assert np.allclose(np_hist.tolist(), ak_hist.to_list()) for np_edge, ak_edge in zip(np_bin_edges, ak_bin_edges): assert np.allclose(np_edge.tolist(), ak_edge.to_list()) + # add 'range' + range_arg = [(10, 80) for _ in range(dim)] + np_hist, np_bin_edges = np.histogramdd(np_arrs, bins=bins, range=range_arg) + ak_hist, ak_bin_edges = ak.histogramdd(ak_arrs, bins=bins, range=range_arg) + assert np.allclose(np_hist.tolist(), ak_hist.to_list()) + for np_edge, ak_edge in zip(np_bin_edges, ak_bin_edges): + assert np.allclose(np_edge.tolist(), ak_edge.to_list()) + @pytest.mark.parametrize("num_type", NO_BOOL) @pytest.mark.parametrize("op", ["exp", "log", "expm1", "log2", "log10", "log1p"]) def test_log_and_exp(self, num_type, op):