From 59e5f366fe18c54f8d9e4f26742c02f6e7a9164a Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Wed, 7 Jan 2026 19:24:49 +0900 Subject: [PATCH 1/8] fix: rejecting inf as value --- tdigest/include/tdigest_impl.hpp | 5 +++ tdigest/test/tdigest_test.cpp | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index b8fab38d..75f2d9ee 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -37,6 +37,7 @@ tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::i template void tdigest::update(T value) { if (std::isnan(value)) return; + if (std::isinf(value)) return; if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress(); buffer_.push_back(value); min_ = std::min(min_, value); @@ -94,6 +95,7 @@ template double tdigest::get_rank(T value) const { if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN"); + if (std::isinf(value)) throw std::invalid_argument("operation is undefined for infinity"); if (value < min_) return 0; if (value > max_) return 1; // one centroid and value == min_ == max_ @@ -621,6 +623,9 @@ void tdigest::check_split_points(const T* values, uint32_t size) { if (std::isnan(values[i])) { throw std::invalid_argument("Values must not be NaN"); } + if (std::isinf(values[i])) { + throw std::invalid_argument("Values must not be infinity"); + } if ((i < (size - 1)) && !(values[i] < values[i + 1])) { throw std::invalid_argument("Values must be unique and monotonically increasing"); } diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 9f92094d..45c10822 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -470,4 +470,57 @@ TEST_CASE("iterate centroids", "[tdigest]") { REQUIRE(td.get_total_weight() == total_weight); } +TEST_CASE("update rejects positive infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(std::numeric_limits::infinity()); + REQUIRE(td.get_total_weight() == 2); + REQUIRE(td.get_max_value() == 2.0); +} + +TEST_CASE("update rejects negative infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(-std::numeric_limits::infinity()); + REQUIRE(td.get_total_weight() == 2); + REQUIRE(td.get_min_value() == 1.0); +} + +TEST_CASE("get_rank rejects positive infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + REQUIRE_THROWS_AS(td.get_rank(std::numeric_limits::infinity()), std::invalid_argument); +} + +TEST_CASE("get_rank rejects negative infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits::infinity()), std::invalid_argument); +} + +TEST_CASE("get_CDF rejects positive infinity in split points", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 100; ++i) td.update(i); + const double split_points[2] = {50.0, std::numeric_limits::infinity()}; + REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); +} + +TEST_CASE("get_CDF rejects negative infinity in split points", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 100; ++i) td.update(i); + const double split_points[2] = {-std::numeric_limits::infinity(), 50.0}; + REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); +} + +TEST_CASE("get_PMF rejects infinity in split points", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 100; ++i) td.update(i); + const double split_points[1] = {std::numeric_limits::infinity()}; + REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::invalid_argument); +} + } /* namespace datasketches */ From 588fd73c09b09740a0ebd493ef336a02fab2eb0f Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 00:40:31 +0900 Subject: [PATCH 2/8] fix: check invalid inputs on deserialization --- tdigest/include/tdigest.hpp | 2 + tdigest/include/tdigest_impl.hpp | 91 +++++++++++++++++++++++++++--- tdigest/test/tdigest_test.cpp | 97 ++++++++++++++++++++++++++++---- 3 files changed, 171 insertions(+), 19 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index cc7898e3..7d060ec1 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -108,6 +108,7 @@ class tdigest { /** * Update this t-Digest with the given value + * NaN and infinity values are ignored * @param value to update the t-Digest with */ void update(T value); @@ -153,6 +154,7 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. + *

NaN and infinity values throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 75f2d9ee..294dab88 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -29,6 +29,24 @@ namespace datasketches { +template +inline void check_not_nan(T value, const char* name) { + if (std::isnan(value)) { + std::ostringstream oss; + oss << name << " must not be NaN"; + throw std::invalid_argument(oss.str()); + } +} + +template +inline void check_not_infinite(T value, const char* name) { + if (std::isinf(value)) { + std::ostringstream oss; + oss << name << " must not be infinite"; + throw std::invalid_argument(oss.str()); + } +} + template tdigest::tdigest(uint16_t k, const A& allocator): tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::infinity(), vector_centroid(allocator), 0, vector_t(allocator)) @@ -402,6 +420,8 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE); if (is_single_value) { const T value = read(is); + check_not_nan(value, "single_value"); + check_not_infinite(value, "single_value"); return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator)); } @@ -410,12 +430,24 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { const T min = read(is); const T max = read(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid)); vector_t buffer(num_buffered, 0, allocator); if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T)); uint64_t weight = 0; - for (const auto& c: centroids) weight += c.get_weight(); + for (const auto& c: centroids) { + check_not_nan(c.get_mean(), "centroid mean"); + check_not_infinite(c.get_mean(), "centroid mean"); + weight += c.get_weight(); + } + for (const auto& value: buffer) { + check_not_nan(value, "buffered_value"); + check_not_infinite(value, "buffered_value"); + } return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer)); } @@ -453,6 +485,8 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A ensure_minimum_memory(end_ptr - ptr, sizeof(T)); T value; ptr += copy_from_mem(ptr, value); + check_not_nan(value, "single_value"); + check_not_infinite(value, "single_value"); return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator)); } @@ -467,12 +501,24 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A ptr += copy_from_mem(ptr, min); T max; ptr += copy_from_mem(ptr, max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid)); vector_t buffer(num_buffered, 0, allocator); if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T)); uint64_t weight = 0; - for (const auto& c: centroids) weight += c.get_weight(); + for (const auto& c: centroids) { + check_not_nan(c.get_mean(), "centroid mean"); + check_not_infinite(c.get_mean(), "centroid mean"); + weight += c.get_weight(); + } + for (const auto& value: buffer) { + check_not_nan(value, "buffered_value"); + check_not_infinite(value, "buffered_value"); + } return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer)); } @@ -489,13 +535,22 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc if (type == COMPAT_DOUBLE) { // compatibility with asBytes() const auto min = read_big_endian(is); const auto max = read_big_endian(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); const auto k = static_cast(read_big_endian(is)); const auto num_centroids = read_big_endian(is); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); uint64_t total_weight = 0; for (auto& c: centroids) { - const W weight = static_cast(read_big_endian(is)); + const auto weight_double = read_big_endian(is); + check_not_nan(weight_double, "centroid weight"); + check_not_infinite(weight_double, "centroid weight"); const auto mean = read_big_endian(is); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); + const W weight = static_cast(weight_double); c = centroid(mean, weight); total_weight += weight; } @@ -504,6 +559,10 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc // COMPAT_FLOAT: compatibility with asSmallBytes() const auto min = read_big_endian(is); // reference implementation uses doubles for min and max const auto max = read_big_endian(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); const auto k = static_cast(read_big_endian(is)); // reference implementation stores capacities of the array of centroids and the buffer as shorts // they can be derived from k in the constructor @@ -512,8 +571,13 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc vector_centroid centroids(num_centroids, centroid(0, 0), allocator); uint64_t total_weight = 0; for (auto& c: centroids) { - const W weight = static_cast(read_big_endian(is)); + const auto weight_float = read_big_endian(is); + check_not_nan(weight_float, "centroid weight"); + check_not_infinite(weight_float, "centroid weight"); const auto mean = read_big_endian(is); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); + const W weight = static_cast(weight_float); c = centroid(mean, weight); total_weight += weight; } @@ -540,6 +604,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double max; ptr += copy_from_mem(ptr, max); max = byteswap(max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); double k_double; ptr += copy_from_mem(ptr, k_double); const uint16_t k = static_cast(byteswap(k_double)); @@ -556,6 +624,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double mean; ptr += copy_from_mem(ptr, mean); mean = byteswap(mean); + check_not_nan(weight, "centroid weight"); + check_not_infinite(weight, "centroid weight"); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); c = centroid(mean, static_cast(weight)); total_weight += static_cast(weight); } @@ -569,6 +641,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double max; ptr += copy_from_mem(ptr, max); max = byteswap(max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); float k_float; ptr += copy_from_mem(ptr, k_float); const uint16_t k = static_cast(byteswap(k_float)); @@ -588,6 +664,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, float mean; ptr += copy_from_mem(ptr, mean); mean = byteswap(mean); + check_not_nan(weight, "centroid weight"); + check_not_infinite(weight, "centroid weight"); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); c = centroid(mean, static_cast(weight)); total_weight += static_cast(weight); } @@ -623,9 +703,6 @@ void tdigest::check_split_points(const T* values, uint32_t size) { if (std::isnan(values[i])) { throw std::invalid_argument("Values must not be NaN"); } - if (std::isinf(values[i])) { - throw std::invalid_argument("Values must not be infinity"); - } if ((i < (size - 1)) && !(values[i] < values[i + 1])) { throw std::invalid_argument("Values must be unique and monotonically increasing"); } diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 45c10822..0019b936 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -18,13 +18,35 @@ */ #include +#include #include #include +#include #include "tdigest.hpp" namespace datasketches { +namespace { +constexpr size_t kHeaderSize = 8; +constexpr size_t kCountsSize = 8; +constexpr size_t kMinOffset = kHeaderSize + kCountsSize; +constexpr size_t kMaxOffset = kMinOffset + sizeof(double); +constexpr size_t kFirstCentroidMeanOffset = kMinOffset + sizeof(double) * 2; +constexpr size_t kFirstBufferedValueOffset = kFirstCentroidMeanOffset; +constexpr size_t kSingleValueOffset = kHeaderSize; + +template +void write_bytes(std::vector& bytes, size_t offset, T value) { + std::memcpy(bytes.data() + offset, &value, sizeof(T)); +} + +template +void write_bytes(std::string& data, size_t offset, T value) { + std::memcpy(&data[offset], &value, sizeof(T)); +} +} // namespace + TEST_CASE("empty", "[tdigest]") { tdigest_double td(10); // std::cout << td.to_string(); @@ -502,25 +524,76 @@ TEST_CASE("get_rank rejects negative infinity", "[tdigest]") { REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits::infinity()), std::invalid_argument); } -TEST_CASE("get_CDF rejects positive infinity in split points", "[tdigest]") { +TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + auto bytes = td.serialize(); + write_bytes(bytes, kSingleValueOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(3.0); + auto bytes = td.serialize(); + std::string data(reinterpret_cast(bytes.data()), bytes.size()); + write_bytes(data, kMinOffset, std::numeric_limits::infinity()); + std::istringstream is(data, std::ios::binary); + REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + write_bytes(bytes, kFirstCentroidMeanOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") { tdigest_double td(100); - for (int i = 0; i < 100; ++i) td.update(i); - const double split_points[2] = {50.0, std::numeric_limits::infinity()}; - REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(0, true); + write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + auto bytes = td.serialize(); + write_bytes(bytes, kSingleValueOffset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(); + write_bytes(bytes, kMaxOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } -TEST_CASE("get_CDF rejects negative infinity in split points", "[tdigest]") { +TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") { tdigest_double td(100); - for (int i = 0; i < 100; ++i) td.update(i); - const double split_points[2] = {-std::numeric_limits::infinity(), 50.0}; - REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(); + write_bytes(bytes, kMaxOffset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } -TEST_CASE("get_PMF rejects infinity in split points", "[tdigest]") { +TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { tdigest_double td(100); - for (int i = 0; i < 100; ++i) td.update(i); - const double split_points[1] = {std::numeric_limits::infinity()}; - REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::invalid_argument); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(0, true); + write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } } /* namespace datasketches */ From b8489fd7327721fa4c1a16ff2a93565e7b077e5e Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 01:04:40 +0900 Subject: [PATCH 3/8] perf: remove ostringstream --- tdigest/include/tdigest_impl.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 294dab88..0be1a486 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -22,7 +22,6 @@ #include #include -#include #include "common_defs.hpp" #include "memory_operations.hpp" @@ -32,18 +31,14 @@ namespace datasketches { template inline void check_not_nan(T value, const char* name) { if (std::isnan(value)) { - std::ostringstream oss; - oss << name << " must not be NaN"; - throw std::invalid_argument(oss.str()); + throw std::invalid_argument(std::string(name) + " must not be NaN"); } } template inline void check_not_infinite(T value, const char* name) { if (std::isinf(value)) { - std::ostringstream oss; - oss << name << " must not be infinite"; - throw std::invalid_argument(oss.str()); + throw std::invalid_argument(std::string(name) + " must not be infinite"); } } From c680a81c9fd690971de14cff4da3116fb04903cf Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 02:08:48 +0900 Subject: [PATCH 4/8] style: follow local convention --- tdigest/test/tdigest_test.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 0019b936..fd0a71c1 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -28,13 +28,13 @@ namespace datasketches { namespace { -constexpr size_t kHeaderSize = 8; -constexpr size_t kCountsSize = 8; -constexpr size_t kMinOffset = kHeaderSize + kCountsSize; -constexpr size_t kMaxOffset = kMinOffset + sizeof(double); -constexpr size_t kFirstCentroidMeanOffset = kMinOffset + sizeof(double) * 2; -constexpr size_t kFirstBufferedValueOffset = kFirstCentroidMeanOffset; -constexpr size_t kSingleValueOffset = kHeaderSize; +constexpr size_t header_size = 8; +constexpr size_t counts_size = 8; +constexpr size_t min_offset = header_size + counts_size; +constexpr size_t max_offset = min_offset + sizeof(double); +constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2; +constexpr size_t first_buffered_value_offset = first_centroid_mean_offset; +constexpr size_t single_value_offset = header_size; template void write_bytes(std::vector& bytes, size_t offset, T value) { @@ -528,7 +528,7 @@ TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { tdigest_double td(100); td.update(1.0); auto bytes = td.serialize(); - write_bytes(bytes, kSingleValueOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, single_value_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -539,7 +539,7 @@ TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") { td.update(3.0); auto bytes = td.serialize(); std::string data(reinterpret_cast(bytes.data()), bytes.size()); - write_bytes(data, kMinOffset, std::numeric_limits::infinity()); + write_bytes(data, min_offset, std::numeric_limits::infinity()); std::istringstream is(data, std::ios::binary); REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); } @@ -548,7 +548,7 @@ TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") { tdigest_double td(100); for (int i = 0; i < 10; ++i) td.update(i); auto bytes = td.serialize(); - write_bytes(bytes, kFirstCentroidMeanOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, first_centroid_mean_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -557,7 +557,7 @@ TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(0, true); - write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, first_buffered_value_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -565,7 +565,7 @@ TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") { tdigest_double td(100); td.update(1.0); auto bytes = td.serialize(); - write_bytes(bytes, kSingleValueOffset, std::numeric_limits::infinity()); + write_bytes(bytes, single_value_offset, std::numeric_limits::infinity()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -574,7 +574,7 @@ TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(); - write_bytes(bytes, kMaxOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, max_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -583,7 +583,7 @@ TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(); - write_bytes(bytes, kMaxOffset, std::numeric_limits::infinity()); + write_bytes(bytes, max_offset, std::numeric_limits::infinity()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -592,7 +592,7 @@ TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(0, true); - write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::infinity()); + write_bytes(bytes, first_buffered_value_offset, std::numeric_limits::infinity()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } From 99d06bfd2b3c668911720ee3e2598a6fce7cc917 Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 02:10:05 +0900 Subject: [PATCH 5/8] fix: add missing dependency --- tdigest/include/tdigest_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 0be1a486..043c7bab 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -22,6 +22,7 @@ #include #include +#include #include "common_defs.hpp" #include "memory_operations.hpp" From 662aef37c3912b4b2c6cbf3cc6ab0dbf5d40a0df Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 13 Jan 2026 15:20:44 +0900 Subject: [PATCH 6/8] fix: allow inf for get_rank --- tdigest/include/tdigest.hpp | 1 - tdigest/include/tdigest_impl.hpp | 1 - tdigest/test/tdigest_test.cpp | 14 -------------- 3 files changed, 16 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 7d060ec1..7ce87dd1 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -154,7 +154,6 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. - *

NaN and infinity values throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 043c7bab..e6904f20 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -109,7 +109,6 @@ template double tdigest::get_rank(T value) const { if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN"); - if (std::isinf(value)) throw std::invalid_argument("operation is undefined for infinity"); if (value < min_) return 0; if (value > max_) return 1; // one centroid and value == min_ == max_ diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index fd0a71c1..8dd62132 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -510,20 +510,6 @@ TEST_CASE("update rejects negative infinity", "[tdigest]") { REQUIRE(td.get_min_value() == 1.0); } -TEST_CASE("get_rank rejects positive infinity", "[tdigest]") { - tdigest_double td(100); - td.update(1.0); - td.update(2.0); - REQUIRE_THROWS_AS(td.get_rank(std::numeric_limits::infinity()), std::invalid_argument); -} - -TEST_CASE("get_rank rejects negative infinity", "[tdigest]") { - tdigest_double td(100); - td.update(1.0); - td.update(2.0); - REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits::infinity()), std::invalid_argument); -} - TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { tdigest_double td(100); td.update(1.0); From bded7aa1eb09c13daa742ce901443388e1a8994a Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 13 Jan 2026 16:15:30 +0900 Subject: [PATCH 7/8] fix: check weight is zero --- tdigest/include/tdigest_impl.hpp | 15 +++++++++++++++ tdigest/test/tdigest_test.cpp | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index e6904f20..065e3ef1 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "common_defs.hpp" #include "memory_operations.hpp" @@ -43,6 +44,14 @@ inline void check_not_infinite(T value, const char* name) { } } +template +inline void check_non_zero(T value, const char* name) { + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (value == 0) { + throw std::invalid_argument(std::string(name) + " must not be zero"); + } +} + template tdigest::tdigest(uint16_t k, const A& allocator): tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::infinity(), vector_centroid(allocator), 0, vector_t(allocator)) @@ -437,6 +446,8 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { for (const auto& c: centroids) { check_not_nan(c.get_mean(), "centroid mean"); check_not_infinite(c.get_mean(), "centroid mean"); + check_non_zero(c.get_weight(), "centroid weight"); + weight += c.get_weight(); } for (const auto& value: buffer) { @@ -508,6 +519,8 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A for (const auto& c: centroids) { check_not_nan(c.get_mean(), "centroid mean"); check_not_infinite(c.get_mean(), "centroid mean"); + check_non_zero(c.get_weight(), "centroid weight"); + weight += c.get_weight(); } for (const auto& value: buffer) { @@ -542,6 +555,8 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc const auto weight_double = read_big_endian(is); check_not_nan(weight_double, "centroid weight"); check_not_infinite(weight_double, "centroid weight"); + check_non_zero(weight_double, "centroid weight"); + const auto mean = read_big_endian(is); check_not_nan(mean, "centroid mean"); check_not_infinite(mean, "centroid mean"); diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 8dd62132..07d6185f 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -33,6 +33,7 @@ constexpr size_t counts_size = 8; constexpr size_t min_offset = header_size + counts_size; constexpr size_t max_offset = min_offset + sizeof(double); constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2; +constexpr size_t first_centroid_weight_offset = first_centroid_mean_offset + sizeof(double); constexpr size_t first_buffered_value_offset = first_centroid_mean_offset; constexpr size_t single_value_offset = header_size; @@ -582,4 +583,22 @@ TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } +TEST_CASE("deserialize bytes rejects zero centroid weight", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + write_bytes(bytes, first_centroid_weight_offset, static_cast(0)); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize stream rejects zero centroid weight", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + std::string data(reinterpret_cast(bytes.data()), bytes.size()); + write_bytes(data, first_centroid_weight_offset, static_cast(0)); + std::istringstream is(data, std::ios::binary); + REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); +} + } /* namespace datasketches */ From 19798344aad67441f12f0e356ffc78b6fbd3078e Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 13 Jan 2026 16:22:12 +0900 Subject: [PATCH 8/8] doc: update throw NaN for get_rank --- tdigest/include/tdigest.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 7ce87dd1..2d3620b1 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -154,6 +154,7 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. + *

NaN value throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive)