diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile
index c27e9529bb60c..9bea8b6c5fdb9 100644
--- a/contrib/pg_stat_statements/Makefile
+++ b/contrib/pg_stat_statements/Makefile
@@ -3,7 +3,8 @@
 MODULE_big = pg_stat_statements
 OBJS = \
 	$(WIN32RES) \
-	pg_stat_statements.o
+	pg_stat_statements.o \
+	hll.o
 
 EXTENSION = pg_stat_statements
 DATA = pg_stat_statements--1.4.sql \
diff --git a/contrib/pg_stat_statements/hll.c b/contrib/pg_stat_statements/hll.c
new file mode 100644
index 0000000000000..bbaad09f5fbd6
--- /dev/null
+++ b/contrib/pg_stat_statements/hll.c
@@ -0,0 +1,201 @@
+/*-------------------------------------------------------------------------
+ *
+ * hll.c
+ *	  Sliding HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
+ *
+ * Implements https://hal.science/hal-00465313/document
+ *
+ * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
+ * suited to estimating the cardinality of very large sets;  in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ *	  contrib/pg_stat_statements/hll.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "funcapi.h"
+#include "port/pg_bitutils.h"
+#include "utils/timestamp.h"
+#include "hll.h"
+
+
+#define POW_2_32			(4294967296.0)
+#define NEG_POW_2_32		(-4294967296.0)
+
+#define ALPHA_MM	((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)
+
+/*
+ * Worker for addSHLL().
+ *
+ * Calculates the position of the first set bit in first b bits of x argument
+ * starting from the first, reading from most significant to least significant
+ * bits.
+ *
+ * Example (when considering first 10 bits of x):
+ *
+ * rho(x = 0b1000000000)	returns 1
+ * rho(x = 0b0010000000)	returns 3
+ * rho(x = 0b0000000000)	returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+	uint8		j;
+
+	if (x == 0)
+		return b + 1;
+
+	j = 32 - pg_leftmost_one_pos32(x);
+
+	if (j > b)
+		return b + 1;
+
+	return j;
+}
+
+/*
+ * Initialize HyperLogLog track state
+ */
+void
+initSHLL(HyperLogLogState *cState)
+{
+	memset(cState->regs, 0, sizeof(cState->regs));
+}
+
+/*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any().  The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging.  There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
+ */
+void
+addSHLL(HyperLogLogState *cState, uint32 hash)
+{
+	uint8		count;
+	uint32		index;
+	TimestampTz now = GetCurrentTimestamp();
+
+	/* Use the first "k" (registerWidth) bits as a zero based index */
+	index = hash >> HLL_C_BITS;
+
+	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1;
+	Assert(count <= HLL_C_BITS);
+	cState->regs[index][count] = now;
+}
+
+/*
+ * Returns the register value for the window starting at "since": one plus the
+ * highest rank slot whose timestamp is not older than "since", or 0 when no
+ * slot was touched inside the window.
+ */
+static uint8
+getMaximum(const TimestampTz *reg, TimestampTz since)
+{
+	/* Scan from the highest rank down so that we can stop at the first hit */
+	for (int i = HLL_C_BITS; i >= 0; i--)
+	{
+		if (reg[i] >= since)
+			return (uint8) (i + 1);
+	}
+
+	return 0;
+}
+
+/*
+ * Estimates cardinality of the elements added during the last "duration"
+ * seconds.  Passing (time_t) -1 covers everything added since initSHLL().
+ */
+double
+estimateSHLL(HyperLogLogState *cState, time_t duration)
+{
+	double		result;
+	double		sum = 0.0;
+	size_t		i;
+	uint8		R[HLL_N_REGISTERS];
+	TimestampTz since;
+
+	/*
+	 * A zero timestamp marks a slot that was never written, so to cover the
+	 * whole history the window starts at 1 rather than 0.
+	 */
+	if (duration == (time_t) -1)
+		since = 1;
+	else
+		since = GetCurrentTimestamp() - (int64) duration * USECS_PER_SEC;
+
+	for (i = 0; i < HLL_N_REGISTERS; i++)
+	{
+		R[i] = getMaximum(cState->regs[i], since);
+		sum += 1.0 / pow(2.0, R[i]);
+	}
+
+	/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+	result = ALPHA_MM / sum;
+
+	if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
+	{
+		/* Small range correction */
+		int			zero_count = 0;
+
+		for (i = 0; i < HLL_N_REGISTERS; i++)
+		{
+			zero_count += R[i] == 0;
+		}
+
+		if (zero_count != 0)
+			result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
+										   zero_count);
+	}
+	else if (result > (1.0 / 30.0) * POW_2_32)
+	{
+		/* Large range correction */
+		result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+	}
+
+	return result;
+}
diff --git a/contrib/pg_stat_statements/meson.build b/contrib/pg_stat_statements/meson.build
index 9d78cb88b7d78..e5c9195ac0374 100644
--- a/contrib/pg_stat_statements/meson.build
+++ b/contrib/pg_stat_statements/meson.build
@@ -2,6 +2,7 @@
 
 pg_stat_statements_sources = files(
   'pg_stat_statements.c',
+  'hll.c',
 )
 
 if host_system == 'windows'
diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql
index 2f0eaf14ec34d..63157e61bf948 100644
--- a/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql
+++ b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql
@@ -76,3 +76,10 @@ CREATE VIEW pg_stat_statements AS
   SELECT * FROM pg_stat_statements(true);
 
 GRANT SELECT ON pg_stat_statements TO PUBLIC;
+
+CREATE FUNCTION pg_bufferpool_working_set_size_pages(duration integer)
+RETURNS integer
+AS 'MODULE_PATHNAME', 'pg_bufferpool_working_set_size_pages'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+GRANT EXECUTE ON FUNCTION pg_bufferpool_working_set_size_pages(integer) TO PUBLIC;
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 4a427533bd88d..f396de9299f60 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -53,6 +53,7 @@
 #include "common/int.h"
 #include "executor/instrument.h"
 #include "funcapi.h"
+#include "hll.h"
 #include "jit/jit.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
@@ -71,6 +72,8 @@
 #include "utils/builtins.h"
 #include "utils/memutils.h"
 #include "utils/timestamp.h"
+#include "storage/bufmgr.h"
+#include "storage/shmem.h"
 
 PG_MODULE_MAGIC_EXT(
 					.name = "pg_stat_statements",
@@ -91,6 +94,10 @@ static const uint32 PGSS_FILE_HEADER = 0x20250731;
 /* PostgreSQL major version number, changes in which invalidate all entries */
 static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
 
+static HyperLogLogState *BufferPoolWss = NULL;
+
+static void orion_wss_add_hash(uint32 hash);
+
 /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
 #define USAGE_EXEC(duration)	(1.0)
 #define USAGE_INIT				(1.0)	/* including initial planning */
@@ -505,6 +512,9 @@ pgss_shmem_request(void)
 
 	RequestAddinShmemSpace(pgss_memsize());
 	RequestNamedLWLockTranche("pg_stat_statements", 1);
+
+	/* Request shared memory for buffer pool WSS HLL state */
+	RequestAddinShmemSpace(sizeof(HyperLogLogState));
 }
 
 /*
@@ -564,6 +574,20 @@ pgss_shmem_startup(void)
 							  &info,
 							  HASH_ELEM | HASH_BLOBS);
 
+	/* Initialize buffer pool working set size HLL state */
+	BufferPoolWss = (HyperLogLogState *)
+		ShmemInitStruct("Orion Buffer Pool WSS",
+						sizeof(HyperLogLogState),
+						&found);
+	if (!found)
+		initSHLL(BufferPoolWss);
+
+	/*
+	 * Register the WSS tracking hook only now that BufferPoolWss points at
+	 * initialized state, so the hook can never observe a NULL pointer.
+	 */
+	WssAddHashHook = orion_wss_add_hash;
+
 	LWLockRelease(AddinShmemInitLock);
 
 	/*
@@ -3076,3 +3100,49 @@ comp_location(const void *a, const void *b)
 
 	return pg_cmp_s32(l, r);
 }
+
+/*
+ * WSS hook function: add buffer tag hash to HLL estimator.
+ * Called from BufferAlloc() for every buffer allocation.
+ */
+static void
+orion_wss_add_hash(uint32 hash)
+{
+	/* Defensive: never dereference the state before shmem startup set it. */
+	if (BufferPoolWss != NULL)
+		addSHLL(BufferPoolWss, hash);
+}
+
+/*
+ * SQL function: pg_bufferpool_working_set_size_pages
+ *
+ * Returns the estimated number of unique buffer pages accessed
+ * in the last 'duration' seconds, or NULL when the estimator is not
+ * available or 'duration' is NULL.
+ */
+PG_FUNCTION_INFO_V1(pg_bufferpool_working_set_size_pages);
+
+Datum
+pg_bufferpool_working_set_size_pages(PG_FUNCTION_ARGS)
+{
+	double		estimate;
+	int32		result;
+
+	if (BufferPoolWss == NULL || PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	estimate = estimateSHLL(BufferPoolWss, (time_t) PG_GETARG_INT32(0));
+
+	/*
+	 * Clamp before converting: casting an out-of-range or NaN double to int32
+	 * is undefined behavior.  The negated comparison also catches NaN.
+	 */
+	if (!(estimate < (double) PG_INT32_MAX))
+		result = PG_INT32_MAX;
+	else if (estimate < 0.0)
+		result = 0;
+	else
+		result = (int32) estimate;
+
+	PG_RETURN_INT32(result);
+}
diff --git a/src/backend/lib/meson.build b/src/backend/lib/meson.build
index 8e38fb20f17ac..79a21ede6a4f2 100644
--- a/src/backend/lib/meson.build
+++ b/src/backend/lib/meson.build
@@ -9,5 +9,5 @@ backend_sources += files(
   'integerset.c',
   'knapsack.c',
   'pairingheap.c',
-  'rbtree.c',
+  'rbtree.c'
 )
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 0399265c4dda9..a2a7dad7d8a4c 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -759,7 +759,7 @@ round_off_mapping_sizes_for_hugepages(MemoryMappingSizes *mapping, int hugepages
 		return;
 
 	if (mapping->shmem_req_size % hugepagesize != 0)
-		mapping->shmem_req_size += add_size(mapping->shmem_req_size,
+		mapping->shmem_req_size = add_size(mapping->shmem_req_size,
 			hugepagesize - (mapping->shmem_req_size % hugepagesize));
 
 	if (mapping->shmem_reserved % hugepagesize != 0)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index d39e65ef811d0..5257387a94490 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -171,6 +171,13 @@ typedef struct SMgrSortArray
 	SMgrRelation srel;
 } SMgrSortArray;
 
+/*
+ * Hook for working set size tracking.
+ * Initially NULL - extensions set this to enable tracking.
+ */
+WssAddHashHook_type WssAddHashHook = NULL;
+
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -2124,6 +2131,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	newHash = BufTableHashCode(&newTag);
 	newPartitionLock = BufMappingPartitionLock(newHash);
 
+	/* Track buffer access for working set size estimation */
+	if (WssAddHashHook != NULL)
+		WssAddHashHook(newHash);
+
 	/* see if the block is in the buffer pool already */
 	LWLockAcquire(newPartitionLock, LW_SHARED);
 	existing_buf_id = BufTableLookup(&newTag, newHash);
diff --git a/src/include/hll.h b/src/include/hll.h
new file mode 100644
index 0000000000000..9256cb9afa2f0
--- /dev/null
+++ b/src/include/hll.h
@@ -0,0 +1,90 @@
+/*-------------------------------------------------------------------------
+ *
+ * hll.h
+ *	  Sliding HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
+ *
+ * Implements https://hal.science/hal-00465313/document
+ *
+ * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
+ * suited to estimating the cardinality of very large sets;  in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ *	  src/include/hll.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef HLL_H
+#define HLL_H
+
+#include <time.h>
+
+#include "datatype/timestamp.h"
+
+#define HLL_BIT_WIDTH		10
+#define HLL_C_BITS			(32 - HLL_BIT_WIDTH)
+#define HLL_N_REGISTERS		(1 << HLL_BIT_WIDTH)
+
+/*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set.  Importantly, it does this by using a fixed amount of
+ * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * Instead of a single counter for every bits register, we have a timestamp
+ * for every valid number of bits we can encounter.  Every time we encounter
+ * a certain number of bits, we update the timestamp in those registers to
+ * the current timestamp.
+ *
+ * We can query the sketch's stored cardinality for the range of some timestamp
+ * up to now: For each register, we return the highest bits bucket that has a
+ * modified timestamp >= the query timestamp.  This value is the number of bits
+ * for this register in the normal HLL calculation.
+ *
+ * The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184 KiB.
+ * Usage could be halved if we decide to reduce the required time dimension
+ * precision; as 32 bits in second precision should be enough for statistics.
+ * However, that is not yet implemented.
+ */
+typedef struct HyperLogLogState
+{
+	TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
+} HyperLogLogState;
+
+extern void initSHLL(HyperLogLogState *cState);
+extern void addSHLL(HyperLogLogState *cState, uint32 hash);
+extern double estimateSHLL(HyperLogLogState *cState, time_t duration);
+
+#endif							/* HLL_H */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 9ced25c77e366..d69050208b268 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -393,6 +393,14 @@ extern int GetAccessStrategyPinLimit(BufferAccessStrategy strategy);
 
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);
 
+/*
+ * Hook for working set size tracking.
+ * Called from BufferAlloc() with the hash of the buffer tag.
+ * Extensions can set this hook to track unique buffer pages accessed.
+ */
+typedef void (*WssAddHashHook_type)(uint32 hash);
+extern PGDLLIMPORT WssAddHashHook_type WssAddHashHook;
+
 
 /* inline functions */
 