From 71c1136989b363004357efb54c87b4192749a6a0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 29 Jan 2026 08:04:23 -0500 Subject: [PATCH 001/147] Fix mistakes in commit 4020b370f214315b8c10430301898ac21658143f cost_tidrangescan() was setting the disabled_nodes value correctly, and then immediately resetting it to zero, due to poor code editing on my part. materialize_finished_plan correctly set matpath.parent to NULL, but forgot to also set matpath.parallel_workers = 0, causing an access to uninitialized memory in cost_material. (This shouldn't result in any real problem, but it makes valgrind unhappy.) reparameterize_path was dereferencing a variable before verifying that it was not NULL. Reported-by: Tom Lane (issue #1) Reported-by: Michael Paquier (issue #1) Diagnosed-by: Lukas Fittl (issue #1) Reported-by: Zsolt Parragi (issue #2) Reported-by: Richard Guo (issue #3) Discussion: http://postgr.es/m/CAN4CZFPvwjNJEZ_JT9Y67yR7C=KMNa=LNefOB8ZY7TKDcmAXOA@mail.gmail.com Discussion: http://postgr.es/m/aXrnPgrq6Gggb5TG@paquier.xyz --- src/backend/optimizer/path/costsize.c | 1 - src/backend/optimizer/plan/createplan.c | 5 ++++- src/backend/optimizer/util/pathnode.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4da0b17f13..c30d6e8467 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1461,7 +1461,6 @@ cost_tidrangescan(Path *path, PlannerInfo *root, enable_mask |= PGS_CONSIDER_NONPARTIAL; path->disabled_nodes = (baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0; - path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index c26e841f53..e5200f4b3c 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6526,8 +6526,11 @@ materialize_finished_plan(Plan *subplan) subplan->startup_cost -= initplan_cost; subplan->total_cost -= initplan_cost; - /* Set cost data */ + /* Clear fields that cost_material() will consult */ + matpath.parallel_workers = 0; matpath.parent = NULL; + + /* Set cost data */ cost_material(&matpath, enable_material, subplan->disabled_nodes, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 7295438ad2..7b6c5d51e5 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3971,10 +3971,10 @@ reparameterize_path(PlannerInfo *root, Path *path, spath = reparameterize_path(root, spath, required_outer, loop_count); - enabled = - (mpath->path.disabled_nodes <= spath->disabled_nodes); if (spath == NULL) return NULL; + enabled = + (mpath->path.disabled_nodes <= spath->disabled_nodes); return (Path *) create_material_path(rel, spath, enabled); } case T_Memoize: From ec317440716487753bafa4c0f8adae53e2c32446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Thu, 29 Jan 2026 18:37:09 +0100 Subject: [PATCH 002/147] Replace literal 0 with InvalidXLogRecPtr for XLogRecPtr assignments Use the proper constant InvalidXLogRecPtr instead of literal 0 when assigning XLogRecPtr variables and struct fields. This improves code clarity by making it explicit that these are invalid LSN values rather than ambiguous zero literals.
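For illustration (not part of the patch): InvalidXLogRecPtr is defined as 0 in
access/xlogdefs.h, so the generated code is identical; the gain is that
assignments and validity tests now read symmetrically:

    XLogRecPtr  lsn = InvalidXLogRecPtr;    /* explicitly "no WAL position yet" */

    if (XLogRecPtrIsInvalid(lsn))           /* the matching test macro */
        elog(DEBUG1, "LSN not assigned yet");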
Author: Bertrand Drouvot Discussion: https://postgr.es/m/aRtd2dw8FO1nNX7k@ip-10-97-1-34.eu-west-3.compute.internal --- src/backend/access/gist/gist.c | 4 ++-- src/backend/access/transam/parallel.c | 4 ++-- src/backend/access/transam/xlog.c | 6 +++--- src/backend/access/transam/xlogprefetcher.c | 2 +- src/backend/access/transam/xlogrecovery.c | 6 +++--- src/backend/replication/syncrep.c | 4 ++-- src/backend/replication/walreceiver.c | 4 ++-- src/backend/storage/lmgr/proc.c | 2 +- src/bin/pg_resetwal/pg_resetwal.c | 8 ++++---- src/bin/pg_rewind/pg_rewind.c | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index d5944205db..dfffce3e39 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -291,7 +291,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, SplitPageLayout *dist = NULL, *ptr; BlockNumber oldrlink = InvalidBlockNumber; - GistNSN oldnsn = 0; + GistNSN oldnsn = InvalidXLogRecPtr; SplitPageLayout rootpg; bool is_rootsplit; int npage; @@ -654,7 +654,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; - firststack.lsn = 0; + firststack.lsn = InvalidXLogRecPtr; firststack.retry_from_parent = false; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 01a89104ef..fe00488487 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -357,7 +357,7 @@ InitializeParallelDSM(ParallelContext *pcxt) fps->stmt_ts = GetCurrentStatementStartTimestamp(); fps->serializable_xact_handle = ShareSerializableXact(); SpinLockInit(&fps->mutex); - fps->last_xlog_end = 0; + fps->last_xlog_end = InvalidXLogRecPtr; shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps); /* We can skip the rest of this if we're not budgeting for any workers. */ @@ -530,7 +530,7 @@ ReinitializeParallelDSM(ParallelContext *pcxt) /* Reset a few bits of fixed parallel state to a clean state. */ fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); - fps->last_xlog_end = 0; + fps->last_xlog_end = InvalidXLogRecPtr; /* Recreate error queues (if they exist). 
*/ if (pcxt->nworkers > 0) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 16614e152d..13ec6225b8 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2060,7 +2060,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) /* Have to write it ourselves */ TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); WriteRqst.Write = OldPageRqstPtr; - WriteRqst.Flush = 0; + WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; @@ -3077,7 +3077,7 @@ XLogBackgroundFlush(void) else { /* no flushing, this time round */ - WriteRqst.Flush = 0; + WriteRqst.Flush = InvalidXLogRecPtr; } #ifdef WAL_DEBUG @@ -5207,7 +5207,7 @@ BootStrapXLOG(uint32 data_checksum_version) /* Insert the initial checkpoint record */ recptr = ((char *) page + SizeOfXLogLongPHD); record = (XLogRecord *) recptr; - record->xl_prev = 0; + record->xl_prev = InvalidXLogRecPtr; record->xl_xid = InvalidTransactionId; record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 3c3f067aaf..24cfa96d73 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -967,7 +967,7 @@ XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr) /* Book-keeping to avoid readahead on first read. */ prefetcher->begin_ptr = recPtr; - prefetcher->no_readahead_until = 0; + prefetcher->no_readahead_until = InvalidXLogRecPtr; /* This will forget about any queued up records in the decoder. */ XLogBeginRead(prefetcher->reader, recPtr); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index a81dcbb5d7..4fc37a031d 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -261,7 +261,7 @@ static TimestampTz XLogReceiptTime = 0; static XLogSource XLogReceiptSource = XLOG_FROM_ANY; /* Local copy of WalRcv->flushedUpto */ -static XLogRecPtr flushedUpto = 0; +static XLogRecPtr flushedUpto = InvalidXLogRecPtr; static TimeLineID receiveTLI = 0; /* @@ -3918,7 +3918,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, RequestXLogStreaming(tli, ptr, PrimaryConnInfo, PrimarySlotName, wal_receiver_create_temp_slot); - flushedUpto = 0; + flushedUpto = InvalidXLogRecPtr; } /* @@ -4096,7 +4096,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) { - static XLogRecPtr lastComplaint = 0; + static XLogRecPtr lastComplaint = InvalidXLogRecPtr; if (readSource == XLOG_FROM_PG_WAL && emode == LOG) { diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index e7bee77753..7ea6001e9a 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -355,7 +355,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) pg_read_barrier(); Assert(dlist_node_is_detached(&MyProc->syncRepLinks)); MyProc->syncRepState = SYNC_REP_NOT_WAITING; - MyProc->waitLSN = 0; + MyProc->waitLSN = InvalidXLogRecPtr; /* reset ps display to remove the suffix */ if (update_process_title) @@ -1027,7 +1027,7 @@ SyncRepQueueIsOrderedByLSN(int mode) Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE); - lastLSN = 0; + lastLSN = InvalidXLogRecPtr; 
dlist_foreach(iter, &WalSndCtl->SyncRepQueue[mode]) { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 6970af3f3f..8b99160ed0 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1122,8 +1122,8 @@ XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli) static void XLogWalRcvSendReply(bool force, bool requestReply) { - static XLogRecPtr writePtr = 0; - static XLogRecPtr flushPtr = 0; + static XLogRecPtr writePtr = InvalidXLogRecPtr; + static XLogRecPtr flushPtr = InvalidXLogRecPtr; XLogRecPtr applyPtr; TimestampTz now; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 063826ae57..696bbb7b91 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -509,7 +509,7 @@ InitProcess(void) MyProc->recoveryConflictPending = false; /* Initialize fields for sync rep */ - MyProc->waitLSN = 0; + MyProc->waitLSN = InvalidXLogRecPtr; MyProc->syncRepState = SYNC_REP_NOT_WAITING; dlist_node_init(&MyProc->syncRepLinks); diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index b2c4b9db39..431b83a67d 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -913,10 +913,10 @@ RewriteControlFile(void) ControlFile.state = DB_SHUTDOWNED; ControlFile.checkPoint = ControlFile.checkPointCopy.redo; - ControlFile.minRecoveryPoint = 0; + ControlFile.minRecoveryPoint = InvalidXLogRecPtr; ControlFile.minRecoveryPointTLI = 0; - ControlFile.backupStartPoint = 0; - ControlFile.backupEndPoint = 0; + ControlFile.backupStartPoint = InvalidXLogRecPtr; + ControlFile.backupEndPoint = InvalidXLogRecPtr; ControlFile.backupEndRequired = false; /* @@ -1147,7 +1147,7 @@ WriteEmptyXLOG(void) /* Insert the initial checkpoint record */ recptr = (char *) page + SizeOfXLogLongPHD; record = (XLogRecord *) recptr; - record->xl_prev = 0; + record->xl_prev = InvalidXLogRecPtr; record->xl_xid = InvalidTransactionId; record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint); record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 31693843b3..d0aafd7e7a 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -377,7 +377,7 @@ main(int argc, char **argv) { pg_log_info("source and target cluster are on the same timeline"); rewind_needed = false; - target_wal_endrec = 0; + target_wal_endrec = InvalidXLogRecPtr; } else { From de90bb7db1f8c7ab1289f82f877a6e18f7b3d468 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Thu, 29 Jan 2026 10:14:55 -0800 Subject: [PATCH 003/147] Fix theoretical memory leaks in pg_locale_libc.c. The leaks were hard to reach in practice and the impact was low. The callers provide a buffer of the same number of bytes as the source string (plus one for the NUL terminator) as a starting size, and libc never increases the number of characters. But if the byte length of one of the converted characters is larger, then it might need a larger destination buffer. Previously, in that case, the working buffers would be leaked. Even in that case, the call typically happens within a context that will soon be reset. Regardless, it's worth fixing to avoid such assumptions, and the fix is simple so it's worth backporting.
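As a hedged sketch of the sizing protocol described above (variable names are
illustrative and the caller-side retry loop is inferred from this description;
the real entry points and their exact signatures live in pg_locale_libc.c):

    Size        dstsize = srclen + 1;   /* starting guess: srclen + NUL */
    char       *dst = palloc(dstsize);
    size_t      needed;

    needed = strlower_libc_mb(dst, dstsize, src, srclen, locale);
    if (needed + 1 > dstsize)
    {
        /* rare: some converted character grew in byte length */
        dstsize = needed + 1;
        dst = repalloc(dst, dstsize);
        (void) strlower_libc_mb(dst, dstsize, src, srclen, locale);
    }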
Discussion: https://postgr.es/m/e2b7a0a88aaadded7e2d19f42d5ab03c9e182ad8.camel@j-davis.com Backpatch-through: 18 --- src/backend/utils/adt/pg_locale_libc.c | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 2f96e88959..78f6ea161a 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -527,11 +527,11 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, result_size = wchar2char(result, workspace, max_size + 1, loc); - if (result_size + 1 > destsize) - return result_size; - - memcpy(dest, result, result_size); - dest[result_size] = '\0'; + if (destsize >= result_size + 1) + { + memcpy(dest, result, result_size); + dest[result_size] = '\0'; + } pfree(workspace); pfree(result); @@ -638,11 +638,11 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, result_size = wchar2char(result, workspace, max_size + 1, loc); - if (result_size + 1 > destsize) - return result_size; - - memcpy(dest, result, result_size); - dest[result_size] = '\0'; + if (destsize >= result_size + 1) + { + memcpy(dest, result, result_size); + dest[result_size] = '\0'; + } pfree(workspace); pfree(result); @@ -725,11 +725,11 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, result_size = wchar2char(result, workspace, max_size + 1, loc); - if (result_size + 1 > destsize) - return result_size; - - memcpy(dest, result, result_size); - dest[result_size] = '\0'; + if (destsize >= result_size + 1) + { + memcpy(dest, result, result_size); + dest[result_size] = '\0'; + } pfree(workspace); pfree(result); From bd9dfac8b121e67d7dd4a9bfecb1474fe6811927 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 29 Jan 2026 16:16:36 -0500 Subject: [PATCH 004/147] Further fix extended alignment for older g++. Commit 6ceef9408 was still one brick shy of a load, because it caused any usage at all of PGIOAlignedBlock or PGAlignedXLogBlock to fail under older g++. Notably, this broke "headerscheck --cplusplus". We can permit references to these structs as abstract structs though; only actual declaration of such a variable needs to be forbidden. Discussion: https://www.postgresql.org/message-id/3119480.1769189606@sss.pgh.pa.us --- src/include/c.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/include/c.h b/src/include/c.h index 48e4087c09..c443e75b89 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1140,6 +1140,12 @@ typedef struct PGAlignedXLogBlock alignas(PG_IO_ALIGN_SIZE) char data[XLOG_BLCKSZ]; } PGAlignedXLogBlock; +#else /* (g++ < 9) */ + +/* Allow these types to be used as abstract types when using old g++ */ +typedef struct PGIOAlignedBlock PGIOAlignedBlock; +typedef struct PGAlignedXLogBlock PGAlignedXLogBlock; + #endif /* !(g++ < 9) */ /* msb for char */ From 333f586372aae764b7ad7e2c975b14fd431ce819 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 29 Jan 2026 16:49:01 -0500 Subject: [PATCH 005/147] bufmgr: Allow conditional locking of an already locked buffer In fcb9c977aa5 I included an assertion in BufferLockConditional() to detect if a conditional lock acquisition is done on a buffer that we already have locked. The assertion was added in the course of adding other assertions. Unfortunately I failed to realize that some of our code relies on such lock acquisitions to silently fail. E.g.
spgist and nbtree may try to conditionally lock an already locked buffer when acquiring an empty buffer. LWLockConditionalAcquire(), which was previously used to implement ConditionalLockBuffer(), does not have such an assert. Instead of just removing the assert and relying on the lock acquisition to fail because the buffer is already locked, this commit changes the behaviour of conditional content lock acquisition to fail if the current backend has any pre-existing lock on the buffer, even if the lock modes would not conflict. The reason for that is that we currently do not have space to track multiple lock acquisitions on a single buffer. Allowing multiple locks on the same buffer by a backend also seems likely to lead to bugs. There is only one non-self-exclusive conditional content lock acquisition, in GetVictimBuffer(), but it is only used if the target buffer is not pinned and thus can't already be locked by the current backend. Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/90bd2cbb-49ce-4092-9f61-5ac2ab782c94@gmail.com --- src/backend/storage/buffer/bufmgr.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6f935648ae..7241477cac 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -5895,6 +5895,13 @@ BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr) /* * Acquire the content lock for the buffer, but only if we don't have to wait. + * + * It is allowed to try to conditionally acquire a lock on a buffer that this + * backend has already locked, but the lock acquisition will always fail, even + * if the new lock acquisition does not conflict with an already held lock + * (e.g. two share locks). This is because we currently do not have space to + * track multiple lock ownerships of the same buffer within one backend. That + * is ok for the current uses of BufferLockConditional(). */ static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) @@ -5903,9 +5910,12 @@ BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) bool mustwait; /* - * We better not already hold a lock on the buffer. + * As described above, if we're trying to lock a buffer this backend + * already has locked, return false, independent of the existing and + * desired lock level. */ - Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK); + if (entry->data.lockmode != BUFFER_LOCK_UNLOCK) + return false; /* * Lock out cancel/die interrupts until we exit the code section protected From 87f7b824f20c1c06884ef0711b4d32dbf4461436 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 29 Jan 2026 17:27:23 -0500 Subject: [PATCH 006/147] tableam: Perform CheckXidAlive check once per scan Previously, the CheckXidAlive check was performed within the table_scan*next* functions. This caused the check to be executed for every fetched tuple, an unnecessary overhead. To fix, move the check to table_beginscan* so it is performed once per scan rather than once per row. Note: table_tuple_fetch_row_version() does not use a scan descriptor; therefore, the CheckXidAlive check is retained in that function. The overhead is unlikely to be relevant for the existing callers.
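To illustrate the effect (a hedged sketch using the standard tableam entry
points; rel, snapshot, and slot stand in for caller state):

    TableScanDesc scan;

    scan = table_beginscan(rel, snapshot, 0, NULL); /* CheckXidAlive tested here */
    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* the per-tuple path no longer re-tests CheckXidAlive */
    }
    table_endscan(scan);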
Reported-by: Andres Freund Author: Dilip Kumar Suggested-by: Andres Freund Suggested-by: Amit Kapila Reviewed-by: Andres Freund Discussion: https://www.postgresql.org/message-id/tlpltqm5jjwj7mp66dtebwwhppe4ri36vdypux2zoczrc2i3mp%40dhv4v4nikyfg --- src/backend/access/heap/heapam.c | 10 ---- src/backend/access/index/genam.c | 30 +++++------ src/backend/access/table/tableam.c | 20 +++----- src/include/access/tableam.h | 82 +++++++++++++----------------- 4 files changed, 57 insertions(+), 85 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f30a56ecf5..ae31efe8c2 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1421,16 +1421,6 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg_internal("only heap AM is supported"))); - /* - * We don't expect direct calls to heap_getnext with valid CheckXidAlive - * for catalog or regular tables. See detailed comments in xact.c where - * these variables are declared. Normally we have such a check at tableam - * level API but this is called from many places so we need to ensure it - * here. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected heap_getnext call during logical decoding"); - /* Note: no locking manipulations needed */ if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index a29be6f467..5e89b86a62 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -420,6 +420,14 @@ systable_beginscan(Relation heapRelation, sysscan->snapshot = NULL; } + /* + * If CheckXidAlive is set then set a flag to indicate that system table + * scan is in-progress. See detailed comments in xact.c where these + * variables are declared. + */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = true; + if (irel) { int i; @@ -468,14 +476,6 @@ systable_beginscan(Relation heapRelation, sysscan->iscan = NULL; } - /* - * If CheckXidAlive is set then set a flag to indicate that system table - * scan is in-progress. See detailed comments in xact.c where these - * variables are declared. - */ - if (TransactionIdIsValid(CheckXidAlive)) - bsysscan = true; - return sysscan; } @@ -707,13 +707,6 @@ systable_beginscan_ordered(Relation heapRelation, elog(ERROR, "column is not in index"); } - sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, NULL, nkeys, 0); - index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); - sysscan->scan = NULL; - - pfree(idxkey); - /* * If CheckXidAlive is set then set a flag to indicate that system table * scan is in-progress. 
See detailed comments in xact.c where these @@ -722,6 +715,13 @@ systable_beginscan_ordered, if (TransactionIdIsValid(CheckXidAlive)) bsysscan = true; + sysscan->iscan = index_beginscan(heapRelation, indexRelation, + snapshot, NULL, nkeys, 0); + index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); + sysscan->scan = NULL; + + pfree(idxkey); + return sysscan; } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 8749179652..dfda1af412 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -117,8 +117,8 @@ table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key) Oid relid = RelationGetRelid(relation); Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); - return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key, - NULL, flags); + return table_beginscan_common(relation, snapshot, nkeys, key, + NULL, flags); } @@ -184,8 +184,8 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) snapshot = SnapshotAny; } - return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, - pscan, flags); + return table_beginscan_common(relation, snapshot, 0, NULL, + pscan, flags); } TableScanDesc @@ -214,8 +214,8 @@ table_beginscan_parallel_tidrange(Relation relation, snapshot = SnapshotAny; } - sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, - pscan, flags); + sscan = table_beginscan_common(relation, snapshot, 0, NULL, + pscan, flags); return sscan; } @@ -269,14 +269,6 @@ table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid) Relation rel = scan->rs_rd; const TableAmRoutine *tableam = rel->rd_tableam; - /* - * We don't expect direct calls to table_tuple_get_latest_tid with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding"); - /* * Since this can be called with user-supplied TID, don't trust the input * too much. diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index e2ec5289d4..7260b7b3d5 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -868,6 +868,27 @@ extern TupleTableSlot *table_slot_create(Relation relation, List **reglist); * ---------------------------------------------------------------------------- */ +/* + * A wrapper around the Table Access Method scan_begin callback, to centralize + * error checking. All calls to ->scan_begin() should go through this + * function. + */ +static inline TableScanDesc +table_beginscan_common(Relation rel, Snapshot snapshot, int nkeys, + ScanKeyData *key, ParallelTableScanDesc pscan, + uint32 flags) +{ + /* + * We don't allow scans to be started while CheckXidAlive is set, except + * via systable_beginscan() et al. See detailed comments in xact.c where + * these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "scan started during logical decoding"); + + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, pscan, flags); +} + /* * Start a scan of `rel`. Returned tuples pass a visibility test of * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys.
@@ -879,7 +900,7 @@ table_beginscan(Relation rel, Snapshot snapshot, uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; - return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } /* @@ -908,7 +929,7 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, if (allow_sync) flags |= SO_ALLOW_SYNC; - return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } /* @@ -923,8 +944,7 @@ table_beginscan_bm(Relation rel, Snapshot snapshot, { uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; - return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, - NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } /* @@ -949,7 +969,7 @@ table_beginscan_sampling(Relation rel, Snapshot snapshot, if (allow_pagemode) flags |= SO_ALLOW_PAGEMODE; - return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); + return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } /* @@ -962,7 +982,7 @@ table_beginscan_tid(Relation rel, Snapshot snapshot) { uint32 flags = SO_TYPE_TIDSCAN; - return rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags); + return table_beginscan_common(rel, snapshot, 0, NULL, NULL, flags); } /* @@ -975,7 +995,7 @@ table_beginscan_analyze(Relation rel) { uint32 flags = SO_TYPE_ANALYZE; - return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags); + return table_beginscan_common(rel, NULL, 0, NULL, NULL, flags); } /* @@ -1025,14 +1045,6 @@ table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableS Assert(direction == ForwardScanDirection || direction == BackwardScanDirection); - /* - * We don't expect direct calls to table_scan_getnextslot with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding"); - return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot); } @@ -1053,7 +1065,7 @@ table_beginscan_tidrange(Relation rel, Snapshot snapshot, TableScanDesc sscan; uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE; - sscan = rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags); + sscan = table_beginscan_common(rel, snapshot, 0, NULL, NULL, flags); /* Set the range of TIDs to scan */ sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid); @@ -1166,6 +1178,14 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) static inline IndexFetchTableData * table_index_fetch_begin(Relation rel) { + /* + * We don't allow scans to be started while CheckXidAlive is set, except + * via systable_beginscan() et al. See detailed comments in xact.c where + * these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "scan started during logical decoding"); + return rel->rd_tableam->index_fetch_begin(rel); } @@ -1219,14 +1239,6 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, TupleTableSlot *slot, bool *call_again, bool *all_dead) { - /* - * We don't expect direct calls to table_index_fetch_tuple with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. 
- */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); - return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, slot, call_again, all_dead); @@ -1947,14 +1959,6 @@ table_scan_bitmap_next_tuple(TableScanDesc scan, uint64 *lossy_pages, uint64 *exact_pages) { - /* - * We don't expect direct calls to table_scan_bitmap_next_tuple with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); - return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, slot, recheck, @@ -1975,13 +1979,6 @@ static inline bool table_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) { - /* - * We don't expect direct calls to table_scan_sample_next_block with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding"); return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate); } @@ -1998,13 +1995,6 @@ table_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, TupleTableSlot *slot) { - /* - * We don't expect direct calls to table_scan_sample_next_tuple with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding"); return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, slot); } From bb26a81ee28c9d9c64e6f233fafa2792768ece1b Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 30 Jan 2026 09:05:35 +0900 Subject: [PATCH 007/147] Remove unused argument from ApplyLogicalMappingFile(). Author: Yugo Nagata Reviewed-by: Hayato Kuroda Discussion: https://postgr.es/m/20260128120056.b2a3e8184712ab5a537879eb@sraoss.co.jp --- src/backend/replication/logical/reorderbuffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index d84fa120b9..2d2a6d5e9e 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -5361,7 +5361,7 @@ DisplayMapping(HTAB *tuplecid_data) * transaction c) applied in LSN order. 
*/ static void -ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname) { char path[MAXPGPATH]; int fd; @@ -5544,7 +5544,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname, snapshot->subxip[0]); - ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + ApplyLogicalMappingFile(tuplecid_data, f->fname); pfree(f); } } From 1eb09ed63a8d8063dc6bb75c8f31ec564bf35250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Fri, 30 Jan 2026 10:11:04 +0100 Subject: [PATCH 008/147] Use C99 designated initializers in a couple of places MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the arrays somewhat easier to read. Author: Álvaro Herrera Reviewed-by: Peter Eisentraut Reviewed-by: Melanie Plageman Reviewed-by: Jelte Fennema-Nio Discussion: https://postgr.es/m/202601281204.sdxbr5qvpunk@alvherre.pgsql --- src/backend/access/heap/heapam.c | 47 ++++++++++++++++--------------- src/backend/postmaster/bgworker.c | 18 ++++++++---- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ae31efe8c2..3004964ab7 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -111,11 +111,11 @@ static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool ke /* - * Each tuple lock mode has a corresponding heavyweight lock, and one or two - * corresponding MultiXactStatuses (one to merely lock tuples, another one to - * update them). This table (and the macros below) helps us determine the - * heavyweight lock mode and MultiXactStatus values to use for any particular - * tuple lock strength. + * This table lists the heavyweight lock mode that corresponds to each tuple + * lock mode, as well as one or two corresponding MultiXactStatus values: + * .lockstatus to merely lock tuples, and .updstatus to update them. The + * latter is set to -1 if the corresponding tuple lock mode does not allow + * updating tuples -- see get_mxact_status_for_lock(). * * These interact with InplaceUpdateTupleLock, an alias for ExclusiveLock.
* @@ -127,29 +127,30 @@ static const struct LOCKMODE hwlock; int lockstatus; int updstatus; -} +} tupleLockExtraInfo[] = - tupleLockExtraInfo[MaxLockTupleMode + 1] = { - { /* LockTupleKeyShare */ - AccessShareLock, - MultiXactStatusForKeyShare, - -1 /* KeyShare does not allow updating tuples */ + [LockTupleKeyShare] = { + .hwlock = AccessShareLock, + .lockstatus = MultiXactStatusForKeyShare, + /* KeyShare does not allow updating tuples */ + .updstatus = -1 }, - { /* LockTupleShare */ - RowShareLock, - MultiXactStatusForShare, - -1 /* Share does not allow updating tuples */ + [LockTupleShare] = { + .hwlock = RowShareLock, + .lockstatus = MultiXactStatusForShare, + /* Share does not allow updating tuples */ + .updstatus = -1 }, - { /* LockTupleNoKeyExclusive */ - ExclusiveLock, - MultiXactStatusForNoKeyUpdate, - MultiXactStatusNoKeyUpdate + [LockTupleNoKeyExclusive] = { + .hwlock = ExclusiveLock, + .lockstatus = MultiXactStatusForNoKeyUpdate, + .updstatus = MultiXactStatusNoKeyUpdate }, - { /* LockTupleExclusive */ - AccessExclusiveLock, - MultiXactStatusForUpdate, - MultiXactStatusUpdate + [LockTupleExclusive] = { + .hwlock = AccessExclusiveLock, + .lockstatus = MultiXactStatusForUpdate, + .updstatus = MultiXactStatusUpdate } }; diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 65deabe91a..5187448175 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -120,22 +120,28 @@ static const struct { { - "ParallelWorkerMain", ParallelWorkerMain + .fn_name = "ParallelWorkerMain", + .fn_addr = ParallelWorkerMain }, { - "ApplyLauncherMain", ApplyLauncherMain + .fn_name = "ApplyLauncherMain", + .fn_addr = ApplyLauncherMain }, { - "ApplyWorkerMain", ApplyWorkerMain + .fn_name = "ApplyWorkerMain", + .fn_addr = ApplyWorkerMain }, { - "ParallelApplyWorkerMain", ParallelApplyWorkerMain + .fn_name = "ParallelApplyWorkerMain", + .fn_addr = ParallelApplyWorkerMain }, { - "TableSyncWorkerMain", TableSyncWorkerMain + .fn_name = "TableSyncWorkerMain", + .fn_addr = TableSyncWorkerMain }, { - "SequenceSyncWorkerMain", SequenceSyncWorkerMain + .fn_name = "SequenceSyncWorkerMain", + .fn_addr = SequenceSyncWorkerMain } }; From e76221bd95f0428cc9b5872a4bcbf0b7e40b77b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Fri, 30 Jan 2026 14:26:02 +0100 Subject: [PATCH 009/147] Minor cosmetic tweaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These changes should have been done by 2f9661311b83, but were overlooked. I noticed while reviewing the code for commit b8926a5b4bb8. Author: Álvaro Herrera Discussion: https://postgr.es/m/18984-0f4778a6599ac3ae@postgresql.org --- src/backend/tcop/postgres.c | 2 +- src/backend/utils/mmgr/portalmem.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index e54bf1e760..b4a8d2f3a1 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1117,7 +1117,7 @@ exec_simple_query(const char *query_string) /* * Get the command name for use in status display (it also becomes the - * default completion tag, down inside PortalRun). Set ps_status and + * default completion tag, in PortalDefineQuery). Set ps_status and * do any special start-of-SQL-command processing needed by the * destination. 
*/ diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 4fa4d43202..c1a53e658c 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -294,9 +294,8 @@ PortalDefineQuery(Portal portal, portal->prepStmtName = prepStmtName; portal->sourceText = sourceText; - portal->qc.commandTag = commandTag; - portal->qc.nprocessed = 0; portal->commandTag = commandTag; + SetQueryCompletion(&portal->qc, commandTag, 0); portal->stmts = stmts; portal->cplan = cplan; portal->status = PORTAL_DEFINED; From e2362eb2bd1459319dacaeaa5dc886dbca546b96 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 30 Jan 2026 18:22:56 +0200 Subject: [PATCH 010/147] Move shmem allocator's fields from PGShmemHeader to its own struct For readability. It was a slight modularity violation to have fields in PGShmemHeader that were only used by the allocator code in shmem.c. And it was inconsistent that ShmemLock was nevertheless not stored there. Moving all the allocator-related fields to a separate struct makes it more consistent and modular, and removes the need to allocate and pass ShmemLock separately via BackendParameters. Merge InitShmemAccess() and InitShmemAllocation() into a single function that initializes the struct when called from postmaster, and when called from backends in EXEC_BACKEND mode, re-establishes the global variables. That's similar to all the *ShmemInit() functions that we have. Co-authored-by: Ashutosh Bapat Discussion: https://www.postgresql.org/message-id/CAExHW5uNRB9oT4pdo54qAo025MXFX4MfYrD9K15OCqe-ExnNvg@mail.gmail.com --- src/backend/port/sysv_shmem.c | 2 +- src/backend/port/win32_shmem.c | 2 +- src/backend/postmaster/launch_backend.c | 7 +- src/backend/storage/ipc/ipci.c | 4 +- src/backend/storage/ipc/shmem.c | 163 +++++++++++------------- src/include/storage/pg_shmem.h | 4 +- src/include/storage/shmem.h | 3 +- src/tools/pgindent/typedefs.list | 1 + 8 files changed, 83 insertions(+), 103 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 3cd3544fa2..2e3886cf9f 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -855,7 +855,7 @@ PGSharedMemoryCreate(Size size, * Initialize space allocation status for segment. */ hdr->totalsize = size; - hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader)); *shim = hdr; /* Save info for possible future use */ diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 7cb8b4c9b6..794e4fcb2a 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -389,7 +389,7 @@ PGSharedMemoryCreate(Size size, * Initialize space allocation status for segment. 
*/ hdr->totalsize = size; - hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader)); hdr->dsm_control = 0; /* Save info for possible future use */ diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index cea229ad6a..45690b11c9 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -96,7 +96,6 @@ typedef struct HANDLE UsedShmemSegID; #endif void *UsedShmemSegAddr; - slock_t *ShmemLock; #ifdef USE_INJECTION_POINTS struct InjectionPointsCtl *ActiveInjectionPoints; #endif @@ -676,7 +675,7 @@ SubPostmasterMain(int argc, char *argv[]) /* Restore basic shared memory pointers */ if (UsedShmemSegAddr != NULL) - InitShmemAccess(UsedShmemSegAddr); + InitShmemAllocator(UsedShmemSegAddr); /* * Run the appropriate Main function @@ -724,8 +723,6 @@ save_backend_variables(BackendParameters *param, param->UsedShmemSegID = UsedShmemSegID; param->UsedShmemSegAddr = UsedShmemSegAddr; - param->ShmemLock = ShmemLock; - #ifdef USE_INJECTION_POINTS param->ActiveInjectionPoints = ActiveInjectionPoints; #endif @@ -986,8 +983,6 @@ restore_backend_variables(BackendParameters *param) UsedShmemSegID = param->UsedShmemSegID; UsedShmemSegAddr = param->UsedShmemSegAddr; - ShmemLock = param->ShmemLock; - #ifdef USE_INJECTION_POINTS ActiveInjectionPoints = param->ActiveInjectionPoints; #endif diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2a3dfedf7e..1f7e933d50 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -212,12 +212,10 @@ CreateSharedMemoryAndSemaphores(void) Assert(strcmp("unknown", GetConfigOption("huge_pages_status", false, false)) != 0); - InitShmemAccess(seghdr); - /* * Set up shared memory allocation mechanism */ - InitShmemAllocation(); + InitShmemAllocator(seghdr); /* Initialize subsystems */ CreateOrAttachShmemStructs(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 1b53636315..9f362ce864 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -76,20 +76,33 @@ #include "storage/spin.h" #include "utils/builtins.h" +/* + * This is the first data structure stored in the shared memory segment, at + * the offset that PGShmemHeader->content_offset points to. Allocations by + * ShmemAlloc() are carved out of the space after this. + * + * For the base pointer and the total size of the shmem segment, we rely on + * the PGShmemHeader. 
+ */ +typedef struct ShmemAllocatorData +{ + Size free_offset; /* offset to first free space from ShmemBase */ + HTAB *index; /* copy of ShmemIndex */ + + /* protects shared memory and LWLock allocation */ + slock_t shmem_lock; +} ShmemAllocatorData; + static void *ShmemAllocRaw(Size size, Size *allocated_size); -static void *ShmemAllocUnlocked(Size size); /* shared memory global variables */ static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ - static void *ShmemBase; /* start address of shared memory */ - static void *ShmemEnd; /* end+1 address of shared memory */ -slock_t *ShmemLock; /* spinlock for shared memory and LWLock - * allocation */ - +static ShmemAllocatorData *ShmemAllocator; +slock_t *ShmemLock; /* points to ShmemAllocator->shmem_lock */ static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ /* To get reliable results for NUMA inquiry we need to "touch pages" once */ @@ -98,49 +111,64 @@ static bool firstNumaTouch = true; Datum pg_numa_available(PG_FUNCTION_ARGS); /* - * InitShmemAccess() --- set up basic pointers to shared memory. + * InitShmemAllocator() --- set up basic pointers to shared memory. + * + * Called at postmaster or stand-alone backend startup, to initialize the + * allocator's data structure in the shared memory segment. In EXEC_BACKEND, + * this is also called at backend startup, to set up pointers to the shared + * memory areas. */ void -InitShmemAccess(PGShmemHeader *seghdr) +InitShmemAllocator(PGShmemHeader *seghdr) { + Assert(seghdr != NULL); + + /* + * We assume the pointer and offset are MAXALIGN. Not a hard requirement, + * but it's true today and keeps the math below simpler. + */ + Assert(seghdr == (void *) MAXALIGN(seghdr)); + Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset)); + ShmemSegHdr = seghdr; ShmemBase = seghdr; ShmemEnd = (char *) ShmemBase + seghdr->totalsize; -} -/* - * InitShmemAllocation() --- set up shared-memory space allocation. - * - * This should be called only in the postmaster or a standalone backend. - */ -void -InitShmemAllocation(void) -{ - PGShmemHeader *shmhdr = ShmemSegHdr; - char *aligned; +#ifndef EXEC_BACKEND + Assert(!IsUnderPostmaster); +#endif + if (IsUnderPostmaster) + { + PGShmemHeader *shmhdr = ShmemSegHdr; - Assert(shmhdr != NULL); + ShmemAllocator = (ShmemAllocatorData *) ((char *) shmhdr + shmhdr->content_offset); + ShmemLock = &ShmemAllocator->shmem_lock; + } + else + { + Size offset; - /* - * Initialize the spinlock used by ShmemAlloc. We must use - * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. - */ - ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + /* + * Allocations after this point should go through ShmemAlloc, which + * expects to allocate everything on cache line boundaries. Make sure + * the first allocation begins on a cache line boundary. + */ + offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData)); + if (offset > seghdr->totalsize) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + offset))); - SpinLockInit(ShmemLock); + ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset); - /* - * Allocations after this point should go through ShmemAlloc, which - * expects to allocate everything on cache line boundaries. Make sure the - * first allocation begins on a cache line boundary. 
- */ - aligned = (char *) - (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset))); - shmhdr->freeoffset = aligned - (char *) shmhdr; - - /* ShmemIndex can't be set up yet (need LWLocks first) */ - shmhdr->index = NULL; - ShmemIndex = (HTAB *) NULL; + SpinLockInit(&ShmemAllocator->shmem_lock); + ShmemLock = &ShmemAllocator->shmem_lock; + ShmemAllocator->free_offset = offset; + /* ShmemIndex can't be set up yet (need LWLocks first) */ + ShmemAllocator->index = NULL; + ShmemIndex = (HTAB *) NULL; + } } /* @@ -209,13 +237,13 @@ ShmemAllocRaw(Size size, Size *allocated_size) SpinLockAcquire(ShmemLock); - newStart = ShmemSegHdr->freeoffset; + newStart = ShmemAllocator->free_offset; newFree = newStart + size; if (newFree <= ShmemSegHdr->totalsize) { newSpace = (char *) ShmemBase + newStart; - ShmemSegHdr->freeoffset = newFree; + ShmemAllocator->free_offset = newFree; } else newSpace = NULL; @@ -228,45 +256,6 @@ ShmemAllocRaw(Size size, Size *allocated_size) return newSpace; } -/* - * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory - * - * Allocate space without locking ShmemLock. This should be used for, - * and only for, allocations that must happen before ShmemLock is ready. - * - * We consider maxalign, rather than cachealign, sufficient here. - */ -static void * -ShmemAllocUnlocked(Size size) -{ - Size newStart; - Size newFree; - void *newSpace; - - /* - * Ensure allocated space is adequately aligned. - */ - size = MAXALIGN(size); - - Assert(ShmemSegHdr != NULL); - - newStart = ShmemSegHdr->freeoffset; - - newFree = newStart + size; - if (newFree > ShmemSegHdr->totalsize) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); - ShmemSegHdr->freeoffset = newFree; - - newSpace = (char *) ShmemBase + newStart; - - Assert(newSpace == (void *) MAXALIGN(newSpace)); - - return newSpace; -} - /* * ShmemAddrIsValid -- test if an address refers to shared memory * @@ -395,16 +384,14 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) if (!ShmemIndex) { - PGShmemHeader *shmemseghdr = ShmemSegHdr; - /* Must be trying to create/attach to ShmemIndex itself */ Assert(strcmp(name, "ShmemIndex") == 0); if (IsUnderPostmaster) { /* Must be initializing a (non-standalone) backend */ - Assert(shmemseghdr->index != NULL); - structPtr = shmemseghdr->index; + Assert(ShmemAllocator->index != NULL); + structPtr = ShmemAllocator->index; *foundPtr = true; } else @@ -417,9 +404,9 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) * index has been initialized. This should be OK because no other * process can be accessing shared memory yet. 
*/ - Assert(shmemseghdr->index == NULL); + Assert(ShmemAllocator->index == NULL); structPtr = ShmemAlloc(size); - shmemseghdr->index = structPtr; + ShmemAllocator->index = structPtr; *foundPtr = false; } LWLockRelease(ShmemIndexLock); @@ -553,15 +540,15 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) /* output shared memory allocated but not counted via the shmem index */ values[0] = CStringGetTextDatum(""); nulls[1] = true; - values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); + values[2] = Int64GetDatum(ShmemAllocator->free_offset - named_allocated); values[3] = values[2]; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); /* output as-of-yet unused shared memory */ nulls[0] = true; - values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); + values[1] = Int64GetDatum(ShmemAllocator->free_offset); nulls[1] = false; - values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); + values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemAllocator->free_offset); values[3] = values[2]; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 3aeada554b..10c7b06586 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -32,9 +32,9 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ #define PGShmemMagic 679834894 pid_t creatorPID; /* PID of creating process (set but unread) */ Size totalsize; /* total size of segment */ - Size freeoffset; /* offset to first free space */ + Size content_offset; /* offset to the data, i.e. size of this + * header */ dsm_handle dsm_control; /* ID of dynamic shared memory control seg */ - void *index; /* pointer to ShmemIndex table */ #ifndef WIN32 /* Windows doesn't have useful inode#s */ dev_t device; /* device data directory is on */ ino_t inode; /* inode number of data directory */ diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index e71a51dfe8..89d45287c1 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -29,8 +29,7 @@ extern PGDLLIMPORT slock_t *ShmemLock; typedef struct PGShmemHeader PGShmemHeader; /* avoid including * storage/pg_shmem.h here */ -extern void InitShmemAccess(PGShmemHeader *seghdr); -extern void InitShmemAllocation(void); +extern void InitShmemAllocator(PGShmemHeader *seghdr); extern void *ShmemAlloc(Size size); extern void *ShmemAllocNoError(Size size); extern bool ShmemAddrIsValid(const void *addr); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 34374df0d6..9f5ee8fd48 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2804,6 +2804,7 @@ SharedTypmodTableEntry Sharedsort ShellTypeInfo ShippableCacheEntry ShippableCacheKey +ShmemAllocatorData ShmemIndexEnt ShutdownForeignScan_function From a1d7ae2b2e38dd5d783c91316925f9f395da47e6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 30 Jan 2026 14:59:25 -0500 Subject: [PATCH 011/147] Improve guards against false regex matches in BackgroundPsql.pm. BackgroundPsql needs to wait for all the output from an interactive psql command to come back. To make sure that's happened, it issues the command, then issues \echo and \warn psql commands that echo a "banner" string (which we assume won't appear in the command's output), then waits for the banner strings to appear.
The hazard in this approach is that the banner will also appear in the echoed psql commands themselves, so we need to distinguish those echoes from the desired output. Commit 8b886a4e3 tried to do that by positing that the desired output would be directly preceded and followed by newlines, but it turns out that that assumption is timing-sensitive. In particular, it tends to fail in builds made --without-readline, wherein the command echoes will be made by the pty driver and may be interspersed with prompts issued by psql proper. It does seem safe to assume that the banner output we want will be followed by a newline, since that should be the last output before things quiesce. Therefore, we can improve matters by putting quotes around the banner strings in the \echo and \warn psql commands, so that their echoes cannot include banner directly followed by newline, and then checking for just banner-and-newline in the match pattern. While at it, spruce up the pump() call in sub query() to look like the neater version in wait_connect(), and don't die on timeout until after printing whatever we got. Reported-by: Oleg Tselebrovskiy Diagnosed-by: Oleg Tselebrovskiy Author: Tom Lane Reviewed-by: Soumya S Murali Discussion: https://postgr.es/m/db6fdb35a8665ad3c18be01181d44b31@postgrespro.ru Backpatch-through: 14 --- .../perl/PostgreSQL/Test/BackgroundPsql.pm | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm index 5bd41a278d..c6ff2dbde4 100644 --- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm +++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm @@ -155,11 +155,11 @@ sub wait_connect # # See query() for details about why/how the banner is used. my $banner = "background_psql: ready"; - my $banner_match = qr/(^|\n)$banner\r?\n/; - $self->{stdin} .= "\\echo $banner\n\\warn $banner\n"; + my $banner_match = qr/$banner\r?\n/; + $self->{stdin} .= "\\echo '$banner'\n\\warn '$banner'\n"; $self->{run}->pump() until ($self->{stdout} =~ /$banner_match/ - && $self->{stderr} =~ /$banner\r?\n/) + && $self->{stderr} =~ /$banner_match/) || $self->{timeout}->is_expired; note "connect output:\n", @@ -264,22 +264,17 @@ sub query # stderr (or vice versa), even if psql printed them in the opposite # order. We therefore wait on both. # - # We need to match for the newline, because we try to remove it below, and - # it's possible to consume just the input *without* the newline. In - # interactive psql we emit \r\n, so we need to allow for that. Also need - # to be careful that we don't e.g. match the echoed \echo command, rather - # than its output. + # In interactive psql we emit \r\n, so we need to allow for that. + # Also, include quotes around the banner string in the \echo and \warn + # commands, not because the string needs quoting but so that $banner_match + # can't match readline's echoing of these commands. 
my $banner = "background_psql: QUERY_SEPARATOR $query_cnt:"; - my $banner_match = qr/(^|\n)$banner\r?\n/; - $self->{stdin} .= "$query\n;\n\\echo $banner\n\\warn $banner\n"; - pump_until( - $self->{run}, $self->{timeout}, - \$self->{stdout}, qr/$banner_match/); - pump_until( - $self->{run}, $self->{timeout}, - \$self->{stderr}, qr/$banner_match/); - - die "psql query timed out" if $self->{timeout}->is_expired; + my $banner_match = qr/$banner\r?\n/; + $self->{stdin} .= "$query\n;\n\\echo '$banner'\n\\warn '$banner'\n"; + $self->{run}->pump() + until ($self->{stdout} =~ /$banner_match/ + && $self->{stderr} =~ /$banner_match/) + || $self->{timeout}->is_expired; note "results query $query_cnt:\n", explain { @@ -287,9 +282,12 @@ sub query stderr => $self->{stderr}, } unless !$params{verbose}; - # Remove banner from stdout and stderr, our caller doesn't care. The - # first newline is optional, as there would not be one if consuming an - # empty query result. + die "psql query timed out" if $self->{timeout}->is_expired; + + # Remove banner from stdout and stderr, our caller doesn't want it. + # Also remove the query output's trailing newline, if present (there + # would not be one if consuming an empty query result). + $banner_match = qr/\r?\n?$banner\r?\n/; $output = $self->{stdout}; $output =~ s/$banner_match//; $self->{stderr} =~ s/$banner_match//; From 6918434a4acb2b14535b3c1be30d306666db7c24 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 30 Jan 2026 15:11:44 -0500 Subject: [PATCH 012/147] Make psql/t/030_pager.pl more robust. Similarly to the preceding commit, 030_pager.pl was assuming that patterns it looks for in interactive psql output would appear by themselves on a line, but that assumption tends to fall over in builds made --without-readline: the output we get might have a psql prompt immediately followed by the expected line of output. For several of these tests, just checking for the pattern followed by newline seems sufficient, because we could not get a false match against the command echo, nor against the unreplaced command output if the pager fails to be invoked when expected. However, that's fairly scary for the test that was relying on information_schema.referential_constraints: "\d+" could easily appear at the end of a line in that view. Let's get rid of that hazard by making a custom test view instead of using information_schema.referential_constraints. This test script is new in v19, so no need for back-patch. 
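A hedged, self-contained illustration of the failure mode fixed here (the
prompt string is made up; runnable under plain perl):

    my $out = "postgres=# 24\r\n";   # prompt echoed directly ahead of the output
    print "anchored pattern misses\n" unless $out =~ /^ *24\r?$/m;
    print "relaxed pattern matches\n" if $out =~ /24\r?$/m;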
Reported-by: Oleg Tselebrovskiy Author: Oleg Tselebrovskiy Co-authored-by: Tom Lane Reviewed-by: Soumya S Murali Discussion: https://postgr.es/m/db6fdb35a8665ad3c18be01181d44b31@postgrespro.ru --- src/bin/psql/t/030_pager.pl | 43 ++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/src/bin/psql/t/030_pager.pl b/src/bin/psql/t/030_pager.pl index cf81fb1603..a35f2b2629 100644 --- a/src/bin/psql/t/030_pager.pl +++ b/src/bin/psql/t/030_pager.pl @@ -40,6 +40,36 @@ $node->init; $node->start; +# create a view we'll use below +$node->safe_psql( + 'postgres', 'create view public.view_030_pager as select +1 as a, +2 as b, +3 as c, +4 as d, +5 as e, +6 as f, +7 as g, +8 as h, +9 as i, +10 as j, +11 as k, +12 as l, +13 as m, +14 as n, +15 as o, +16 as p, +17 as q, +18 as r, +19 as s, +20 as t, +21 as u, +22 as v, +23 as w, +24 as x, +25 as y, +26 as z'); + # fire up an interactive psql session my $h = $node->interactive_psql('postgres'); @@ -77,25 +107,28 @@ sub do_command # # Note that interactive_psql starts psql with --no-align --tuples-only, # and that the output string will include psql's prompts and command echo. +# So we have to test for patterns that can't match the command itself, +# and we can't assume the match will extend across a whole line (there +# might be a prompt ahead of it in the output). do_command( "SELECT 'test' AS t FROM generate_series(1,23);\n", - qr/^test\r?$/m, + qr/test\r?$/m, "execute SELECT query that needs no pagination"); do_command( "SELECT 'test' AS t FROM generate_series(1,24);\n", - qr/^ *24\r?$/m, + qr/24\r?$/m, "execute SELECT query that needs pagination"); do_command( "\\pset expanded\nSELECT generate_series(1,20) as g;\n", - qr/^ *39\r?$/m, + qr/39\r?$/m, "execute SELECT query that needs pagination in expanded mode"); do_command( - "\\pset tuples_only off\n\\d+ information_schema.referential_constraints\n", - qr/^ *\d+\r?$/m, + "\\pset tuples_only off\n\\d+ public.view_030_pager\n", + qr/55\r?$/m, "execute command with footer that needs pagination"); # send psql an explicit \q to shut it down, else pty won't close properly From d46aa32ea5ce0c61a464cdc2c74fa9a428df8bc1 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 2 Feb 2026 08:02:39 +0900 Subject: [PATCH 013/147] Fix build inconsistency due to the generation of wait-event code The build generates four files based on the wait event contents stored in wait_event_names.txt: - wait_event_types.h - pgstat_wait_event.c - wait_event_funcs_data.c - wait_event_types.sgml The SGML file is generated as part of a documentation build, with its data stored in doc/src/sgml/ for meson and configure. The three others are handled differently for meson and configure: - In configure, all the files are created in src/backend/utils/activity/. A link to wait_event_types.h is created in src/include/utils/. - In meson, all the files are created in src/include/utils/. The two C files, pgstat_wait_event.c and wait_event_funcs_data.c, are then included respectively in wait_event.c and wait_event_funcs.c, without the "utils/" path. For configure, this does not present a problem. For meson, this has to be combined with a trick in src/backend/utils/activity/meson.build, where include_directories needs to point to include/utils/ to make the inclusion of the C files work properly. That causes builds to pull in PostgreSQL headers rather than system headers in some build paths, as src/include/utils/ would take priority on the include search path.
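For illustration, a hedged sketch of that hazard (hypothetical translation unit, not part of the commit; PostgreSQL really does ship a src/include/utils/float.h that can shadow the system header):

    /* hypothetical backend file, compiled with -Isrc/include/utils */
    #include <float.h>      /* intended: the compiler's float.h; with the
                             * extra -I entry it can instead resolve to
                             * PostgreSQL's utils/float.h, because -I
                             * directories are searched before the standard
                             * system directories */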
In order to fix this issue, this commit reworks the way the C/H files are generated, becoming consistent with guc_tables.inc.c: - For meson, basically nothing changes. The files are still generated in src/include/utils/. The trick with include_directories is removed. - For configure, the files are now generated in src/backend/utils/, with links in src/include/utils/ pointing to the ones in src/backend/. This requires extra rules in src/backend/utils/activity/Makefile so that a make command in this sub-directory is able to work. - The three files now fall under header-stamp, which is actually simpler as guc_tables.inc.c does the same. - wait_event_funcs_data.c and pgstat_wait_event.c are now included with "utils/" in their path. This problem has not shown up in the buildfarm; it was noticed on AIX, as a conflict with float.h. It could, however, create conflicts in the buildfarm depending on the environment, with unexpected headers pulled in, so this fix is backpatched down to where the generation of the wait-event files was introduced. While at it, this commit simplifies wait_event_names.txt so that it lists just the names of the generated files rather than their paths: the paths given there had become incorrect, and the one listed for the SGML file was wrong. This change has been tested in the CI, down to v17. Locally, I have run tests with configure (with and without VPATH), as well as meson, on the three branches. Combo oversight in fa88928470b5 and 1e68e43d3f0f. Reported-by: Aditya Kamath Discussion: https://postgr.es/m/LV8PR15MB64888765A43D229EA5D1CFE6D691A@LV8PR15MB6488.namprd15.prod.outlook.com Backpatch-through: 17 --- src/backend/Makefile | 10 +--------- src/backend/utils/.gitignore | 3 +++ src/backend/utils/Makefile | 13 ++++++++++--- src/backend/utils/activity/.gitignore | 3 --- src/backend/utils/activity/Makefile | 17 ++++------------- src/backend/utils/activity/meson.build | 1 - src/backend/utils/activity/wait_event.c | 2 +- src/backend/utils/activity/wait_event_funcs.c | 2 +- src/backend/utils/activity/wait_event_names.txt | 6 +++--- src/include/Makefile | 1 + src/include/utils/.gitignore | 2 ++ src/include/utils/meson.build | 4 +--- 12 files changed, 27 insertions(+), 37 deletions(-) delete mode 100644 src/backend/utils/activity/.gitignore diff --git a/src/backend/Makefile b/src/backend/Makefile index baa9b05d02..05642dc02e 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -136,9 +136,6 @@ parser/gram.h: parser/gram.y storage/lmgr/lwlocknames.h: storage/lmgr/generate-lwlocknames.pl ../include/storage/lwlocklist.h utils/activity/wait_event_names.txt $(MAKE) -C storage/lmgr lwlocknames.h -utils/activity/wait_event_types.h: utils/activity/generate-wait_event_types.pl utils/activity/wait_event_names.txt - $(MAKE) -C utils/activity wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c - # run this unconditionally to avoid needing to know its dependencies here: submake-catalog-headers: $(MAKE) -C ../include/catalog generated-headers @@ -163,18 +160,13 @@ submake-utils-headers: .PHONY: generated-headers -generated-headers: $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/wait_event_types.h submake-catalog-headers submake-nodes-headers submake-utils-headers parser/gram.h +generated-headers: $(top_builddir)/src/include/storage/lwlocknames.h submake-catalog-headers submake-nodes-headers submake-utils-headers parser/gram.h $(top_builddir)/src/include/storage/lwlocknames.h:
storage/lmgr/lwlocknames.h prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \ cd '$(dir $@)' && rm -f $(notdir $@) && \ $(LN_S) "$$prereqdir/$(notdir $<)" . -$(top_builddir)/src/include/utils/wait_event_types.h: utils/activity/wait_event_types.h - prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \ - cd '$(dir $@)' && rm -f $(notdir $@) && \ - $(LN_S) "$$prereqdir/$(notdir $<)" . - utils/probes.o: utils/probes.d $(SUBDIROBJS) $(DTRACE) $(DTRACEFLAGS) -C -G -s $(call expand_subsys,$^) -o $@ diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore index 303c01d051..fa9cfb3969 100644 --- a/src/backend/utils/.gitignore +++ b/src/backend/utils/.gitignore @@ -5,3 +5,6 @@ /guc_tables.inc.c /probes.h /errcodes.h +/pgstat_wait_event.c +/wait_event_funcs_data.c +/wait_event_types.h diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index 6df31504f3..81b4a956bd 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak submake-adt-headers: $(MAKE) -C adt jsonpath_gram.h -$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c +$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on # the timestamps of the individual output files, because the Perl script @@ -58,6 +58,12 @@ errcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-errcodes.pl guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl $(PERL) $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl $< $@ +pgstat_wait_event.c: wait_event_types.h +wait_event_funcs_data.c: wait_event_types.h + +wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl + $(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $< + ifeq ($(enable_dtrace), yes) probes.h: postprocess_dtrace.sed probes.h.tmp sed -f $^ >$@ @@ -73,8 +79,8 @@ endif # These generated headers must be symlinked into src/include/. # We use header-stamp to record that we've done this because the symlinks # themselves may appear older than fmgr-stamp. -$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c - cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c; do \ +$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h + cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \ rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . 
; \ done touch $@ @@ -93,3 +99,4 @@ uninstall-data: clean: rm -f probes.h probes.h.tmp rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c + rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c diff --git a/src/backend/utils/activity/.gitignore b/src/backend/utils/activity/.gitignore deleted file mode 100644 index bd0c0c7772..0000000000 --- a/src/backend/utils/activity/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/pgstat_wait_event.c -/wait_event_types.h -/wait_event_funcs_data.c diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index 0eb29ee78a..c37bfb350b 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -36,17 +36,8 @@ OBJS = \ wait_event.o \ wait_event_funcs.o -include $(top_srcdir)/src/backend/common.mk - -wait_event_funcs.o: wait_event_funcs_data.c -wait_event_funcs_data.c: wait_event_types.h - -wait_event.o: pgstat_wait_event.c -pgstat_wait_event.c: wait_event_types.h - touch $@ +# Force these dependencies to be known even without dependency info built: +wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c +wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c -wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt generate-wait_event_types.pl - $(PERL) $(srcdir)/generate-wait_event_types.pl --code $< - -clean: - rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build index 9f48d5970e..53bd5a246c 100644 --- a/src/backend/utils/activity/meson.build +++ b/src/backend/utils/activity/meson.build @@ -30,7 +30,6 @@ waitevent_sources = files( wait_event = static_library('wait_event_names', waitevent_sources, dependencies: [backend_code], - include_directories: include_directories('../../../include/utils'), kwargs: internal_lib_args, ) diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index e4f2c44025..aca2c8fc74 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -503,4 +503,4 @@ pgstat_get_wait_event(uint32 wait_event_info) return event_name; } -#include "pgstat_wait_event.c" +#include "utils/pgstat_wait_event.c" diff --git a/src/backend/utils/activity/wait_event_funcs.c b/src/backend/utils/activity/wait_event_funcs.c index b62ee83ef7..fa10a80b08 100644 --- a/src/backend/utils/activity/wait_event_funcs.c +++ b/src/backend/utils/activity/wait_event_funcs.c @@ -31,7 +31,7 @@ static const struct waitEventData[] = { -#include "wait_event_funcs_data.c" +#include "utils/wait_event_funcs_data.c" /* end of list */ {NULL, NULL, NULL} }; diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5537a2d253..efde48e76b 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -14,13 +14,13 @@ # # The files generated from this one are: # -# src/backend/utils/activity/wait_event_types.h +# wait_event_types.h # typedef enum definitions for wait events. # -# src/backend/utils/activity/pgstat_wait_event.c +# pgstat_wait_event.c # C functions to get the wait event name based on the enum. # -# src/backend/utils/activity/wait_event_types.sgml +# wait_event_types.sgml # SGML tables of wait events for inclusion in the documentation. 
# # When adding a new wait event, make sure it is placed in the appropriate diff --git a/src/include/Makefile b/src/include/Makefile index 4ef060e905..ac673f4cf1 100644 --- a/src/include/Makefile +++ b/src/include/Makefile @@ -105,6 +105,7 @@ uninstall: clean: rm -f utils/fmgroids.h utils/fmgrprotos.h utils/guc_tables.inc.c utils/errcodes.h utils/header-stamp + rm -f utils/pgstat_wait_event.c utils/wait_event_funcs_data.c rm -f storage/lwlocknames.h utils/probes.h utils/wait_event_types.h rm -f nodes/nodetags.h nodes/header-stamp $(MAKE) -C catalog clean diff --git a/src/include/utils/.gitignore b/src/include/utils/.gitignore index 30f921429c..ff6f61cd7e 100644 --- a/src/include/utils/.gitignore +++ b/src/include/utils/.gitignore @@ -4,4 +4,6 @@ /probes.h /errcodes.h /header-stamp +/pgstat_wait_event.c +/wait_event_funcs_data.c /wait_event_types.h diff --git a/src/include/utils/meson.build b/src/include/utils/meson.build index 318a6aec0d..fd3a2352df 100644 --- a/src/include/utils/meson.build +++ b/src/include/utils/meson.build @@ -79,8 +79,6 @@ generated_backend_headers += fmgrtab_target[1] # autoconf generates the file there, ensure we get a conflict generated_sources_ac += { - 'src/backend/utils': fmgrtab_output + ['errcodes.h', 'probes.h', 'fmgr-stamp'], + 'src/backend/utils': fmgrtab_output + ['errcodes.h', 'wait_event_types.h', 'probes.h', 'fmgr-stamp'], 'src/include/utils': ['header-stamp'], } - -generated_sources_ac += {'src/backend/utils/activity': ['wait_event_types.h']} From a9afa021e95f2b0ffaaf26f3a27e685f634f4ac9 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 2 Feb 2026 10:21:04 +0900 Subject: [PATCH 014/147] Fix error message in RemoveWalSummaryIfOlderThan() A failing unlink() was reporting an incorrect error message, referring to stat(). Author: Man Zeng Reviewed-by: Junwang Zhao Discussion: https://postgr.es/m/tencent_3BBE865C5F49D452360FF190@qq.com Backpatch-through: 17 --- src/backend/backup/walsummary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/backup/walsummary.c b/src/backend/backup/walsummary.c index 21164faac7..4ee510092f 100644 --- a/src/backend/backup/walsummary.c +++ b/src/backend/backup/walsummary.c @@ -251,7 +251,7 @@ RemoveWalSummaryIfOlderThan(WalSummaryFile *ws, time_t cutoff_time) if (unlink(path) != 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", path))); + errmsg("could not remove file \"%s\": %m", path))); ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path))); } From 0c9f46c4280e31a4f49200f5d2cde37727651869 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 2 Feb 2026 11:13:38 -0500 Subject: [PATCH 015/147] In s_lock.h, use regular labels with %= instead of local labels. Up to now we've used GNU-style local labels for branch targets in s_lock.h's assembly blocks. But there's an alternative style, which I for one didn't know about till recently: use regular assembler labels, and insert a per-asm-block number in them using %= to ensure they are distinct across multiple TAS calls within one source file. gcc has had %= since gcc 2.0, and I've verified that clang knows it too. While the immediate motivation for changing this is that AIX's assembler doesn't do local labels, it seems to me that this is a superior solution anyway. There is nothing mnemonic about "1:", while a regular label can convey something useful, and at least to me it feels less error-prone. Therefore let's standardize on this approach, also converting the one other usage in s_lock.h.
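For reference, a hedged stand-alone sketch of the %= style, mirroring the patch's x86 case; the "Demo" label prefix is arbitrary and not from the patch:

    /* gcc/clang inline asm: %= expands to a number unique to each asm
     * instance, so inlined copies of this block get distinct label names. */
    static inline int
    tas_sketch(volatile char *lock)
    {
        char    _res = 1;

        __asm__ __volatile__(
            "   cmpb    $0,%1          \n"
            "   jne     Demo%=_out     \n"
            "   lock                   \n"
            "   xchgb   %0,%1          \n"
            "Demo%=_out:               \n"
            : "+q"(_res), "+m"(*lock)
            : /* no inputs */
            : "memory", "cc");
        return (int) _res;
    }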
Discussion: https://postgr.es/m/399291.1769998688@sss.pgh.pa.us --- src/include/storage/s_lock.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index 2522cae0c3..3d9070e79d 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -119,6 +119,10 @@ * gcc from thinking it can cache the values of shared-memory fields * across the asm code. Add "cc" if your asm code changes the condition * code register, and also list any temp registers the code uses. + * + * If you need branch target labels within the asm block, include "%=" + * in the label names to make them distinct across multiple asm blocks + * within a source file. *---------- */ @@ -147,11 +151,11 @@ tas(volatile slock_t *lock) * leave it alone. */ __asm__ __volatile__( - " cmpb $0,%1 \n" - " jne 1f \n" - " lock \n" - " xchgb %0,%1 \n" - "1: \n" + " cmpb $0,%1 \n" + " jne TAS%=_out \n" + " lock \n" + " xchgb %0,%1 \n" + "TAS%=_out: \n" : "+q"(_res), "+m"(*lock) : /* no inputs */ : "memory", "cc"); @@ -421,17 +425,17 @@ tas(volatile slock_t *lock) __asm__ __volatile__( " lwarx %0,0,%3,1 \n" " cmpwi %0,0 \n" -" bne 1f \n" +" bne TAS%=_fail \n" " addi %0,%0,1 \n" " stwcx. %0,0,%3 \n" -" beq 2f \n" -"1: \n" +" beq TAS%=_ok \n" +"TAS%=_fail: \n" " li %1,1 \n" -" b 3f \n" -"2: \n" +" b TAS%=_out \n" +"TAS%=_ok: \n" " lwsync \n" " li %1,0 \n" -"3: \n" +"TAS%=_out: \n" : "=&b"(_t), "=r"(_res), "+m"(*lock) : "r"(lock) : "memory", "cc"); From da7a1dc0d62ac3141328f4e6ad51d70e918167aa Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 2 Feb 2026 14:39:50 -0500 Subject: [PATCH 016/147] Refactor att_align_nominal() to improve performance. Separate att_align_nominal() into two macros, similarly to what was already done with att_align_datum() and att_align_pointer(). The inner macro att_nominal_alignby() is really just TYPEALIGN(), while att_align_nominal() retains its previous API by mapping TYPALIGN_xxx values to numbers of bytes to align to and then calling att_nominal_alignby(). In support of this, split out tupdesc.c's logic to do that mapping into a publicly visible function typalign_to_alignby(). Having done that, we can replace performance-critical uses of att_align_nominal() with att_nominal_alignby(), where the typalign_to_alignby() mapping is done just once outside the loop. In most places I settled for doing typalign_to_alignby() once per function. We could in many places pass the alignby value in from the caller if we wanted to change function APIs for this purpose; but I'm a bit loath to do that, especially for exported APIs that extensions might call. Replacing a char typalign argument by a uint8 typalignby argument would be an API change that compilers would fail to warn about, thus silently breaking code in hard-to-debug ways. I did revise the APIs of array_iter_setup and array_iter_next, moving the element type attribute arguments to the former; if any external code uses those, the argument-count change will cause visible compile failures. Performance testing shows that ExecEvalScalarArrayOp is sped up by about 10% by this change, when using a simple per-element function such as int8eq. I did not check any of the other loops optimized here, but it's reasonable to expect similar gains. Although the motivation for creating this patch was to avoid a performance loss if we add some more typalign values, it evidently is worth doing whether that patch lands or not. 
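For reference, a hedged sketch of the hoisted-alignment loop pattern this commit installs; walk_elements_sketch is a hypothetical function, while typalign_to_alignby, fetch_att, att_addlength_pointer, and att_nominal_alignby are the primitives the patch uses:

    /* Hypothetical element walker (assumes no NULL elements): map the
     * typalign code to a byte count once, outside the loop; each iteration
     * then pays only a plain TYPEALIGN instead of re-deciding among the
     * TYPALIGN_xxx cases. */
    static void
    walk_elements_sketch(char *p, int nitems,
                         int typlen, bool typbyval, char typalign)
    {
        uint8   typalignby = typalign_to_alignby(typalign);

        for (int i = 0; i < nitems; i++)
        {
            Datum   elt = fetch_att(p, typbyval, typlen);

            (void) elt;         /* ... process the element here ... */
            p = att_addlength_pointer(p, typlen, p);
            p = (char *) att_nominal_alignby(p, typalignby);
        }
    }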
Discussion: https://postgr.es/m/1127261.1769649624@sss.pgh.pa.us --- contrib/dblink/dblink.c | 4 +- src/backend/access/common/tupdesc.c | 21 +--- src/backend/executor/execExprInterp.c | 8 +- src/backend/utils/adt/array_expanded.c | 4 +- src/backend/utils/adt/arrayfuncs.c | 149 +++++++++++++----------- src/backend/utils/adt/multirangetypes.c | 16 +-- src/backend/utils/adt/varlena.c | 4 +- src/include/access/tupmacs.h | 51 ++++++-- src/include/utils/arrayaccess.h | 25 ++-- src/pl/plpython/plpy_typeio.c | 3 +- 10 files changed, 166 insertions(+), 119 deletions(-) diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index 8cb3166495..2498d80c8e 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -2069,6 +2069,7 @@ get_text_array_contents(ArrayType *array, int *numitems) int16 typlen; bool typbyval; char typalign; + uint8 typalignby; char **values; char *ptr; bits8 *bitmap; @@ -2081,6 +2082,7 @@ get_text_array_contents(ArrayType *array, int *numitems) get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + typalignby = typalign_to_alignby(typalign); values = palloc_array(char *, nitems); @@ -2098,7 +2100,7 @@ get_text_array_contents(ArrayType *array, int *numitems) { values[i] = TextDatumGetCString(PointerGetDatum(ptr)); ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } /* advance bitmap pointer if any */ diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index 94b4f1f997..b69d10f0a4 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -86,25 +86,8 @@ populate_compact_attribute_internal(Form_pg_attribute src, IsCatalogRelationOid(src->attrelid) ? 
ATTNULLABLE_VALID : ATTNULLABLE_UNKNOWN; - switch (src->attalign) - { - case TYPALIGN_INT: - dst->attalignby = ALIGNOF_INT; - break; - case TYPALIGN_CHAR: - dst->attalignby = sizeof(char); - break; - case TYPALIGN_DOUBLE: - dst->attalignby = ALIGNOF_DOUBLE; - break; - case TYPALIGN_SHORT: - dst->attalignby = ALIGNOF_SHORT; - break; - default: - dst->attalignby = 0; - elog(ERROR, "invalid attalign value: %c", src->attalign); - break; - } + /* Compute numeric alignment requirement, too */ + dst->attalignby = typalign_to_alignby(src->attalign); } /* diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index a7a5ac1e83..61ff5ddc74 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -4032,6 +4032,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) int16 typlen; bool typbyval; char typalign; + uint8 typalignby; char *s; bits8 *bitmap; int bitmask; @@ -4086,6 +4087,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) typlen = op->d.scalararrayop.typlen; typbyval = op->d.scalararrayop.typbyval; typalign = op->d.scalararrayop.typalign; + typalignby = typalign_to_alignby(typalign); /* Initialize result appropriately depending on useOr */ result = BoolGetDatum(!useOr); @@ -4111,7 +4113,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) { elt = fetch_att(s, typbyval, typlen); s = att_addlength_pointer(s, typlen, s); - s = (char *) att_align_nominal(s, typalign); + s = (char *) att_nominal_alignby(s, typalignby); fcinfo->args[1].value = elt; fcinfo->args[1].isnull = false; } @@ -4255,6 +4257,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco int16 typlen; bool typbyval; char typalign; + uint8 typalignby; int nitems; bool has_nulls = false; char *s; @@ -4272,6 +4275,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco &typlen, &typbyval, &typalign); + typalignby = typalign_to_alignby(typalign); oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); @@ -4318,7 +4322,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco element = fetch_att(s, typbyval, typlen); s = att_addlength_pointer(s, typlen, s); - s = (char *) att_align_nominal(s, typalign); + s = (char *) att_nominal_alignby(s, typalignby); saophash_insert(elements_tab->hashtab, element, &hashfound); } diff --git a/src/backend/utils/adt/array_expanded.c b/src/backend/utils/adt/array_expanded.c index 01e3dddcbb..7e8352af52 100644 --- a/src/backend/utils/adt/array_expanded.c +++ b/src/backend/utils/adt/array_expanded.c @@ -238,6 +238,7 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr) Datum *dvalues; bool *dnulls; Size nbytes; + uint8 typalignby; int i; Assert(eah->ea_magic == EA_MAGIC); @@ -261,12 +262,13 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr) dvalues = eah->dvalues; dnulls = eah->dnulls; nbytes = 0; + typalignby = typalign_to_alignby(eah->typalign); for (i = 0; i < nelems; i++) { if (dnulls && dnulls[i]) continue; nbytes = att_addlength_datum(nbytes, eah->typlen, dvalues[i]); - nbytes = att_align_nominal(nbytes, eah->typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index e71d32773b..da68915ee2 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -75,6 +75,7 @@ typedef struct 
ArrayIteratorData int16 typlen; /* element type's length */ bool typbyval; /* element type's byval property */ char typalign; /* element type's align property */ + uint8 typalignby; /* typalign mapped to numeric alignment */ /* information about the requested slice size */ int slice_ndim; /* slice dimension, or 0 if not slicing */ @@ -123,7 +124,7 @@ static bool array_get_isnull(const bits8 *nullbitmap, int offset); static void array_set_isnull(bits8 *nullbitmap, int offset, bool isNull); static Datum ArrayCast(char *value, bool byval, int len); static int ArrayCastAndSet(Datum src, - int typlen, bool typbyval, char typalign, + int typlen, bool typbyval, uint8 typalignby, char *dest); static char *array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, int typlen, bool typbyval, char typalign); @@ -187,6 +188,7 @@ array_in(PG_FUNCTION_ARGS) int typlen; bool typbyval; char typalign; + uint8 typalignby; char typdelim; Oid typioparam; char *p; @@ -232,6 +234,7 @@ array_in(PG_FUNCTION_ARGS) typlen = my_extra->typlen; typbyval = my_extra->typbyval; typalign = my_extra->typalign; + typalignby = typalign_to_alignby(typalign); typdelim = my_extra->typdelim; typioparam = my_extra->typioparam; @@ -328,7 +331,7 @@ array_in(PG_FUNCTION_ARGS) if (typlen == -1) values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i])); nbytes = att_addlength_datum(nbytes, typlen, values[i]); - nbytes = att_align_nominal(nbytes, typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereturn(escontext, (Datum) 0, @@ -972,6 +975,7 @@ CopyArrayEls(ArrayType *array, bits8 *bitmap = ARR_NULLBITMAP(array); int bitval = 0; int bitmask = 1; + uint8 typalignby = typalign_to_alignby(typalign); int i; if (typbyval) @@ -988,7 +992,7 @@ CopyArrayEls(ArrayType *array, else { bitval |= bitmask; - p += ArrayCastAndSet(values[i], typlen, typbyval, typalign, p); + p += ArrayCastAndSet(values[i], typlen, typbyval, typalignby, p); if (freedata) pfree(DatumGetPointer(values[i])); } @@ -1112,7 +1116,7 @@ array_out(PG_FUNCTION_ARGS) needquotes = (bool *) palloc(nitems * sizeof(bool)); overall_length = 0; - array_iter_setup(&iter, v); + array_iter_setup(&iter, v, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -1121,8 +1125,7 @@ array_out(PG_FUNCTION_ARGS) bool needquote; /* Get source element, checking for NULL */ - itemvalue = array_iter_next(&iter, &isnull, i, - typlen, typbyval, typalign); + itemvalue = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -1468,6 +1471,7 @@ ReadArrayBinary(StringInfo buf, int i; bool hasnull; int32 totbytes; + uint8 typalignby = typalign_to_alignby(typalign); for (i = 0; i < nitems; i++) { @@ -1526,7 +1530,7 @@ ReadArrayBinary(StringInfo buf, if (typlen == -1) values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i])); totbytes = att_addlength_datum(totbytes, typlen, values[i]); - totbytes = att_align_nominal(totbytes, typalign); + totbytes = att_nominal_alignby(totbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(totbytes)) ereport(ERROR, @@ -1614,7 +1618,7 @@ array_send(PG_FUNCTION_ARGS) } /* Send the array elements using the element's own sendproc */ - array_iter_setup(&iter, v); + array_iter_setup(&iter, v, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -1622,8 +1626,7 @@ array_send(PG_FUNCTION_ARGS) bool isnull; /* Get source element, checking for NULL */ - itemvalue = array_iter_next(&iter, &isnull, i, - typlen, typbyval, typalign); + itemvalue = 
array_iter_next(&iter, &isnull, i); if (isnull) { @@ -2231,6 +2234,7 @@ array_set_element(Datum arraydatum, addedafter, lenbefore, lenafter; + uint8 elmalignby = typalign_to_alignby(elmalign); if (arraytyplen > 0) { @@ -2258,7 +2262,7 @@ array_set_element(Datum arraydatum, resultarray = (char *) palloc(arraytyplen); memcpy(resultarray, DatumGetPointer(arraydatum), arraytyplen); elt_ptr = resultarray + indx[0] * elmlen; - ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign, elt_ptr); + ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalignby, elt_ptr); return PointerGetDatum(resultarray); } @@ -2416,7 +2420,7 @@ array_set_element(Datum arraydatum, else { olditemlen = att_addlength_pointer(0, elmlen, elt_ptr); - olditemlen = att_align_nominal(olditemlen, elmalign); + olditemlen = att_nominal_alignby(olditemlen, elmalignby); } lenafter = olddatasize - lenbefore - olditemlen; } @@ -2426,7 +2430,7 @@ array_set_element(Datum arraydatum, else { newitemlen = att_addlength_datum(0, elmlen, dataValue); - newitemlen = att_align_nominal(newitemlen, elmalign); + newitemlen = att_nominal_alignby(newitemlen, elmalignby); } newsize = overheadlen + lenbefore + newitemlen + lenafter; @@ -2449,7 +2453,7 @@ array_set_element(Datum arraydatum, (char *) array + oldoverheadlen, lenbefore); if (!isNull) - ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign, + ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalignby, (char *) newarray + overheadlen + lenbefore); memcpy((char *) newarray + overheadlen + lenbefore + newitemlen, (char *) array + oldoverheadlen + lenbefore + olditemlen, @@ -3221,6 +3225,7 @@ array_map(Datum arrayd, int typlen; bool typbyval; char typalign; + uint8 typalignby; array_iter iter; ArrayMetaState *inp_extra; ArrayMetaState *ret_extra; @@ -3270,21 +3275,21 @@ array_map(Datum arrayd, typlen = ret_extra->typlen; typbyval = ret_extra->typbyval; typalign = ret_extra->typalign; + typalignby = typalign_to_alignby(typalign); /* Allocate temporary arrays for new values */ values = (Datum *) palloc(nitems * sizeof(Datum)); nulls = (bool *) palloc(nitems * sizeof(bool)); /* Loop over source data */ - array_iter_setup(&iter, v); + array_iter_setup(&iter, v, inp_typlen, inp_typbyval, inp_typalign); hasnulls = false; for (i = 0; i < nitems; i++) { /* Get source element, checking for NULL */ *transform_source = - array_iter_next(&iter, transform_source_isnull, i, - inp_typlen, inp_typbyval, inp_typalign); + array_iter_next(&iter, transform_source_isnull, i); /* Apply the given expression to source element */ values[i] = ExecEvalExpr(exprstate, econtext, &nulls[i]); @@ -3298,7 +3303,7 @@ array_map(Datum arrayd, values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i])); /* Update total result size */ nbytes = att_addlength_datum(nbytes, typlen, values[i]); - nbytes = att_align_nominal(nbytes, typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, @@ -3505,6 +3510,7 @@ construct_md_array(Datum *elems, int32 dataoffset; int i; int nelems; + uint8 elmalignby = typalign_to_alignby(elmalign); if (ndims < 0) /* we do allow zero-dimension arrays */ ereport(ERROR, @@ -3538,7 +3544,7 @@ construct_md_array(Datum *elems, if (elmlen == -1) elems[i] = PointerGetDatum(PG_DETOAST_DATUM(elems[i])); nbytes = att_addlength_datum(nbytes, elmlen, elems[i]); - nbytes = att_align_nominal(nbytes, elmalign); + nbytes = att_nominal_alignby(nbytes, elmalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) 
ereport(ERROR, @@ -3641,6 +3647,7 @@ deconstruct_array(const ArrayType *array, bits8 *bitmap; int bitmask; int i; + uint8 elmalignby = typalign_to_alignby(elmalign); Assert(ARR_ELEMTYPE(array) == elmtype); @@ -3673,7 +3680,7 @@ deconstruct_array(const ArrayType *array, { elems[i] = fetch_att(p, elmbyval, elmlen); p = att_addlength_pointer(p, elmlen, p); - p = (char *) att_align_nominal(p, elmalign); + p = (char *) att_nominal_alignby(p, elmalignby); } /* advance bitmap pointer if any */ @@ -3878,8 +3885,8 @@ array_eq(PG_FUNCTION_ARGS) /* Loop over source data */ nitems = ArrayGetNItems(ndims1, dims1); - array_iter_setup(&it1, array1); - array_iter_setup(&it2, array2); + array_iter_setup(&it1, array1, typlen, typbyval, typalign); + array_iter_setup(&it2, array2, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -3890,10 +3897,8 @@ array_eq(PG_FUNCTION_ARGS) bool oprresult; /* Get elements, checking for NULL */ - elt1 = array_iter_next(&it1, &isnull1, i, - typlen, typbyval, typalign); - elt2 = array_iter_next(&it2, &isnull2, i, - typlen, typbyval, typalign); + elt1 = array_iter_next(&it1, &isnull1, i); + elt2 = array_iter_next(&it2, &isnull2, i); /* * We consider two NULLs equal; NULL and not-NULL are unequal. @@ -4042,8 +4047,8 @@ array_cmp(FunctionCallInfo fcinfo) /* Loop over source data */ min_nitems = Min(nitems1, nitems2); - array_iter_setup(&it1, array1); - array_iter_setup(&it2, array2); + array_iter_setup(&it1, array1, typlen, typbyval, typalign); + array_iter_setup(&it2, array2, typlen, typbyval, typalign); for (i = 0; i < min_nitems; i++) { @@ -4054,8 +4059,8 @@ array_cmp(FunctionCallInfo fcinfo) int32 cmpresult; /* Get elements, checking for NULL */ - elt1 = array_iter_next(&it1, &isnull1, i, typlen, typbyval, typalign); - elt2 = array_iter_next(&it2, &isnull2, i, typlen, typbyval, typalign); + elt1 = array_iter_next(&it1, &isnull1, i); + elt2 = array_iter_next(&it2, &isnull2, i); /* * We consider two NULLs equal; NULL > not-NULL. 
@@ -4238,7 +4243,7 @@ hash_array(PG_FUNCTION_ARGS) /* Loop over source data */ nitems = ArrayGetNItems(ndims, dims); - array_iter_setup(&iter, array); + array_iter_setup(&iter, array, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -4247,7 +4252,7 @@ hash_array(PG_FUNCTION_ARGS) uint32 elthash; /* Get element, checking for NULL */ - elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign); + elt = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -4328,7 +4333,7 @@ hash_array_extended(PG_FUNCTION_ARGS) /* Loop over source data */ nitems = ArrayGetNItems(ndims, dims); - array_iter_setup(&iter, array); + array_iter_setup(&iter, array, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -4337,7 +4342,7 @@ hash_array_extended(PG_FUNCTION_ARGS) uint64 elthash; /* Get element, checking for NULL */ - elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign); + elt = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -4451,7 +4456,7 @@ array_contain_compare(AnyArrayType *array1, AnyArrayType *array2, Oid collation, /* Loop over source data */ nelems1 = ArrayGetNItems(AARR_NDIM(array1), AARR_DIMS(array1)); - array_iter_setup(&it1, array1); + array_iter_setup(&it1, array1, typlen, typbyval, typalign); for (i = 0; i < nelems1; i++) { @@ -4459,7 +4464,7 @@ array_contain_compare(AnyArrayType *array1, AnyArrayType *array2, Oid collation, bool isnull1; /* Get element, checking for NULL */ - elt1 = array_iter_next(&it1, &isnull1, i, typlen, typbyval, typalign); + elt1 = array_iter_next(&it1, &isnull1, i); /* * We assume that the comparison operator is strict, so a NULL can't @@ -4626,6 +4631,7 @@ array_create_iterator(ArrayType *arr, int slice_ndim, ArrayMetaState *mstate) &iterator->typlen, &iterator->typbyval, &iterator->typalign); + iterator->typalignby = typalign_to_alignby(iterator->typalign); /* * Remember the slicing parameters. 
@@ -4700,7 +4706,7 @@ array_iterate(ArrayIterator iterator, Datum *value, bool *isnull) /* Move our data pointer forward to the next element */ p = att_addlength_pointer(p, iterator->typlen, p); - p = (char *) att_align_nominal(p, iterator->typalign); + p = (char *) att_nominal_alignby(p, iterator->typalignby); iterator->data_ptr = p; } } @@ -4730,7 +4736,7 @@ array_iterate(ArrayIterator iterator, Datum *value, bool *isnull) /* Move our data pointer forward to the next element */ p = att_addlength_pointer(p, iterator->typlen, p); - p = (char *) att_align_nominal(p, iterator->typalign); + p = (char *) att_nominal_alignby(p, iterator->typalignby); } } @@ -4828,7 +4834,7 @@ static int ArrayCastAndSet(Datum src, int typlen, bool typbyval, - char typalign, + uint8 typalignby, char *dest) { int inc; @@ -4839,14 +4845,14 @@ ArrayCastAndSet(Datum src, store_att_byval(dest, src, typlen); else memmove(dest, DatumGetPointer(src), typlen); - inc = att_align_nominal(typlen, typalign); + inc = att_nominal_alignby(typlen, typalignby); } else { Assert(!typbyval); inc = att_addlength_datum(0, typlen, src); memmove(dest, DatumGetPointer(src), inc); - inc = att_align_nominal(inc, typalign); + inc = att_nominal_alignby(inc, typalignby); } return inc; @@ -4867,12 +4873,13 @@ static char * array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, int typlen, bool typbyval, char typalign) { + uint8 typalignby = typalign_to_alignby(typalign); int bitmask; int i; /* easy if fixed-size elements and no NULLs */ if (typlen > 0 && !nullbitmap) - return ptr + nitems * ((Size) att_align_nominal(typlen, typalign)); + return ptr + nitems * ((Size) att_nominal_alignby(typlen, typalignby)); /* seems worth having separate loops for NULL and no-NULLs cases */ if (nullbitmap) @@ -4885,7 +4892,7 @@ array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, if (*nullbitmap & bitmask) { ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } bitmask <<= 1; if (bitmask == 0x100) @@ -4900,7 +4907,7 @@ array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, for (i = 0; i < nitems; i++) { ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } } return ptr; @@ -5050,12 +5057,13 @@ array_slice_size(char *arraydataptr, bits8 *arraynullsptr, j, inc; int count = 0; + uint8 typalignby = typalign_to_alignby(typalign); mda_get_range(ndim, span, st, endp); /* Pretty easy for fixed element length without nulls ... 
*/ if (typlen > 0 && !arraynullsptr) - return ArrayGetNItems(ndim, span) * att_align_nominal(typlen, typalign); + return ArrayGetNItems(ndim, span) * att_nominal_alignby(typlen, typalignby); /* Else gotta do it the hard way */ src_offset = ArrayGetOffset(ndim, dim, lb, st); @@ -5077,7 +5085,7 @@ array_slice_size(char *arraydataptr, bits8 *arraynullsptr, if (!array_get_isnull(arraynullsptr, src_offset)) { inc = att_addlength_pointer(0, typlen, ptr); - inc = att_align_nominal(inc, typalign); + inc = att_nominal_alignby(inc, typalignby); ptr += inc; count += inc; } @@ -6096,6 +6104,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, int16 elmlen; bool elmbyval; char elmalign; + uint8 elmalignby; ArrayMetaState *my_extra; /* @@ -6190,6 +6199,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, elmlen = my_extra->typlen; elmbyval = my_extra->typbyval; elmalign = my_extra->typalign; + elmalignby = typalign_to_alignby(elmalign); /* compute required space */ if (!isnull) @@ -6204,7 +6214,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, value = PointerGetDatum(PG_DETOAST_DATUM(value)); nbytes = att_addlength_datum(0, elmlen, value); - nbytes = att_align_nominal(nbytes, elmalign); + nbytes = att_nominal_alignby(nbytes, elmalignby); Assert(nbytes > 0); totbytes = nbytes * nitems; @@ -6228,7 +6238,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, p = ARR_DATA_PTR(result); for (i = 0; i < nitems; i++) - p += ArrayCastAndSet(value, elmlen, elmbyval, elmalign, p); + p += ArrayCastAndSet(value, elmlen, elmbyval, elmalignby, p); } else { @@ -6259,9 +6269,6 @@ array_unnest(PG_FUNCTION_ARGS) array_iter iter; int nextelem; int numelems; - int16 elmlen; - bool elmbyval; - char elmalign; } array_unnest_fctx; FuncCallContext *funcctx; @@ -6272,6 +6279,9 @@ array_unnest(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { AnyArrayType *arr; + int16 elmlen; + bool elmbyval; + char elmalign; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); @@ -6293,23 +6303,24 @@ array_unnest(PG_FUNCTION_ARGS) /* allocate memory for user context */ fctx = palloc_object(array_unnest_fctx); - /* initialize state */ - array_iter_setup(&fctx->iter, arr); - fctx->nextelem = 0; - fctx->numelems = ArrayGetNItems(AARR_NDIM(arr), AARR_DIMS(arr)); - + /* get element-type data */ if (VARATT_IS_EXPANDED_HEADER(arr)) { /* we can just grab the type data from expanded array */ - fctx->elmlen = arr->xpn.typlen; - fctx->elmbyval = arr->xpn.typbyval; - fctx->elmalign = arr->xpn.typalign; + elmlen = arr->xpn.typlen; + elmbyval = arr->xpn.typbyval; + elmalign = arr->xpn.typalign; } else get_typlenbyvalalign(AARR_ELEMTYPE(arr), - &fctx->elmlen, - &fctx->elmbyval, - &fctx->elmalign); + &elmlen, + &elmbyval, + &elmalign); + + /* initialize state */ + array_iter_setup(&fctx->iter, arr, elmlen, elmbyval, elmalign); + fctx->nextelem = 0; + fctx->numelems = ArrayGetNItems(AARR_NDIM(arr), AARR_DIMS(arr)); funcctx->user_fctx = fctx; MemoryContextSwitchTo(oldcontext); @@ -6324,8 +6335,7 @@ array_unnest(PG_FUNCTION_ARGS) int offset = fctx->nextelem++; Datum elem; - elem = array_iter_next(&fctx->iter, &fcinfo->isnull, offset, - fctx->elmlen, fctx->elmbyval, fctx->elmalign); + elem = array_iter_next(&fctx->iter, &fcinfo->isnull, offset); SRF_RETURN_NEXT(funcctx, elem); } @@ -6401,6 +6411,7 @@ array_replace_internal(ArrayType *array, int typlen; bool typbyval; char typalign; + uint8 typalignby; char *arraydataptr; bits8 *bitmap; int bitmask; @@ -6445,6 +6456,7 @@ array_replace_internal(ArrayType *array, 
typlen = typentry->typlen; typbyval = typentry->typbyval; typalign = typentry->typalign; + typalignby = typalign_to_alignby(typalign); /* * Detoast values if they are toasted. The replacement value must be @@ -6506,7 +6518,7 @@ array_replace_internal(ArrayType *array, isNull = false; elt = fetch_att(arraydataptr, typbyval, typlen); arraydataptr = att_addlength_datum(arraydataptr, typlen, elt); - arraydataptr = (char *) att_align_nominal(arraydataptr, typalign); + arraydataptr = (char *) att_nominal_alignby(arraydataptr, typalignby); if (search_isnull) { @@ -6553,7 +6565,7 @@ array_replace_internal(ArrayType *array, { /* Update total result size */ nbytes = att_addlength_datum(nbytes, typlen, values[nresult]); - nbytes = att_align_nominal(nbytes, typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, @@ -6860,6 +6872,7 @@ width_bucket_array_variable(Datum operand, int typlen = typentry->typlen; bool typbyval = typentry->typbyval; char typalign = typentry->typalign; + uint8 typalignby = typalign_to_alignby(typalign); int left; int right; @@ -6883,7 +6896,7 @@ width_bucket_array_variable(Datum operand, for (i = left; i < mid; i++) { ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } locfcinfo->args[0].value = operand; @@ -6908,7 +6921,7 @@ width_bucket_array_variable(Datum operand, * ensures we do only O(N) array indexing work, not O(N^2). */ ptr = att_addlength_pointer(ptr, typlen, ptr); - thresholds_data = (char *) att_align_nominal(ptr, typalign); + thresholds_data = (char *) att_nominal_alignby(ptr, typalignby); } } diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c index 07e2a81d46..b1942387dc 100644 --- a/src/backend/utils/adt/multirangetypes.c +++ b/src/backend/utils/adt/multirangetypes.c @@ -572,21 +572,22 @@ multirange_size_estimate(TypeCacheEntry *rangetyp, int32 range_count, RangeType **ranges) { char elemalign = rangetyp->rngelemtype->typalign; + uint8 elemalignby = typalign_to_alignby(elemalign); Size size; int32 i; /* * Count space for MultirangeType struct, items and flags. 
*/ - size = att_align_nominal(sizeof(MultirangeType) + - Max(range_count - 1, 0) * sizeof(uint32) + - range_count * sizeof(uint8), elemalign); + size = att_nominal_alignby(sizeof(MultirangeType) + + Max(range_count - 1, 0) * sizeof(uint32) + + range_count * sizeof(uint8), elemalignby); /* Count space for range bounds */ for (i = 0; i < range_count; i++) - size += att_align_nominal(VARSIZE(ranges[i]) - - sizeof(RangeType) - - sizeof(char), elemalign); + size += att_nominal_alignby(VARSIZE(ranges[i]) - + sizeof(RangeType) - + sizeof(char), elemalignby); return size; } @@ -605,6 +606,7 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp, const char *begin; char *ptr; char elemalign = rangetyp->rngelemtype->typalign; + uint8 elemalignby = typalign_to_alignby(elemalign); items = MultirangeGetItemsPtr(multirange); flags = MultirangeGetFlagsPtr(multirange); @@ -630,7 +632,7 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp, flags[i] = *((char *) ranges[i] + VARSIZE(ranges[i]) - sizeof(char)); len = VARSIZE(ranges[i]) - sizeof(RangeType) - sizeof(char); memcpy(ptr, ranges[i] + 1, len); - ptr += att_align_nominal(len, elemalign); + ptr += att_nominal_alignby(len, elemalignby); } } diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 6c1ebb0866..552ac0c61d 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -3898,6 +3898,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, int typlen; bool typbyval; char typalign; + uint8 typalignby; StringInfoData buf; bool printed = false; char *p; @@ -3947,6 +3948,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, typlen = my_extra->typlen; typbyval = my_extra->typbyval; typalign = my_extra->typalign; + typalignby = typalign_to_alignby(typalign); p = ARR_DATA_PTR(v); bitmap = ARR_NULLBITMAP(v); @@ -3983,7 +3985,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, printed = true; p = att_addlength_pointer(p, typlen, p); - p = (char *) att_align_nominal(p, typalign); + p = (char *) att_nominal_alignby(p, typalignby); } /* advance bitmap pointer if any */ diff --git a/src/include/access/tupmacs.h b/src/include/access/tupmacs.h index 3e5530658c..d64c18b950 100644 --- a/src/include/access/tupmacs.h +++ b/src/include/access/tupmacs.h @@ -71,6 +71,43 @@ fetch_att(const void *T, bool attbyval, int attlen) } #endif /* FRONTEND */ +/* + * typalign_to_alignby: map a TYPALIGN_xxx value to the numeric alignment + * value it represents. (We store TYPALIGN_xxx codes not the real alignment + * values mainly so that initial catalog contents can be machine-independent.) + */ +static inline uint8 +typalign_to_alignby(char typalign) +{ + uint8 alignby; + + switch (typalign) + { + case TYPALIGN_CHAR: + alignby = sizeof(char); + break; + case TYPALIGN_SHORT: + alignby = ALIGNOF_SHORT; + break; + case TYPALIGN_INT: + alignby = ALIGNOF_INT; + break; + case TYPALIGN_DOUBLE: + alignby = ALIGNOF_DOUBLE; + break; + default: +#ifndef FRONTEND + elog(ERROR, "invalid typalign value: %c", typalign); +#else + fprintf(stderr, "invalid typalign value: %c\n", typalign); + exit(1); +#endif + alignby = 0; + break; + } + return alignby; +} + /* * att_align_datum aligns the given offset as needed for a datum of alignment * requirement attalign and typlen attlen. 
attdatum is the Datum variable @@ -139,19 +176,11 @@ fetch_att(const void *T, bool attbyval, int attlen) * * within arrays and multiranges, we unconditionally align varlenas (XXX this * should be revisited, probably). * - * The attalign cases are tested in what is hopefully something like their - * frequency of occurrence. + * In performance-critical loops, avoid using this macro; instead use + * att_nominal_alignby with a pre-computed alignby value. */ #define att_align_nominal(cur_offset, attalign) \ -( \ - ((attalign) == TYPALIGN_INT) ? INTALIGN(cur_offset) : \ - (((attalign) == TYPALIGN_CHAR) ? (uintptr_t) (cur_offset) : \ - (((attalign) == TYPALIGN_DOUBLE) ? DOUBLEALIGN(cur_offset) : \ - ( \ - AssertMacro((attalign) == TYPALIGN_SHORT), \ - SHORTALIGN(cur_offset) \ - ))) \ -) + att_nominal_alignby(cur_offset, typalign_to_alignby(attalign)) /* * Similar to att_align_nominal, but accepts a number of bytes, typically from diff --git a/src/include/utils/arrayaccess.h b/src/include/utils/arrayaccess.h index abb8659de0..a325ae5257 100644 --- a/src/include/utils/arrayaccess.h +++ b/src/include/utils/arrayaccess.h @@ -22,8 +22,8 @@ * Functions for iterating through elements of a flat or expanded array. * These require a state struct "array_iter iter". * - * Use "array_iter_setup(&iter, arrayptr);" to prepare to iterate, and - * "datumvar = array_iter_next(&iter, &isnullvar, index, ...);" to fetch + * Use "array_iter_setup(&iter, arrayptr, ...);" to prepare to iterate, + * and "datumvar = array_iter_next(&iter, &isnullvar, index);" to fetch * the next element into datumvar/isnullvar. * "index" must be the zero-origin element number; we make caller provide * this since caller is generally counting the elements anyway. Despite @@ -42,11 +42,17 @@ typedef struct array_iter char *dataptr; /* Current spot in the data area */ bits8 *bitmapptr; /* Current byte of the nulls bitmap, or NULL */ int bitmask; /* mask for current bit in nulls bitmap */ + + /* Fields used in both cases: data about array's element type */ + int elmlen; + bool elmbyval; + uint8 elmalignby; } array_iter; static inline void -array_iter_setup(array_iter *it, AnyArrayType *a) +array_iter_setup(array_iter *it, AnyArrayType *a, + int elmlen, bool elmbyval, char elmalign) { if (VARATT_IS_EXPANDED_HEADER(a)) { @@ -75,11 +81,13 @@ array_iter_setup(array_iter *it, AnyArrayType *a) it->bitmapptr = ARR_NULLBITMAP((ArrayType *) a); } it->bitmask = 1; + it->elmlen = elmlen; + it->elmbyval = elmbyval; + it->elmalignby = typalign_to_alignby(elmalign); } static inline Datum -array_iter_next(array_iter *it, bool *isnull, int i, - int elmlen, bool elmbyval, char elmalign) +array_iter_next(array_iter *it, bool *isnull, int i) { Datum ret; @@ -98,10 +106,11 @@ array_iter_next(array_iter *it, bool *isnull, int i, else { *isnull = false; - ret = fetch_att(it->dataptr, elmbyval, elmlen); - it->dataptr = att_addlength_pointer(it->dataptr, elmlen, + ret = fetch_att(it->dataptr, it->elmbyval, it->elmlen); + it->dataptr = att_addlength_pointer(it->dataptr, it->elmlen, it->dataptr); - it->dataptr = (char *) att_align_nominal(it->dataptr, elmalign); + it->dataptr = (char *) att_nominal_alignby(it->dataptr, + it->elmalignby); } it->bitmask <<= 1; if (it->bitmask == 0x100) diff --git a/src/pl/plpython/plpy_typeio.c b/src/pl/plpython/plpy_typeio.c index 1f69109b08..44055de6ae 100644 --- a/src/pl/plpython/plpy_typeio.c +++ b/src/pl/plpython/plpy_typeio.c @@ -735,6 +735,7 @@ PLyList_FromArray_recurse(PLyDatumToOb *elm, int *dims, int ndim, int dim, char 
*dataptr = *dataptr_p; bits8 *bitmap = *bitmap_p; int bitmask = *bitmask_p; + uint8 typalignby = typalign_to_alignby(elm->typalign); for (i = 0; i < dims[dim]; i++) { @@ -751,7 +752,7 @@ PLyList_FromArray_recurse(PLyDatumToOb *elm, int *dims, int ndim, int dim, itemvalue = fetch_att(dataptr, elm->typbyval, elm->typlen); PyList_SetItem(list, i, elm->func(elm, itemvalue)); dataptr = att_addlength_pointer(dataptr, elm->typlen, dataptr); - dataptr = (char *) att_align_nominal(dataptr, elm->typalign); + dataptr = (char *) att_nominal_alignby(dataptr, typalignby); } /* advance bitmap pointer if any */ From 12451d9d1f5991739540aefdec77694d59567b34 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 2 Feb 2026 15:43:01 -0600 Subject: [PATCH 017/147] test_shm_mq: Set background worker names. Oversight in commit 5373bc2a08. Author: Michael Banck Discussion: https://postgr.es/m/20260202173156.GB17962%40p46.dedyn.io%3Blightning.p46.dedyn.io --- src/test/modules/test_shm_mq/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/modules/test_shm_mq/setup.c b/src/test/modules/test_shm_mq/setup.c index ba2fd746d7..579e5933d2 100644 --- a/src/test/modules/test_shm_mq/setup.c +++ b/src/test/modules/test_shm_mq/setup.c @@ -228,6 +228,7 @@ setup_background_workers(int nworkers, dsm_segment *seg) /* Register the workers. */ for (i = 0; i < nworkers; ++i) { + snprintf(worker.bgw_name, BGW_MAXLEN, "test_shm_mq worker %d", i + 1); if (!RegisterDynamicBackgroundWorker(&worker, &wstate->handle[i])) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), From 4a99ef1a0d11ed464295515635a44aad1b000691 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 2 Feb 2026 17:44:37 -0500 Subject: [PATCH 018/147] Fix flakiness in the pg_visibility VM-only vacuum test by using a temporary table. The test relies on VACUUM being able to mark a page all-visible, but this can fail when autovacuum in other sessions prevents the visibility horizon from advancing. Making the test table temporary isolates its horizon from other sessions, including catalog table vacuums, ensuring reliable test behavior. 
Reported-by: Alexander Lakhin Author: Kirill Reshke Reviewed-by: Melanie Plageman Discussion: https://postgr.es/m/2b09fba6-6b71-497a-96ef-a6947fcc39f6%40gmail.com --- contrib/pg_visibility/expected/pg_visibility.out | 2 +- contrib/pg_visibility/sql/pg_visibility.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/pg_visibility/expected/pg_visibility.out b/contrib/pg_visibility/expected/pg_visibility.out index e10f170601..d26f0ab758 100644 --- a/contrib/pg_visibility/expected/pg_visibility.out +++ b/contrib/pg_visibility/expected/pg_visibility.out @@ -207,7 +207,7 @@ select pg_truncate_visibility_map('test_partition'); -- test the case where vacuum phase I does not need to modify the heap buffer -- and only needs to set the VM -create table test_vac_unmodified_heap(a int); +create temp table test_vac_unmodified_heap(a int); insert into test_vac_unmodified_heap values (1); vacuum (freeze) test_vac_unmodified_heap; select pg_visibility_map_summary('test_vac_unmodified_heap'); diff --git a/contrib/pg_visibility/sql/pg_visibility.sql b/contrib/pg_visibility/sql/pg_visibility.sql index 57af8a0c5b..0888adb96a 100644 --- a/contrib/pg_visibility/sql/pg_visibility.sql +++ b/contrib/pg_visibility/sql/pg_visibility.sql @@ -97,7 +97,7 @@ select pg_truncate_visibility_map('test_partition'); -- test the case where vacuum phase I does not need to modify the heap buffer -- and only needs to set the VM -create table test_vac_unmodified_heap(a int); +create temp table test_vac_unmodified_heap(a int); insert into test_vac_unmodified_heap values (1); vacuum (freeze) test_vac_unmodified_heap; select pg_visibility_map_summary('test_vac_unmodified_heap'); From dddbbc253b923ef27f724c6abb5a6a39e1254d54 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Tue, 3 Feb 2026 10:03:19 +0900 Subject: [PATCH 019/147] psql: Add %i prompt escape to indicate hot standby status. This commit introduces a new prompt escape %i for psql, which shows whether the connected server is operating in hot standby mode. It expands to standby if the server reports in_hot_standby = on, and primary otherwise. This is useful for distinguishing standby servers from primary ones at a glance, especially when working with multiple connections in replicated environments where libpq's multi-host connection strings are used. Author: Jim Jones Reviewed-by: Fujii Masao Reviewed-by: Greg Sabino Mullane Reviewed-by: Srinath Reddy Sadipiralla Reviewed-by: Nathan Bossart Reviewed-by: Chao Li Reviewed-by: Andreas Karlsson Discussion: https://www.postgresql.org/message-id/flat/016f6738-f9a9-4e98-bb5a-e1e4b9591d46@uni-muenster.de --- doc/src/sgml/ref/psql-ref.sgml | 17 +++++++++++++++++ src/bin/psql/prompt.c | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index e464e3b13d..8b1d948ba0 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -5075,6 +5075,23 @@ testdb=> INSERT INTO my_table VALUES (:'content'); + + %i + + + Indicates whether the connected server is running in hot standby mode. + The value is shown as standby if the server is + currently in hot standby and reports + in_hot_standby as on, + and primary otherwise. This is useful when + connecting to multiple servers to quickly determine the role of + each connection. A value of ? is shown + when connected to a server running + PostgreSQL 13 or older.
+ + + + %x diff --git a/src/bin/psql/prompt.c b/src/bin/psql/prompt.c index 891cd6374f..9725d53dfe 100644 --- a/src/bin/psql/prompt.c +++ b/src/bin/psql/prompt.c @@ -44,6 +44,8 @@ * or a ! if session is not connected to a database; * in prompt2 -, *, ', or "; * in prompt3 nothing + * %i - "standby" or "primary" depending on the server's in_hot_standby + * status, or "?" if unavailable (empty if no connection) * %x - transaction status: empty, *, !, ? (unknown or no connection) * %l - The line number inside the current statement, starting from 1. * %? - the error code of the last query (not yet implemented) @@ -258,7 +260,23 @@ get_prompt(promptStatus_t status, ConditionalStack cstack) break; } break; + case 'i': + if (pset.db) + { + const char *hs = PQparameterStatus(pset.db, "in_hot_standby"); + if (hs) + { + if (strcmp(hs, "on") == 0) + strlcpy(buf, "standby", sizeof(buf)); + else + strlcpy(buf, "primary", sizeof(buf)); + } + /* Use ? for versions that don't report in_hot_standby */ + else + buf[0] = '?'; + } + break; case 'x': if (!pset.db) buf[0] = '?'; From 21c1125d660617f71b20304150e4a8583299cf86 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Tue, 3 Feb 2026 11:14:00 +0900 Subject: [PATCH 020/147] Release synchronous replication waiters immediately on configuration changes. Previously, when synchronous_standby_names was changed (for example, by reducing the number of required synchronous standbys or modifying the standby list), backends waiting for synchronous replication were not released immediately, even if the new configuration no longer required them to wait. They could remain blocked until additional messages arrived from standbys and triggered their release. This commit improves walsender so that backends waiting for synchronous replication are released as soon as the updated configuration takes effect and the new settings no longer require them to wait, by calling SyncRepReleaseWaiters() when configuration changes are processed. As part of this change, the duplicated code that handles configuration changes in walsender has been refactored into a new helper function, which is now used at the three existing call sites. Since this is an improvement rather than a bug fix, it is applied only to the master branch. Author: Shinya Kato Reviewed-by: Chao Li Reviewed-by: Fujii Masao Reviewed-by: Xuneng Zhou Discussion: https://postgr.es/m/CAOzEurSRii0tEYhu5cePmRcvS=ZrxTLEvxm3Kj0d7_uKGdM23g@mail.gmail.com --- src/backend/replication/walsender.c | 47 ++++++++++++++++++----------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index a0e6a3d200..2cde8ebc72 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1611,6 +1611,32 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, ProcessPendingWrites(); } +/* + * Handle configuration reload. + * + * Process any pending configuration file reload and reinitialize the + * synchronous replication settings. Also release any waiters that may now be + * satisfied due to changes in the synchronous replication requirements. + */ +static void +WalSndHandleConfigReload(void) +{ + if (!ConfigReloadPending) + return; + + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + + /* + * Recheck and release any now-satisfied waiters after config reload + * changes synchronous replication requirements (e.g., reducing the number + * of sync standbys or changing the standby names).
+ */ + if (!am_cascading_walsender) + SyncRepReleaseWaiters(); +} + /* * Wait until there is no pending write. Also process replies from the other * side and check timeouts during that. @@ -1646,12 +1672,7 @@ ProcessPendingWrites(void) CHECK_FOR_INTERRUPTS(); /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + WalSndHandleConfigReload(); /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) @@ -1854,12 +1875,7 @@ WalSndWaitForWal(XLogRecPtr loc) CHECK_FOR_INTERRUPTS(); /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + WalSndHandleConfigReload(); /* Check for input from the client */ ProcessRepliesIfAny(); @@ -2899,12 +2915,7 @@ WalSndLoop(WalSndSendDataCallback send_data) CHECK_FOR_INTERRUPTS(); /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + WalSndHandleConfigReload(); /* Check for input from the client */ ProcessRepliesIfAny(); From 213fec296f419af8f199a721a9986e879656555c Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 3 Feb 2026 11:25:10 +0900 Subject: [PATCH 021/147] Fix incorrect errno in OpenWalSummaryFile() This routine has an option to bypass an error if a WAL summary file is opened for read but is missing (missing_ok=true). However, the code incorrectly checked for EEXIST, which matters when using O_CREAT and O_EXCL, rather than ENOENT, the errno actually returned in this case. There are currently only two callers of OpenWalSummaryFile() in the tree, and both use missing_ok=false, meaning that the check based on the errno is currently dead code. This issue could matter for out-of-core code or future backpatches that would like to use missing_ok set to true. Issue spotted while monitoring this area of the code, after a9afa021e95f. Author: Michael Paquier Reviewed-by: Chao Li Discussion: https://postgr.es/m/aYAf8qDHbpBZ3Rml@paquier.xyz Backpatch-through: 17 --- src/backend/backup/walsummary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/backup/walsummary.c b/src/backend/backup/walsummary.c index 4ee510092f..4cd1824fbc 100644 --- a/src/backend/backup/walsummary.c +++ b/src/backend/backup/walsummary.c @@ -214,7 +214,7 @@ OpenWalSummaryFile(WalSummaryFile *ws, bool missing_ok) LSN_FORMAT_ARGS(ws->end_lsn)); file = PathNameOpenFile(path, O_RDONLY); - if (file < 0 && (errno != EEXIST || !missing_ok)) + if (file < 0 && (errno != ENOENT || !missing_ok)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); From e05a24c2d4eab3dd76741dc6e6c18bb0584771c5 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 3 Feb 2026 12:20:41 +0900 Subject: [PATCH 022/147] Add two IO wait events for COPY FROM/TO on a pipe/file/program Two wait events are added to the COPY FROM/TO code: * COPY_FROM_READ: reading data from a copy_file. * COPY_TO_WRITE: writing data to a copy_file. In the COPY code, copy_file can be set when a command is processed in pipe mode (for the non-DestRemote case), program mode, or file mode; the new wait events are reported around the fread() and fwrite() calls on it.
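As a rough, self-contained sketch of the instrumentation pattern being applied (the pgstat stubs and the WAIT_EVENT_COPY_FROM_READ value below are stand-ins, not the backend's definitions), the potentially blocking fread() is bracketed by a start/end pair so that the wait event is advertised only while the call can actually block:

#include <stdint.h>
#include <stdio.h>

#define WAIT_EVENT_COPY_FROM_READ 0x090000EEU	/* stand-in value */

/* Stubs: in the backend these publish the event for pg_stat_activity. */
static void pgstat_report_wait_start(uint32_t wait_event) { (void) wait_event; }
static void pgstat_report_wait_end(void) { }

static size_t
copy_read(void *buf, size_t maxread, FILE *copy_file)
{
	size_t		bytesread;

	pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
	bytesread = fread(buf, 1, maxread, copy_file);
	pgstat_report_wait_end();

	/* error handling (ferror() etc.) happens outside the bracketed call */
	return bytesread;
}

int
main(void)
{
	char		buf[128];

	return copy_read(buf, sizeof(buf), stdin) > 0 ? 0 : 1;
}

The fwrite() path in copyto.c is bracketed the same way, as the diff below shows.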
Author: Nikolay Samokhvalov Reviewed-by: Dilip Kumar Reviewed-by: Sami Imseih Discussion: https://postgr.es/m/CAM527d_iDzz0Kqyi7HOfqa-Xzuq29jkR6AGXqfXLqA5PR5qsng@mail.gmail.com --- src/backend/commands/copyfromparse.c | 2 ++ src/backend/commands/copyto.c | 2 ++ src/backend/utils/activity/wait_event_names.txt | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 5868a7fa11..94d6f415a0 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -249,7 +249,9 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) switch (cstate->copy_src) { case COPY_FILE: + pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ); bytesread = fread(databuf, 1, maxread, cstate->copy_file); + pgstat_report_wait_end(); if (ferror(cstate->copy_file)) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index 4ab4a3893d..9ceeff6d99 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -454,6 +454,7 @@ CopySendEndOfRow(CopyToState cstate) switch (cstate->copy_dest) { case COPY_FILE: + pgstat_report_wait_start(WAIT_EVENT_COPY_TO_WRITE); if (fwrite(fe_msgbuf->data, fe_msgbuf->len, 1, cstate->copy_file) != 1 || ferror(cstate->copy_file)) @@ -486,6 +487,7 @@ CopySendEndOfRow(CopyToState cstate) (errcode_for_file_access(), errmsg("could not write to COPY file: %m"))); } + pgstat_report_wait_end(); break; case COPY_FRONTEND: /* Dump the accumulated row as one CopyData message */ diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index efde48e76b..4aa864fe3c 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -213,6 +213,8 @@ CONTROL_FILE_WRITE_UPDATE "Waiting for a write to update the pg_contro COPY_FILE_COPY "Waiting for a file copy operation." COPY_FILE_READ "Waiting for a read during a file copy operation." COPY_FILE_WRITE "Waiting for a write during a file copy operation." +COPY_FROM_READ "Waiting to read data from a pipe, a file or a program during COPY FROM." +COPY_TO_WRITE "Waiting to write data to a pipe, a file or a program during COPY TO." DATA_FILE_EXTEND "Waiting for a relation data file to be extended." DATA_FILE_FLUSH "Waiting for a relation data file to reach durable storage." DATA_FILE_IMMEDIATE_SYNC "Waiting for an immediate synchronization of a relation data file to durable storage." From 137d05df2f2014c584b229310b8635fa6a8572ba Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 3 Feb 2026 08:36:47 +0100 Subject: [PATCH 023/147] Rename AssertVariableIsOfType to StaticAssertVariableIsOfType This keeps run-time assertions and static assertions clearly separate. 
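As a standalone illustration of what the renamed macro checks (a simplified sketch assuming a GCC/Clang compiler with __builtin_types_compatible_p; PostgreSQL's StaticAssertStmt plumbing and the size-based fallback are omitted here):

#define CppAsString(identifier) #identifier

#define StaticAssertVariableIsOfType(varname, typename) \
	_Static_assert(__builtin_types_compatible_p(__typeof__(varname), typename), \
				   CppAsString(varname) " does not have type " CppAsString(typename))

typedef unsigned long (*check_len_t) (unsigned long len);

unsigned long
check_len(unsigned long len)
{
	return len;
}

int
main(void)
{
	/* Compiles: &check_len really is a check_len_t. */
	StaticAssertVariableIsOfType(&check_len, check_len_t);

	/*
	 * A mismatch, such as StaticAssertVariableIsOfType(&check_len, int),
	 * would fail at compile time with the descriptive message above.
	 */
	return 0;
}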
Reviewed-by: Bertrand Drouvot Discussion: https://www.postgresql.org/message-id/flat/2273bc2a-045d-4a75-8584-7cd9396e5534%40eisentraut.org --- contrib/hstore_plperl/hstore_plperl.c | 12 +++---- contrib/hstore_plpython/hstore_plpython.c | 16 +++++----- contrib/jsonb_plpython/jsonb_plpython.c | 8 ++--- contrib/ltree_plpython/ltree_plpython.c | 4 +-- src/backend/executor/execParallel.c | 2 +- src/backend/jit/llvm/llvmjit_types.c | 10 +++--- src/include/access/xlogdefs.h | 2 +- src/include/c.h | 14 ++++----- src/include/lib/ilist.h | 38 +++++++++++------------ src/include/lib/pairingheap.h | 8 ++--- src/include/postgres.h | 4 +-- src/include/storage/proclist.h | 4 +-- src/include/utils/freepage.h | 2 +- src/include/utils/relptr.h | 10 +++--- 14 files changed, 67 insertions(+), 67 deletions(-) diff --git a/contrib/hstore_plperl/hstore_plperl.c b/contrib/hstore_plperl/hstore_plperl.c index 31393b4fa5..1380a1b436 100644 --- a/contrib/hstore_plperl/hstore_plperl.c +++ b/contrib/hstore_plperl/hstore_plperl.c @@ -28,24 +28,24 @@ static hstoreCheckValLen_t hstoreCheckValLen_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); + /* Static asserts verify that typedefs above match original declarations */ + StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); hstoreUpgrade_p = (hstoreUpgrade_t) load_external_function("$libdir/hstore", "hstoreUpgrade", true, NULL); - AssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); + StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); hstoreUniquePairs_p = (hstoreUniquePairs_t) load_external_function("$libdir/hstore", "hstoreUniquePairs", true, NULL); - AssertVariableIsOfType(&hstorePairs, hstorePairs_t); + StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); hstorePairs_p = (hstorePairs_t) load_external_function("$libdir/hstore", "hstorePairs", true, NULL); - AssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); + StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t) load_external_function("$libdir/hstore", "hstoreCheckKeyLen", true, NULL); - AssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); + StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); hstoreCheckValLen_p = (hstoreCheckValLen_t) load_external_function("$libdir/hstore", "hstoreCheckValLen", true, NULL); diff --git a/contrib/hstore_plpython/hstore_plpython.c b/contrib/hstore_plpython/hstore_plpython.c index e2bfc6da38..3c8ada2a0d 100644 --- a/contrib/hstore_plpython/hstore_plpython.c +++ b/contrib/hstore_plpython/hstore_plpython.c @@ -35,32 +35,32 @@ static hstoreCheckValLen_t hstoreCheckValLen_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); + /* Static asserts verify that typedefs above match original declarations */ + StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); PLyObject_AsString_p = (PLyObject_AsString_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString", true, NULL); - AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); + StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, 
"PLyUnicode_FromStringAndSize", true, NULL); - AssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); + StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); hstoreUpgrade_p = (hstoreUpgrade_t) load_external_function("$libdir/hstore", "hstoreUpgrade", true, NULL); - AssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); + StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); hstoreUniquePairs_p = (hstoreUniquePairs_t) load_external_function("$libdir/hstore", "hstoreUniquePairs", true, NULL); - AssertVariableIsOfType(&hstorePairs, hstorePairs_t); + StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); hstorePairs_p = (hstorePairs_t) load_external_function("$libdir/hstore", "hstorePairs", true, NULL); - AssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); + StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t) load_external_function("$libdir/hstore", "hstoreCheckKeyLen", true, NULL); - AssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); + StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); hstoreCheckValLen_p = (hstoreCheckValLen_t) load_external_function("$libdir/hstore", "hstoreCheckValLen", true, NULL); diff --git a/contrib/jsonb_plpython/jsonb_plpython.c b/contrib/jsonb_plpython/jsonb_plpython.c index 7e8e1d6674..1983bf8c30 100644 --- a/contrib/jsonb_plpython/jsonb_plpython.c +++ b/contrib/jsonb_plpython/jsonb_plpython.c @@ -39,16 +39,16 @@ static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); + /* Static asserts verify that typedefs above match original declarations */ + StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); PLyObject_AsString_p = (PLyObject_AsString_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString", true, NULL); - AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); + StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); - AssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t); + StaticAssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t); PLy_elog_impl_p = (PLy_elog_impl_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLy_elog_impl", true, NULL); diff --git a/contrib/ltree_plpython/ltree_plpython.c b/contrib/ltree_plpython/ltree_plpython.c index 0493aeb242..a25fb5c5fa 100644 --- a/contrib/ltree_plpython/ltree_plpython.c +++ b/contrib/ltree_plpython/ltree_plpython.c @@ -20,8 +20,8 @@ static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); + /* Static asserts verify that typedefs above match original declarations */ + StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 
772e81f315..f87978c137 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -106,7 +106,7 @@ struct SharedExecutorInstrumentation /* array of num_plan_nodes * num_workers Instrumentation objects follows */ }; #define GetInstrumentationArray(sei) \ - (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ + (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ (Instrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ diff --git a/src/backend/jit/llvm/llvmjit_types.c b/src/backend/jit/llvm/llvmjit_types.c index 4636b90cd0..c8a1f84129 100644 --- a/src/backend/jit/llvm/llvmjit_types.c +++ b/src/backend/jit/llvm/llvmjit_types.c @@ -81,7 +81,7 @@ extern Datum AttributeTemplate(PG_FUNCTION_ARGS); Datum AttributeTemplate(PG_FUNCTION_ARGS) { - AssertVariableIsOfType(&AttributeTemplate, PGFunction); + StaticAssertVariableIsOfType(&AttributeTemplate, PGFunction); PG_RETURN_NULL(); } @@ -99,8 +99,8 @@ ExecEvalSubroutineTemplate(ExprState *state, struct ExprEvalStep *op, ExprContext *econtext) { - AssertVariableIsOfType(&ExecEvalSubroutineTemplate, - ExecEvalSubroutine); + StaticAssertVariableIsOfType(&ExecEvalSubroutineTemplate, + ExecEvalSubroutine); } extern bool ExecEvalBoolSubroutineTemplate(ExprState *state, @@ -111,8 +111,8 @@ ExecEvalBoolSubroutineTemplate(ExprState *state, struct ExprEvalStep *op, ExprContext *econtext) { - AssertVariableIsOfType(&ExecEvalBoolSubroutineTemplate, - ExecEvalBoolSubroutine); + StaticAssertVariableIsOfType(&ExecEvalBoolSubroutineTemplate, + ExecEvalBoolSubroutine); return false; } diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index f896dbe149..d77b894cb6 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -44,7 +44,7 @@ typedef uint64 XLogRecPtr; * To avoid breaking translatable messages, we're directly applying the * LSN format instead of using a macro. */ -#define LSN_FORMAT_ARGS(lsn) (AssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn)) +#define LSN_FORMAT_ARGS(lsn) (StaticAssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn)) /* * XLogSegNo - physical log file sequence number. diff --git a/src/include/c.h b/src/include/c.h index c443e75b89..68df93ed42 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -955,26 +955,26 @@ pg_noreturn extern void ExceptionalCondition(const char *conditionName, /* * Compile-time checks that a variable (or expression) has the specified type. * - * AssertVariableIsOfType() can be used as a statement. - * AssertVariableIsOfTypeMacro() is intended for use in macros, eg - * #define foo(x) (AssertVariableIsOfTypeMacro(x, int), bar(x)) + * StaticAssertVariableIsOfType() can be used as a statement. + * StaticAssertVariableIsOfTypeMacro() is intended for use in macros, eg + * #define foo(x) (StaticAssertVariableIsOfTypeMacro(x, int), bar(x)) * * If we don't have __builtin_types_compatible_p, we can still assert that * the types have the same size. This is far from ideal (especially on 32-bit * platforms) but it provides at least some coverage. 
*/ #ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P -#define AssertVariableIsOfType(varname, typename) \ +#define StaticAssertVariableIsOfType(varname, typename) \ StaticAssertStmt(__builtin_types_compatible_p(__typeof__(varname), typename), \ CppAsString(varname) " does not have type " CppAsString(typename)) -#define AssertVariableIsOfTypeMacro(varname, typename) \ +#define StaticAssertVariableIsOfTypeMacro(varname, typename) \ (StaticAssertExpr(__builtin_types_compatible_p(__typeof__(varname), typename), \ CppAsString(varname) " does not have type " CppAsString(typename))) #else /* !HAVE__BUILTIN_TYPES_COMPATIBLE_P */ -#define AssertVariableIsOfType(varname, typename) \ +#define StaticAssertVariableIsOfType(varname, typename) \ StaticAssertStmt(sizeof(varname) == sizeof(typename), \ CppAsString(varname) " does not have type " CppAsString(typename)) -#define AssertVariableIsOfTypeMacro(varname, typename) \ +#define StaticAssertVariableIsOfTypeMacro(varname, typename) \ (StaticAssertExpr(sizeof(varname) == sizeof(typename), \ CppAsString(varname) " does not have type " CppAsString(typename))) #endif /* HAVE__BUILTIN_TYPES_COMPATIBLE_P */ diff --git a/src/include/lib/ilist.h b/src/include/lib/ilist.h index d49ec0ffbc..fc298a6c1d 100644 --- a/src/include/lib/ilist.h +++ b/src/include/lib/ilist.h @@ -591,8 +591,8 @@ dlist_tail_node(dlist_head *head) * This is used to convert a dlist_node * back to its containing struct. */ #define dlist_container(type, membername, ptr) \ - (AssertVariableIsOfTypeMacro(ptr, dlist_node *), \ - AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + (StaticAssertVariableIsOfTypeMacro(ptr, dlist_node *), \ + StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ ((type *) ((char *) (ptr) - offsetof(type, membername)))) /* @@ -601,7 +601,7 @@ dlist_tail_node(dlist_head *head) * The list must not be empty. */ #define dlist_head_element(type, membername, lhead) \ - (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ (type *) dlist_head_element_off(lhead, offsetof(type, membername))) /* @@ -610,7 +610,7 @@ dlist_tail_node(dlist_head *head) * The list must not be empty. */ #define dlist_tail_element(type, membername, lhead) \ - (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ ((type *) dlist_tail_element_off(lhead, offsetof(type, membername)))) /* @@ -621,8 +621,8 @@ dlist_tail_node(dlist_head *head) * It is *not* allowed to manipulate the list during iteration. */ #define dlist_foreach(iter, lhead) \ - for (AssertVariableIsOfTypeMacro(iter, dlist_iter), \ - AssertVariableIsOfTypeMacro(lhead, dlist_head *), \ + for (StaticAssertVariableIsOfTypeMacro(iter, dlist_iter), \ + StaticAssertVariableIsOfTypeMacro(lhead, dlist_head *), \ (iter).end = &(lhead)->head, \ (iter).cur = (iter).end->next ? (iter).end->next : (iter).end; \ (iter).cur != (iter).end; \ @@ -638,8 +638,8 @@ dlist_tail_node(dlist_head *head) * fine to insert or delete adjacent nodes. */ #define dlist_foreach_modify(iter, lhead) \ - for (AssertVariableIsOfTypeMacro(iter, dlist_mutable_iter), \ - AssertVariableIsOfTypeMacro(lhead, dlist_head *), \ + for (StaticAssertVariableIsOfTypeMacro(iter, dlist_mutable_iter), \ + StaticAssertVariableIsOfTypeMacro(lhead, dlist_head *), \ (iter).end = &(lhead)->head, \ (iter).cur = (iter).end->next ? 
(iter).end->next : (iter).end, \ (iter).next = (iter).cur->next; \ @@ -652,8 +652,8 @@ dlist_tail_node(dlist_head *head) * It is *not* allowed to manipulate the list during iteration. */ #define dlist_reverse_foreach(iter, lhead) \ - for (AssertVariableIsOfTypeMacro(iter, dlist_iter), \ - AssertVariableIsOfTypeMacro(lhead, dlist_head *), \ + for (StaticAssertVariableIsOfTypeMacro(iter, dlist_iter), \ + StaticAssertVariableIsOfTypeMacro(lhead, dlist_head *), \ (iter).end = &(lhead)->head, \ (iter).cur = (iter).end->prev ? (iter).end->prev : (iter).end; \ (iter).cur != (iter).end; \ @@ -953,7 +953,7 @@ dclist_count(const dclist_head *head) * The list must not be empty. */ #define dclist_head_element(type, membername, lhead) \ - (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ (type *) dclist_head_element_off(lhead, offsetof(type, membername))) /* @@ -962,7 +962,7 @@ dclist_count(const dclist_head *head) * The list must not be empty. */ #define dclist_tail_element(type, membername, lhead) \ - (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ ((type *) dclist_tail_element_off(lhead, offsetof(type, membername)))) @@ -1104,8 +1104,8 @@ slist_delete_current(slist_mutable_iter *iter) * This is used to convert a slist_node * back to its containing struct. */ #define slist_container(type, membername, ptr) \ - (AssertVariableIsOfTypeMacro(ptr, slist_node *), \ - AssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \ + (StaticAssertVariableIsOfTypeMacro(ptr, slist_node *), \ + StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \ ((type *) ((char *) (ptr) - offsetof(type, membername)))) /* @@ -1114,7 +1114,7 @@ slist_delete_current(slist_mutable_iter *iter) * The list must not be empty. */ #define slist_head_element(type, membername, lhead) \ - (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \ + (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \ (type *) slist_head_element_off(lhead, offsetof(type, membername))) /* @@ -1130,8 +1130,8 @@ slist_delete_current(slist_mutable_iter *iter) * not safe.) */ #define slist_foreach(iter, lhead) \ - for (AssertVariableIsOfTypeMacro(iter, slist_iter), \ - AssertVariableIsOfTypeMacro(lhead, slist_head *), \ + for (StaticAssertVariableIsOfTypeMacro(iter, slist_iter), \ + StaticAssertVariableIsOfTypeMacro(lhead, slist_head *), \ (iter).cur = (lhead)->head.next; \ (iter).cur != NULL; \ (iter).cur = (iter).cur->next) @@ -1146,8 +1146,8 @@ slist_delete_current(slist_mutable_iter *iter) * deletion of nodes adjacent to the current node would misbehave. */ #define slist_foreach_modify(iter, lhead) \ - for (AssertVariableIsOfTypeMacro(iter, slist_mutable_iter), \ - AssertVariableIsOfTypeMacro(lhead, slist_head *), \ + for (StaticAssertVariableIsOfTypeMacro(iter, slist_mutable_iter), \ + StaticAssertVariableIsOfTypeMacro(lhead, slist_head *), \ (iter).prev = &(lhead)->head, \ (iter).cur = (iter).prev->next, \ (iter).next = (iter).cur ? (iter).cur->next : NULL; \ diff --git a/src/include/lib/pairingheap.h b/src/include/lib/pairingheap.h index b93ea5b638..f1582c9862 100644 --- a/src/include/lib/pairingheap.h +++ b/src/include/lib/pairingheap.h @@ -41,16 +41,16 @@ typedef struct pairingheap_node * This is used to convert a pairingheap_node * back to its containing struct. 
*/ #define pairingheap_container(type, membername, ptr) \ - (AssertVariableIsOfTypeMacro(ptr, pairingheap_node *), \ - AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \ + (StaticAssertVariableIsOfTypeMacro(ptr, pairingheap_node *), \ + StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \ ((type *) ((char *) (ptr) - offsetof(type, membername)))) /* * Like pairingheap_container, but used when the pointer is 'const ptr' */ #define pairingheap_const_container(type, membername, ptr) \ - (AssertVariableIsOfTypeMacro(ptr, const pairingheap_node *), \ - AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \ + (StaticAssertVariableIsOfTypeMacro(ptr, const pairingheap_node *), \ + StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \ ((const type *) ((const char *) (ptr) - offsetof(type, membername)))) /* diff --git a/src/include/postgres.h b/src/include/postgres.h index 8b92f453e7..a7a6584e76 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -533,9 +533,9 @@ Float8GetDatum(float8 X) */ #define Int64GetDatumFast(X) \ - (AssertVariableIsOfTypeMacro(X, int64), Int64GetDatum(X)) + (StaticAssertVariableIsOfTypeMacro(X, int64), Int64GetDatum(X)) #define Float8GetDatumFast(X) \ - (AssertVariableIsOfTypeMacro(X, double), Float8GetDatum(X)) + (StaticAssertVariableIsOfTypeMacro(X, double), Float8GetDatum(X)) /* ---------------------------------------------------------------- diff --git a/src/include/storage/proclist.h b/src/include/storage/proclist.h index 965609145e..9caf109a84 100644 --- a/src/include/storage/proclist.h +++ b/src/include/storage/proclist.h @@ -204,8 +204,8 @@ proclist_pop_head_node_offset(proclist_head *list, size_t node_offset) * node with proclist_delete(list, iter.cur, node_offset). */ #define proclist_foreach_modify(iter, lhead, link_member) \ - for (AssertVariableIsOfTypeMacro(iter, proclist_mutable_iter), \ - AssertVariableIsOfTypeMacro(lhead, proclist_head *), \ + for (StaticAssertVariableIsOfTypeMacro(iter, proclist_mutable_iter), \ + StaticAssertVariableIsOfTypeMacro(lhead, proclist_head *), \ (iter).cur = (lhead)->head, \ (iter).next = (iter).cur == INVALID_PROC_NUMBER ? INVALID_PROC_NUMBER : \ proclist_node_get((iter).cur, \ diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h index 8c0e0edd79..2681fd6d5e 100644 --- a/src/include/utils/freepage.h +++ b/src/include/utils/freepage.h @@ -65,7 +65,7 @@ struct FreePageManager /* Macros to convert between page numbers (expressed as Size) and pointers. */ #define fpm_page_to_pointer(base, page) \ - (AssertVariableIsOfTypeMacro(page, Size), \ + (StaticAssertVariableIsOfTypeMacro(page, Size), \ (base) + FPM_PAGE_SIZE * (page)) #define fpm_pointer_to_page(base, ptr) \ (((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE) diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h index aeb17fa24a..94975f2f23 100644 --- a/src/include/utils/relptr.h +++ b/src/include/utils/relptr.h @@ -40,12 +40,12 @@ #ifdef HAVE_TYPEOF #define relptr_access(base, rp) \ - (AssertVariableIsOfTypeMacro(base, char *), \ + (StaticAssertVariableIsOfTypeMacro(base, char *), \ (typeof((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \ (base) + (rp).relptr_off - 1)) #else #define relptr_access(base, rp) \ - (AssertVariableIsOfTypeMacro(base, char *), \ + (StaticAssertVariableIsOfTypeMacro(base, char *), \ (void *) ((rp).relptr_off == 0 ? 
NULL : (base) + (rp).relptr_off - 1)) #endif @@ -70,12 +70,12 @@ relptr_store_eval(char *base, char *val) #ifdef HAVE_TYPEOF #define relptr_store(base, rp, val) \ - (AssertVariableIsOfTypeMacro(base, char *), \ - AssertVariableIsOfTypeMacro(val, typeof((rp).relptr_type)), \ + (StaticAssertVariableIsOfTypeMacro(base, char *), \ + StaticAssertVariableIsOfTypeMacro(val, typeof((rp).relptr_type)), \ (rp).relptr_off = relptr_store_eval((base), (char *) (val))) #else #define relptr_store(base, rp, val) \ - (AssertVariableIsOfTypeMacro(base, char *), \ + (StaticAssertVariableIsOfTypeMacro(base, char *), \ (rp).relptr_off = relptr_store_eval((base), (char *) (val))) #endif From 955e50766869a5ccf862d7f8439f5d35b723c0f9 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 3 Feb 2026 08:36:47 +0100 Subject: [PATCH 024/147] Change StaticAssertVariableIsOfType to be a declaration This allows moving the uses to more natural and useful positions. Also, a declaration is the more native use of static assertions in C. Reviewed-by: Bertrand Drouvot Discussion: https://www.postgresql.org/message-id/flat/2273bc2a-045d-4a75-8584-7cd9396e5534%40eisentraut.org --- contrib/hstore_plperl/hstore_plperl.c | 13 +++++++------ contrib/hstore_plpython/hstore_plpython.c | 17 +++++++++-------- contrib/jsonb_plpython/jsonb_plpython.c | 10 ++++++---- contrib/ltree_plpython/ltree_plpython.c | 5 +++-- src/include/c.h | 6 +++--- 5 files changed, 28 insertions(+), 23 deletions(-) diff --git a/contrib/hstore_plperl/hstore_plperl.c b/contrib/hstore_plperl/hstore_plperl.c index 1380a1b436..69001191cc 100644 --- a/contrib/hstore_plperl/hstore_plperl.c +++ b/contrib/hstore_plperl/hstore_plperl.c @@ -21,6 +21,13 @@ static hstoreCheckKeyLen_t hstoreCheckKeyLen_p; typedef size_t (*hstoreCheckValLen_t) (size_t len); static hstoreCheckValLen_t hstoreCheckValLen_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); +StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); +StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); +StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); +StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); + /* * Module initialize function: fetch function pointers for cross-module calls. 
@@ -28,24 +35,18 @@ static hstoreCheckValLen_t hstoreCheckValLen_p; void _PG_init(void) { - /* Static asserts verify that typedefs above match original declarations */ - StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); hstoreUpgrade_p = (hstoreUpgrade_t) load_external_function("$libdir/hstore", "hstoreUpgrade", true, NULL); - StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); hstoreUniquePairs_p = (hstoreUniquePairs_t) load_external_function("$libdir/hstore", "hstoreUniquePairs", true, NULL); - StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); hstorePairs_p = (hstorePairs_t) load_external_function("$libdir/hstore", "hstorePairs", true, NULL); - StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t) load_external_function("$libdir/hstore", "hstoreCheckKeyLen", true, NULL); - StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); hstoreCheckValLen_p = (hstoreCheckValLen_t) load_external_function("$libdir/hstore", "hstoreCheckValLen", true, NULL); diff --git a/contrib/hstore_plpython/hstore_plpython.c b/contrib/hstore_plpython/hstore_plpython.c index 3c8ada2a0d..d2be030e07 100644 --- a/contrib/hstore_plpython/hstore_plpython.c +++ b/contrib/hstore_plpython/hstore_plpython.c @@ -28,6 +28,15 @@ static hstoreCheckKeyLen_t hstoreCheckKeyLen_p; typedef size_t (*hstoreCheckValLen_t) (size_t len); static hstoreCheckValLen_t hstoreCheckValLen_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); +StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); +StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); +StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); +StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); +StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); +StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); + /* * Module initialize function: fetch function pointers for cross-module calls. 
@@ -35,32 +44,24 @@ static hstoreCheckValLen_t hstoreCheckValLen_p; void _PG_init(void) { - /* Static asserts verify that typedefs above match original declarations */ - StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); PLyObject_AsString_p = (PLyObject_AsString_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString", true, NULL); - StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); - StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); hstoreUpgrade_p = (hstoreUpgrade_t) load_external_function("$libdir/hstore", "hstoreUpgrade", true, NULL); - StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); hstoreUniquePairs_p = (hstoreUniquePairs_t) load_external_function("$libdir/hstore", "hstoreUniquePairs", true, NULL); - StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); hstorePairs_p = (hstorePairs_t) load_external_function("$libdir/hstore", "hstorePairs", true, NULL); - StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t) load_external_function("$libdir/hstore", "hstoreCheckKeyLen", true, NULL); - StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); hstoreCheckValLen_p = (hstoreCheckValLen_t) load_external_function("$libdir/hstore", "hstoreCheckValLen", true, NULL); diff --git a/contrib/jsonb_plpython/jsonb_plpython.c b/contrib/jsonb_plpython/jsonb_plpython.c index 1983bf8c30..c2c4ce37c0 100644 --- a/contrib/jsonb_plpython/jsonb_plpython.c +++ b/contrib/jsonb_plpython/jsonb_plpython.c @@ -33,22 +33,24 @@ typedef PyObject *(*PLyUnicode_FromStringAndSize_t) (const char *s, Py_ssize_t size); static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); +StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); +StaticAssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t); + + /* * Module initialize function: fetch function pointers for cross-module calls. 
*/ void _PG_init(void) { - /* Static asserts verify that typedefs above match original declarations */ - StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); PLyObject_AsString_p = (PLyObject_AsString_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString", true, NULL); - StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); - StaticAssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t); PLy_elog_impl_p = (PLy_elog_impl_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLy_elog_impl", true, NULL); diff --git a/contrib/ltree_plpython/ltree_plpython.c b/contrib/ltree_plpython/ltree_plpython.c index a25fb5c5fa..d4e7b613fa 100644 --- a/contrib/ltree_plpython/ltree_plpython.c +++ b/contrib/ltree_plpython/ltree_plpython.c @@ -13,6 +13,9 @@ PG_MODULE_MAGIC_EXT( typedef PyObject *(*PLyUnicode_FromStringAndSize_t) (const char *s, Py_ssize_t size); static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); + /* * Module initialize function: fetch function pointers for cross-module calls. @@ -20,8 +23,6 @@ static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; void _PG_init(void) { - /* Static asserts verify that typedefs above match original declarations */ - StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); diff --git a/src/include/c.h b/src/include/c.h index 68df93ed42..063eac9808 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -955,7 +955,7 @@ pg_noreturn extern void ExceptionalCondition(const char *conditionName, /* * Compile-time checks that a variable (or expression) has the specified type. * - * StaticAssertVariableIsOfType() can be used as a statement. + * StaticAssertVariableIsOfType() can be used as a declaration. 
* StaticAssertVariableIsOfTypeMacro() is intended for use in macros, eg * #define foo(x) (StaticAssertVariableIsOfTypeMacro(x, int), bar(x)) * @@ -965,14 +965,14 @@ pg_noreturn extern void ExceptionalCondition(const char *conditionName, */ #ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P #define StaticAssertVariableIsOfType(varname, typename) \ - StaticAssertStmt(__builtin_types_compatible_p(__typeof__(varname), typename), \ + StaticAssertDecl(__builtin_types_compatible_p(__typeof__(varname), typename), \ CppAsString(varname) " does not have type " CppAsString(typename)) #define StaticAssertVariableIsOfTypeMacro(varname, typename) \ (StaticAssertExpr(__builtin_types_compatible_p(__typeof__(varname), typename), \ CppAsString(varname) " does not have type " CppAsString(typename))) #else /* !HAVE__BUILTIN_TYPES_COMPATIBLE_P */ #define StaticAssertVariableIsOfType(varname, typename) \ - StaticAssertStmt(sizeof(varname) == sizeof(typename), \ + StaticAssertDecl(sizeof(varname) == sizeof(typename), \ CppAsString(varname) " does not have type " CppAsString(typename)) #define StaticAssertVariableIsOfTypeMacro(varname, typename) \ (StaticAssertExpr(sizeof(varname) == sizeof(typename), \ From 96e2af605043974137d84edf5c0a24561956919e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Tue, 3 Feb 2026 12:33:29 +0100 Subject: [PATCH 025/147] Reject ADD CONSTRAINT NOT NULL if name mismatches existing constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using ALTER TABLE ... ADD CONSTRAINT to add a not-null constraint with an explicit name, we have to ensure that if the column is already marked NOT NULL, the provided name matches the existing constraint name. Failing to do so could lead to confusion regarding which constraint object actually enforces the rule. This patch adds a check to throw an error if the user tries to add a named not-null constraint to a column that already has one with a different name. Reported-by: yanliang lei Co-authored-by: Álvaro Herrera Co-authored-by: Srinath Reddy Sadipiralla Backpatch-through: 18 Discussion: https://postgr.es/m/19351-8f1c523ead498545%40postgresql.org --- src/backend/catalog/heap.c | 1 + src/backend/catalog/pg_constraint.c | 21 +++++++++++++++++++-- src/include/catalog/pg_constraint.h | 2 +- src/test/regress/expected/constraints.out | 6 +++++- src/test/regress/sql/constraints.sql | 4 +++- 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 606434823c..a6ed9849e7 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -2635,6 +2635,7 @@ AddRelationNewConstraints(Relation rel, * requested validity. */ if (AdjustNotNullInheritance(RelationGetRelid(rel), colnum, + cdef->conname, is_local, cdef->is_no_inherit, cdef->skip_validation)) continue; diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index cbbcf166e4..b12765ae69 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -731,14 +731,15 @@ extractNotNullColumn(HeapTuple constrTup) * If a constraint exists but the connoinherit flag is not what the caller * wants, throw an error about the incompatibility. If the desired * constraint is valid but the existing constraint is not valid, also - throw an error about that (the opposite case is acceptable).
If + * the proposed constraint has a different name, also throw an error. * * If everything checks out, we adjust conislocal/coninhcount and return * true. If is_local is true we flip conislocal true, or do nothing if * it's already true; otherwise we increment coninhcount by 1. */ bool -AdjustNotNullInheritance(Oid relid, AttrNumber attnum, +AdjustNotNullInheritance(Oid relid, AttrNumber attnum, const char *new_conname, bool is_local, bool is_no_inherit, bool is_notvalid) { HeapTuple tup; @@ -777,6 +778,22 @@ AdjustNotNullInheritance(Oid relid, AttrNumber attnum, errhint("You might need to validate it using %s.", "ALTER TABLE ... VALIDATE CONSTRAINT")); + /* + * If, for a new constraint that is being defined locally (i.e., not + * being passed down via inheritance), a name was specified, then + * verify that the existing constraint has the same name. Otherwise + * throw an error. Names of inherited constraints are ignored because + * they are not directly user-specified, so matching is not important. + */ + if (is_local && new_conname && + strcmp(new_conname, NameStr(conform->conname)) != 0) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot create not-null constraint \"%s\" on column \"%s\" of table \"%s\"", + new_conname, get_attname(relid, attnum, false), get_rel_name(relid)), + errdetail("A not-null constraint named \"%s\" already exists for this column.", + NameStr(conform->conname))); + if (!is_local) { if (pg_add_s16_overflow(conform->coninhcount, 1, diff --git a/src/include/catalog/pg_constraint.h b/src/include/catalog/pg_constraint.h index 05933cd974..d5661b5bdf 100644 --- a/src/include/catalog/pg_constraint.h +++ b/src/include/catalog/pg_constraint.h @@ -263,7 +263,7 @@ extern HeapTuple findNotNullConstraintAttnum(Oid relid, AttrNumber attnum); extern HeapTuple findNotNullConstraint(Oid relid, const char *colname); extern HeapTuple findDomainNotNullConstraint(Oid typid); extern AttrNumber extractNotNullColumn(HeapTuple constrTup); -extern bool AdjustNotNullInheritance(Oid relid, AttrNumber attnum, +extern bool AdjustNotNullInheritance(Oid relid, AttrNumber attnum, const char *new_conname, bool is_local, bool is_no_inherit, bool is_notvalid); extern List *RelationGetNotNullConstraints(Oid relid, bool cooked, bool include_noinh); diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index 1bbf59cca0..ebc892a2a4 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -846,8 +846,12 @@ CREATE TABLE notnull_tbl1 (a INTEGER NOT NULL NOT NULL); Not-null constraints: "notnull_tbl1_a_not_null" NOT NULL "a" --- no-op +-- specifying an existing constraint is a no-op +ALTER TABLE notnull_tbl1 ADD CONSTRAINT notnull_tbl1_a_not_null NOT NULL a; +-- but using a different constraint name is not allowed ALTER TABLE notnull_tbl1 ADD CONSTRAINT nn NOT NULL a; +ERROR: cannot create not-null constraint "nn" on column "a" of table "notnull_tbl1" +DETAIL: A not-null constraint named "notnull_tbl1_a_not_null" already exists for this column. 
\d+ notnull_tbl1 Table "public.notnull_tbl1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index 733a1dbccf..1e9989698b 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -623,7 +623,9 @@ DROP TABLE deferred_excl; -- verify constraints created for NOT NULL clauses CREATE TABLE notnull_tbl1 (a INTEGER NOT NULL NOT NULL); \d+ notnull_tbl1 --- no-op +-- specifying an existing constraint is a no-op +ALTER TABLE notnull_tbl1 ADD CONSTRAINT notnull_tbl1_a_not_null NOT NULL a; +-- but using a different constraint name is not allowed ALTER TABLE notnull_tbl1 ADD CONSTRAINT nn NOT NULL a; \d+ notnull_tbl1 -- duplicate name From cd375d5b6d5f7d89375541af444e16dd93d27a03 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 3 Feb 2026 15:08:13 +0200 Subject: [PATCH 026/147] Remove useless errdetail_abort() I don't understand how to reach errdetail_abort() with MyProc->recoveryConflictPending set. If a recovery conflict signal is received, ProcessRecoveryConflictInterrupt() raises an ERROR or FATAL error to cancel the query or connection, and abort processing clears the flag. The error message from ProcessRecoveryConflictInterrupt() is very clear that the query or connection was terminated because of recovery conflict. The only way to reach it AFAICS is with a race condition, if the startup process sends a recovery conflict signal when the transaction has just entered aborted state for some other reason. And in that case the detail would be misleading, as the transaction was already aborted for some other reason, not because of the recovery conflict. errdetail_abort() was the only user of the recoveryConflictPending flag in PGPROC, so we can remove that and all the related code too. 
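For reference, a minimal sketch of the call pattern that remains after this change (the struct layout, the reason constant, and the stubbed SignalVirtualTransaction() below are simplified stand-ins for the real definitions): the startup process simply signals each conflicting virtual transaction, with no per-PGPROC conflict flag to set first:

#include <stdio.h>

typedef struct
{
	int			procNumber;		/* -1 terminates a waitlist here */
	unsigned	localTransactionId;
} VirtualTransactionId;

typedef int ProcSignalReason;
#define PROCSIG_RECOVERY_CONFLICT_SNAPSHOT 1	/* stand-in value */

/* Stub: in the backend this finds the PGPROC and sends it a signal. */
static int
SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
{
	printf("signal vxid %d/%u for reason %d\n",
		   vxid.procNumber, vxid.localTransactionId, sigmode);
	return 4242;				/* pretend pid of the signalled backend */
}

int
main(void)
{
	VirtualTransactionId waitlist[] = {{3, 17}, {8, 99}, {-1, 0}};
	VirtualTransactionId *vxid;

	for (vxid = waitlist; vxid->procNumber != -1; vxid++)
		(void) SignalVirtualTransaction(*vxid, PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
	return 0;
}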
Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/4cc13ba1-4248-4884-b6ba-4805349e7f39@iki.fi --- src/backend/storage/ipc/procarray.c | 20 +++-------------- src/backend/storage/ipc/standby.c | 15 ++++++------- src/backend/storage/lmgr/proc.c | 1 - src/backend/tcop/postgres.c | 35 +++++------------------------ src/include/storage/proc.h | 7 ------ src/include/storage/procarray.h | 6 ++--- 6 files changed, 18 insertions(+), 66 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 6be565155a..748c06b51c 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -708,8 +708,6 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - proc->recoveryConflictPending = false; - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -750,8 +748,6 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - proc->recoveryConflictPending = false; - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -933,7 +929,6 @@ ProcArrayClearTransaction(PGPROC *proc) proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - proc->recoveryConflictPending = false; Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); Assert(!proc->delayChkptFlags); @@ -3445,19 +3440,12 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) } /* - * CancelVirtualTransaction - used in recovery conflict processing + * SignalVirtualTransaction - used in recovery conflict processing * * Returns pid of the process signaled, or 0 if not found. */ pid_t -CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) -{ - return SignalVirtualTransaction(vxid, sigmode, true); -} - -pid_t -SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, - bool conflictPending) +SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) { ProcArrayStruct *arrayP = procArray; int index; @@ -3476,7 +3464,6 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, if (procvxid.procNumber == vxid.procNumber && procvxid.localTransactionId == vxid.localTransactionId) { - proc->recoveryConflictPending = conflictPending; pid = proc->pid; if (pid != 0) { @@ -3618,7 +3605,7 @@ CountDBConnections(Oid databaseid) * CancelDBBackends --- cancel backends that are using specified database */ void -CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) +CancelDBBackends(Oid databaseid, ProcSignalReason sigmode) { ProcArrayStruct *arrayP = procArray; int index; @@ -3638,7 +3625,6 @@ CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) GET_VXID_FROM_PGPROC(procvxid, *proc); - proc->recoveryConflictPending = conflictPending; pid = proc->pid; if (pid != 0) { diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index afffab7710..6db803476c 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -390,7 +390,7 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, * Now find out who to throw out of the balloon. 
*/ Assert(VirtualTransactionIdIsValid(*waitlist)); - pid = CancelVirtualTransaction(*waitlist, reason); + pid = SignalVirtualTransaction(*waitlist, reason); /* * Wait a little bit for it to die so that we avoid flooding @@ -581,7 +581,7 @@ ResolveRecoveryConflictWithDatabase(Oid dbid) */ while (CountDBBackends(dbid) > 0) { - CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); + CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE); /* * Wait awhile for them to die so that we avoid flooding an @@ -724,8 +724,7 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) while (VirtualTransactionIdIsValid(*backends)) { SignalVirtualTransaction(*backends, - PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, - false); + PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); backends++; } @@ -881,11 +880,11 @@ SendRecoveryConflictWithBufferPin(ProcSignalReason reason) /* * We send signal to all backends to ask them if they are holding the - * buffer pin which is delaying the Startup process. We must not set the - * conflict flag yet, since most backends will be innocent. Let the - * SIGUSR1 handling in each backend decide their own fate. + * buffer pin which is delaying the Startup process. Most of them will be + * innocent, but we let the SIGUSR1 handling in each backend decide their + * own fate. */ - CancelDBBackends(InvalidOid, reason, false); + CancelDBBackends(InvalidOid, reason); } /* diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 696bbb7b91..fdeed0f395 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -506,7 +506,6 @@ InitProcess(void) Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); } #endif - MyProc->recoveryConflictPending = false; /* Initialize fields for sync rep */ MyProc->waitLSN = InvalidXLogRecPtr; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b4a8d2f3a1..d01d7a0898 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -175,7 +175,6 @@ static void forbidden_in_wal_sender(char firstchar); static bool check_log_statement(List *stmt_list); static int errdetail_execute(List *raw_parsetree_list); static int errdetail_params(ParamListInfo params); -static int errdetail_abort(void); static void bind_param_error_callback(void *arg); static void start_xact_command(void); static void finish_xact_command(void); @@ -1141,8 +1140,7 @@ exec_simple_query(const char *query_string) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* Make sure we are in a transaction command */ start_xact_command(); @@ -1498,8 +1496,7 @@ exec_parse_message(const char *query_string, /* string to execute */ ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* * Create the CachedPlanSource before we do parse analysis, since it @@ -1750,8 +1747,7 @@ exec_bind_message(StringInfo input_message) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* * Create the portal. 
Allow silent replacement of an existing portal only @@ -2255,8 +2251,7 @@ exec_execute_message(const char *portal_name, long max_rows) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* Check for cancel signal before we start execution */ CHECK_FOR_INTERRUPTS(); @@ -2536,20 +2531,6 @@ errdetail_params(ParamListInfo params) return 0; } -/* - * errdetail_abort - * - * Add an errdetail() line showing abort reason, if any. - */ -static int -errdetail_abort(void) -{ - if (MyProc->recoveryConflictPending) - errdetail("Abort reason: recovery conflict"); - - return 0; -} - /* * errdetail_recovery_conflict * @@ -2692,8 +2673,7 @@ exec_describe_statement_message(const char *stmt_name) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); if (whereToSendOutput != DestRemote) return; /* can't actually do anything... */ @@ -2769,8 +2749,7 @@ exec_describe_portal_message(const char *portal_name) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); if (whereToSendOutput != DestRemote) return; /* can't actually do anything... */ @@ -3139,8 +3118,6 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) return; } - MyProc->recoveryConflictPending = true; - /* Intentional fall through to error handling */ /* FALLTHROUGH */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 039bc8353b..81f1960a63 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -235,13 +235,6 @@ struct PGPROC bool isRegularBackend; /* true if it's a regular backend. */ - /* - * While in hot standby mode, shows that a conflict signal has been sent - * for the current transaction. Set/cleared while holding ProcArrayLock, - * though not required. Accessed without lock, if needed. - */ - bool recoveryConflictPending; - /* * Info about LWLock the process is currently waiting for, if any. 
* diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index da7b5e78d3..3a8593f87b 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -77,14 +77,12 @@ extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, bool allDbs, int excludeVacuum, int *nvxids); extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid); -extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); -extern pid_t SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, - bool conflictPending); +extern pid_t SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); extern bool MinimumActiveBackends(int min); extern int CountDBBackends(Oid databaseid); extern int CountDBConnections(Oid databaseid); -extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending); +extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode); extern int CountUserBackends(Oid roleid); extern bool CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared); From 57bff90160fdee56a0d55d7eaa7ec5ad709fda08 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 3 Feb 2026 15:08:16 +0200 Subject: [PATCH 027/147] Don't hint that you can reconnect when the database is dropped Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/4cc13ba1-4248-4884-b6ba-4805349e7f39@iki.fi --- src/backend/tcop/postgres.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index d01d7a0898..02e9aaa6bc 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3209,27 +3209,29 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) } } - /* Intentional fall through to session cancel */ - /* FALLTHROUGH */ - - case PROCSIG_RECOVERY_CONFLICT_DATABASE: - /* - * Retrying is not possible because the database is dropped, or we - * decided above that we couldn't resolve the conflict with an - * ERROR and fell through. Terminate the session. + * We couldn't resolve the conflict with ERROR, so terminate the + * whole session. */ pgstat_report_recovery_conflict(reason); ereport(FATAL, - (errcode(reason == PROCSIG_RECOVERY_CONFLICT_DATABASE ? - ERRCODE_DATABASE_DROPPED : - ERRCODE_T_R_SERIALIZATION_FAILURE), + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("terminating connection due to conflict with recovery"), errdetail_recovery_conflict(reason), errhint("In a moment you should be able to reconnect to the" " database and repeat your command."))); break; + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* The database is being dropped; terminate the session */ + pgstat_report_recovery_conflict(reason); + ereport(FATAL, + (errcode(ERRCODE_DATABASE_DROPPED), + errmsg("terminating connection due to conflict with recovery"), + errdetail_recovery_conflict(reason))); + break; + default: elog(FATAL, "unrecognized conflict mode: %d", (int) reason); } From 78bf28e3bf504db0eea5e3bcb3c43e9908108480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Tue, 3 Feb 2026 19:29:15 +0100 Subject: [PATCH 028/147] Docs: consolidate dependency notes in pg_dump and pg_restore The pg_dump documentation had repetitive notes for the --schema, --table, and --extension switches, noting that dependent database objects are not automatically included in the dump. 
This commit removes these notes and replaces them with a consolidated
paragraph in the "Notes" section. pg_restore had a similar note for -t
but lacked one for -n; do likewise. Also, add a note to --extension in
pg_dump stating that ancillary files (such as shared libraries and
control files) are not included in the dump and must be present on the
destination system.

Author: Florents Tselai
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/284C4D55-4F90-4AA0-84C8-1E6A28DDF271@gmail.com
---
 doc/src/sgml/ref/pg_dump.sgml | 41 ++++++++++++--------------------
 doc/src/sgml/ref/pg_restore.sgml | 20 ++++++++--------
 2 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 688e23c0e9..7f538e9019 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -263,12 +263,10 @@ PostgreSQL documentation

-        When is specified,
-        pg_dump makes no attempt to dump any other
-        database objects that the selected extension(s) might depend upon.
-        Therefore, there is no guarantee that the results of a
-        specific-extension dump can be successfully restored by themselves
-        into a clean database.
+        pg_dump does not dump the extension's
+        underlying installation files (such as shared libraries or control
+        files). These must be available on the destination system for the
+        restore to succeed.

@@ -445,16 +443,6 @@ PostgreSQL documentation
         below.

-
-
-        When is specified, pg_dump
-        makes no attempt to dump any other database objects that the selected
-        schema(s) might depend upon. Therefore, there is no guarantee
-        that the results of a specific-schema dump can be successfully
-        restored by themselves into a clean database.
-
-

         Non-schema objects such as large objects are not dumped when is

@@ -596,16 +584,6 @@ PostgreSQL documentation
         be dumped.

-
-
-        When is specified, pg_dump
-        makes no attempt to dump any other database objects that the selected
-        table(s) might depend upon. Therefore, there is no guarantee
-        that the results of a specific-table dump can be successfully
-        restored by themselves into a clean database.
-
-

@@ -1689,6 +1667,17 @@ CREATE DATABASE foo WITH TEMPLATE template0;

+
+   When options , or
+   are specified, pg_dump makes no attempt to dump
+   any other database objects that the selected object(s) might depend upon.
+   Therefore, there is no guarantee that the results of a dump so generated
+   can be successfully restored by themselves into a clean database.
+   For example, if a table whose definition includes a foreign key is
+   specified to be dumped, the table referenced by the foreign key is
+   not automatically dumped.
+
+

    When a dump without schema is chosen and the option
    is used, pg_dump emits commands

diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml
index 2c295bbf8d..420a308a7c 100644
--- a/doc/src/sgml/ref/pg_restore.sgml
+++ b/doc/src/sgml/ref/pg_restore.sgml
@@ -452,16 +452,6 @@ PostgreSQL documentation
         specify table(s) in a particular schema.

-
-
-        When is specified, pg_restore
-        makes no attempt to restore any other database objects that the
-        selected table(s) might depend upon. Therefore, there is no
-        guarantee that a specific-table restore into a clean database will
-        succeed.
-
-

         This flag does not behave identically to the

@@ -1089,6 +1079,16 @@ PostgreSQL documentation
   Notes

+
+   When options or are specified,
+   pg_restore makes no attempt to restore
+   any other database objects that the selected table(s) or schema(s)
+   might depend upon.
Therefore, there is no guarantee that a specific-table + restore into a clean database will succeed. For example, if a table + whose definition includes a foreign key is specified to be restored, the + table referenced by the foreign key is not automatically restored. + + If your installation has any local additions to the template1 database, be careful to load the output of From c8ec74713bf2c703c19f231ea4d1e6479630c72d Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 4 Feb 2026 16:38:06 +0900 Subject: [PATCH 029/147] pg_resetwal: Fix incorrect error message related to pg_wal/summaries/ A failure while closing pg_wal/summaries/ incorrectly generated a report about pg_wal/archive_status/. While at it, this commit adds #undefs for the macros used in KillExistingWALSummaries() and KillExistingArchiveStatus() to prevent those values from being misused in an incorrect function context. Oversight in dc212340058b. Author: Tianchen Zhang Reviewed-by: Chao Li Reviewed-by: Kyotaro Horiguchi Discussion: https://postgr.es/m/SE2P216MB2390C84C23F428A7864EE07FA19BA@SE2P216MB2390.KORP216.PROD.OUTLOOK.COM Backpatch-through: 17 --- src/bin/pg_resetwal/pg_resetwal.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 431b83a67d..85dc43d4cd 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -1077,6 +1077,8 @@ KillExistingArchiveStatus(void) if (closedir(xldir)) pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR); + +#undef ARCHSTATDIR } /* @@ -1111,7 +1113,10 @@ KillExistingWALSummaries(void) pg_fatal("could not read directory \"%s\": %m", WALSUMMARYDIR); if (closedir(xldir)) - pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR); + pg_fatal("could not close directory \"%s\": %m", WALSUMMARYDIR); + +#undef WALSUMMARY_NHEXCHARS +#undef WALSUMMARYDIR } /* From 4cfce4e62c8f09f5b1f6a7f69760ca46a74406e2 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 4 Feb 2026 08:39:55 +0100 Subject: [PATCH 030/147] Change copyObject() to use typeof_unqual Currently, when the argument of copyObject() is const-qualified, the return type is also, because the use of typeof carries over all the qualifiers. This is incorrect, since the point of copyObject() is to make a copy to mutate. But apparently no code ran into it. The new implementation uses typeof_unqual, which drops the qualifiers, making this work correctly. typeof_unqual is standardized in C23, but all recent versions of all the usual compilers support it even in non-C23 mode, at least as __typeof_unqual__. We add a configure/meson test for typeof_unqual and __typeof_unqual__ and use it if it's available, else we use the existing fallback of just returning void *. 
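To make the qualifier issue concrete, here is a minimal standalone
sketch, not PostgreSQL code: copy_impl, copy_qual, and copy_unqual are
illustrative stand-ins for copyObjectImpl() and the macro, and it
assumes a compiler that accepts the C23 spellings typeof and
typeof_unqual (e.g. gcc -std=c23).

    #include <stdlib.h>
    #include <string.h>

    /* stand-in for copyObjectImpl(): returns a freshly allocated copy */
    static void *
    copy_impl(const void *from, size_t size)
    {
        void *copy = malloc(size);

        memcpy(copy, from, size);
        return copy;
    }

    /* typeof keeps qualifiers: a const argument yields a const result */
    #define copy_qual(obj) ((typeof(obj)) copy_impl((obj), sizeof(*(obj))))
    /* typeof_unqual drops them, so the copy is mutable, as intended */
    #define copy_unqual(obj) ((typeof_unqual(*(obj)) *) copy_impl((obj), sizeof(*(obj))))

    int
    main(void)
    {
        const int  val = 42;
        const int *from = &val;
        int       *to;

        /* to = copy_qual(from);  -- draws a "discards const" warning */
        to = copy_unqual(from);   /* result type is plain "int *" */
        *to = 43;                 /* mutating the copy is clearly legal */
        free(to);
        return 0;
    }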
Reviewed-by: David Geier Discussion: https://www.postgresql.org/message-id/flat/92f9750f-c7f6-42d8-9a4a-85a3cbe808f3%40eisentraut.org --- config/c-compiler.m4 | 25 +++++++++++++++++++++++ configure | 42 ++++++++++++++++++++++++++++++++++++++ configure.ac | 1 + meson.build | 24 ++++++++++++++++++++++ src/include/nodes/nodes.h | 4 ++-- src/include/pg_config.h.in | 7 +++++++ 6 files changed, 101 insertions(+), 2 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 1509dbfa2a..7179a73bd2 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -192,6 +192,31 @@ if test "$pgac_cv_c_typeof" != no; then fi])# PGAC_C_TYPEOF +# PGAC_C_TYPEOF_UNQUAL +# -------------------- +# Check if the C compiler understands typeof_unqual or a variant. Define +# HAVE_TYPEOF_UNQUAL if so, and define 'typeof_unqual' to the actual key word. +# +AC_DEFUN([PGAC_C_TYPEOF_UNQUAL], +[AC_CACHE_CHECK(for typeof_unqual, pgac_cv_c_typeof_unqual, +[pgac_cv_c_typeof_unqual=no +for pgac_kw in typeof_unqual __typeof_unqual__; do + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], +[int x = 0; +$pgac_kw(x) y; +y = x; +return y;])], +[pgac_cv_c_typeof_unqual=$pgac_kw]) + test "$pgac_cv_c_typeof_unqual" != no && break +done]) +if test "$pgac_cv_c_typeof_unqual" != no; then + AC_DEFINE(HAVE_TYPEOF_UNQUAL, 1, + [Define to 1 if your compiler understands `typeof_unqual' or something similar.]) + if test "$pgac_cv_c_typeof_unqual" != typeof_unqual; then + AC_DEFINE_UNQUOTED(typeof_unqual, $pgac_cv_c_typeof_unqual, [Define to how the compiler spells `typeof_unqual'.]) + fi +fi])# PGAC_C_TYPEOF_UNQUAL + # PGAC_C_TYPES_COMPATIBLE # ----------------------- diff --git a/configure b/configure index a10a2c85c6..ba29393187 100755 --- a/configure +++ b/configure @@ -15010,6 +15010,48 @@ _ACEOF fi fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for typeof_unqual" >&5 +$as_echo_n "checking for typeof_unqual... " >&6; } +if ${pgac_cv_c_typeof_unqual+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_cv_c_typeof_unqual=no +for pgac_kw in typeof_unqual __typeof_unqual__; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +int x = 0; +$pgac_kw(x) y; +y = x; +return y; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_c_typeof_unqual=$pgac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$pgac_cv_c_typeof_unqual" != no && break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_c_typeof_unqual" >&5 +$as_echo "$pgac_cv_c_typeof_unqual" >&6; } +if test "$pgac_cv_c_typeof_unqual" != no; then + +$as_echo "#define HAVE_TYPEOF_UNQUAL 1" >>confdefs.h + + if test "$pgac_cv_c_typeof_unqual" != typeof_unqual; then + +cat >>confdefs.h <<_ACEOF +#define typeof_unqual $pgac_cv_c_typeof_unqual +_ACEOF + + fi +fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_types_compatible_p" >&5 $as_echo_n "checking for __builtin_types_compatible_p... 
" >&6; } if ${pgac_cv__types_compatible+:} false; then : diff --git a/configure.ac b/configure.ac index 814e64a967..412fe358a2 100644 --- a/configure.ac +++ b/configure.ac @@ -1717,6 +1717,7 @@ PGAC_PRINTF_ARCHETYPE PGAC_CXX_PRINTF_ARCHETYPE PGAC_C_STATEMENT_EXPRESSIONS PGAC_C_TYPEOF +PGAC_C_TYPEOF_UNQUAL PGAC_C_TYPES_COMPATIBLE PGAC_C_BUILTIN_CONSTANT_P PGAC_C_BUILTIN_OP_OVERFLOW diff --git a/meson.build b/meson.build index df907b62da..0722b16927 100644 --- a/meson.build +++ b/meson.build @@ -2880,6 +2880,30 @@ int main(void) endif endforeach +# Check if the C compiler understands typeof_unqual or a variant. Define +# HAVE_TYPEOF_UNQUAL if so, and define 'typeof_unqual' to the actual key word. +foreach kw : ['typeof_unqual', '__typeof_unqual__'] + if cc.compiles(''' +int main(void) +{ + int x = 0; + @0@(x) y; + y = x; + return y; +} +'''.format(kw), + name: kw, + args: test_c_args, include_directories: postgres_inc) + + cdata.set('HAVE_TYPEOF_UNQUAL', 1) + if kw != 'typeof_unqual' + cdata.set('typeof_unqual', kw) + endif + + break + endif +endforeach + # MSVC doesn't cope well with defining restrict to __restrict, the # spelling it understands, because it conflicts with diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index b6ad28618a..ba6dd7f389 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -226,8 +226,8 @@ extern int16 *readAttrNumberCols(int numCols); extern void *copyObjectImpl(const void *from); /* cast result back to argument type, if supported by compiler */ -#ifdef HAVE_TYPEOF -#define copyObject(obj) ((typeof(obj)) copyObjectImpl(obj)) +#ifdef HAVE_TYPEOF_UNQUAL +#define copyObject(obj) ((typeof_unqual(*(obj)) *) copyObjectImpl(obj)) #else #define copyObject(obj) copyObjectImpl(obj) #endif diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 339268dc8e..c089f2252c 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -454,6 +454,10 @@ /* Define to 1 if your compiler understands `typeof' or something similar. */ #undef HAVE_TYPEOF +/* Define to 1 if your compiler understands `typeof_unqual' or something + similar. */ +#undef HAVE_TYPEOF_UNQUAL + /* Define to 1 if you have the header file. */ #undef HAVE_UCHAR_H @@ -806,3 +810,6 @@ /* Define to how the compiler spells `typeof'. */ #undef typeof + +/* Define to how the compiler spells `typeof_unqual'. */ +#undef typeof_unqual From 084e42bc7109673e46527b0a0f284edf539c3285 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 4 Feb 2026 13:06:04 +0200 Subject: [PATCH 031/147] Add backendType to PGPROC, replacing isRegularBackend We can immediately make use of it in pg_signal_backend(), which previously fetched the process type from the backend status array with pgstat_get_backend_type_by_proc_number(). That was correct but felt a little questionable to me: backend status should be for observability purposes only, not for permission checks. 
Reviewed-by: Nathan Bossart Reviewed-by: Bertrand Drouvot Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/b77e4962-a64a-43db-81a1-580444b3e8f5@iki.fi --- src/backend/access/transam/twophase.c | 2 +- src/backend/storage/ipc/procarray.c | 4 ++-- src/backend/storage/ipc/signalfuncs.c | 5 +---- src/backend/storage/lmgr/proc.c | 4 ++-- src/backend/utils/activity/backend_status.c | 25 --------------------- src/include/storage/proc.h | 5 +++-- src/include/utils/backend_status.h | 1 - 7 files changed, 9 insertions(+), 37 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 601ce3faa6..eabc4d4820 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -470,7 +470,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, proc->databaseId = databaseid; proc->roleId = owner; proc->tempNamespaceId = InvalidOid; - proc->isRegularBackend = false; + proc->backendType = B_INVALID; proc->lwWaiting = LW_WS_NOT_WAITING; proc->lwWaitMode = 0; proc->waitLock = NULL; diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 748c06b51c..301f54fb5a 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -3589,7 +3589,7 @@ CountDBConnections(Oid databaseid) if (proc->pid == 0) continue; /* do not count prepared xacts */ - if (!proc->isRegularBackend) + if (proc->backendType != B_BACKEND) continue; /* count only regular backend processes */ if (!OidIsValid(databaseid) || proc->databaseId == databaseid) @@ -3660,7 +3660,7 @@ CountUserBackends(Oid roleid) if (proc->pid == 0) continue; /* do not count prepared xacts */ - if (!proc->isRegularBackend) + if (proc->backendType != B_BACKEND) continue; /* count only regular backend processes */ if (proc->roleId == roleid) count++; diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c index 6f7759cd72..d48b4fe379 100644 --- a/src/backend/storage/ipc/signalfuncs.c +++ b/src/backend/storage/ipc/signalfuncs.c @@ -87,10 +87,7 @@ pg_signal_backend(int pid, int sig) */ if (!OidIsValid(proc->roleId) || superuser_arg(proc->roleId)) { - ProcNumber procNumber = GetNumberFromPGProc(proc); - BackendType backendType = pgstat_get_backend_type_by_proc_number(procNumber); - - if (backendType == B_AUTOVAC_WORKER) + if (proc->backendType == B_AUTOVAC_WORKER) { if (!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_AUTOVACUUM_WORKER)) return SIGNAL_BACKEND_NOAUTOVAC; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index fdeed0f395..c7a001b3b7 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -486,7 +486,7 @@ InitProcess(void) MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; - MyProc->isRegularBackend = AmRegularBackendProcess(); + MyProc->backendType = MyBackendType; MyProc->delayChkptFlags = 0; MyProc->statusFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ @@ -684,7 +684,7 @@ InitAuxiliaryProcess(void) MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; - MyProc->isRegularBackend = false; + MyProc->backendType = MyBackendType; MyProc->delayChkptFlags = 0; MyProc->statusFlags = 0; MyProc->lwWaiting = LW_WS_NOT_WAITING; diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index c84e653658..cd08712946 100644 --- 
a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -1164,31 +1164,6 @@ pgstat_get_my_plan_id(void) return MyBEEntry->st_plan_id; } -/* ---------- - * pgstat_get_backend_type_by_proc_number() - - * - * Return the type of the backend with the specified ProcNumber. This looks - * directly at the BackendStatusArray, so the return value may be out of date. - * The only current use of this function is in pg_signal_backend(), which is - * inherently racy, so we don't worry too much about this. - * - * It is the caller's responsibility to use this wisely; at minimum, callers - * should ensure that procNumber is valid and perform the required permissions - * checks. - * ---------- - */ -BackendType -pgstat_get_backend_type_by_proc_number(ProcNumber procNumber) -{ - volatile PgBackendStatus *status = &BackendStatusArray[procNumber]; - - /* - * We bypass the changecount mechanism since fetching and storing an int - * is almost certainly atomic. - */ - return status->st_backendType; -} - /* ---------- * cmp_lbestatus * diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 81f1960a63..679f0624f9 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -17,6 +17,7 @@ #include "access/clog.h" #include "access/xlogdefs.h" #include "lib/ilist.h" +#include "miscadmin.h" #include "storage/latch.h" #include "storage/lock.h" #include "storage/pg_sema.h" @@ -166,7 +167,7 @@ typedef enum * but its myProcLocks[] lists are valid. * * We allow many fields of this struct to be accessed without locks, such as - * delayChkptFlags and isRegularBackend. However, keep in mind that writing + * delayChkptFlags and backendType. However, keep in mind that writing * mirrored ones (see below) requires holding ProcArrayLock or XidGenLock in * at least shared mode, so that pgxactoff does not change concurrently. * @@ -233,7 +234,7 @@ struct PGPROC Oid tempNamespaceId; /* OID of temp schema this backend is * using */ - bool isRegularBackend; /* true if it's a regular backend. */ + BackendType backendType; /* what kind of process is this? */ /* * Info about LWLock the process is currently waiting for, if any. diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h index 781e48c0c1..ddd06304e9 100644 --- a/src/include/utils/backend_status.h +++ b/src/include/utils/backend_status.h @@ -331,7 +331,6 @@ extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen); extern int64 pgstat_get_my_query_id(void); extern int64 pgstat_get_my_plan_id(void); -extern BackendType pgstat_get_backend_type_by_proc_number(ProcNumber procNumber); /* ---------- From 176dffdf7d2a0ea2615c4e390a2ab7e69d14f90f Mon Sep 17 00:00:00 2001 From: John Naylor Date: Wed, 4 Feb 2026 17:55:49 +0700 Subject: [PATCH 032/147] Fix various instances of undefined behavior Mostly this involves checking for NULL pointer before doing operations that add a non-zero offset. The exception is an overflow warning in heap_fetch_toast_slice(). This was caused by unneeded parentheses forcing an expression to be evaluated to a negative integer, which then got cast to size_t. Per clang 21 undefined behavior sanitizer. Backpatch to all supported versions. 
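For illustration, a minimal standalone sketch of the main pattern being
fixed, modeled loosely on the pg_trgm hunk below; fetch_cached and its
arguments are invented for the example.

    #include <stddef.h>

    /*
     * Computing "base + offset" is undefined behavior in C when base is
     * a null pointer and the offset is non-zero, even if the result is
     * never dereferenced.  UBSan flags the arithmetic itself.
     */
    static void *
    fetch_cached(char *cache, size_t siglen)
    {
        void *cached_val = NULL;

        /* cached_val = cache + siglen;   -- UB whenever cache == NULL */
        if (cache != NULL)
            cached_val = cache + siglen;  /* offset a known-valid base only */

        return cached_val;
    }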
Co-authored-by: Alexander Lakhin Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/777bd201-6e3a-4da0-a922-4ea9de46a3ee@gmail.com Backpatch-through: 14 --- contrib/pg_trgm/trgm_gist.c | 5 ++++- src/backend/access/heap/heaptoast.c | 2 +- src/backend/utils/adt/multirangetypes.c | 5 +++-- src/backend/utils/sort/sharedtuplestore.c | 3 ++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 2f0d61985a..685275a0f9 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -701,10 +701,13 @@ gtrgm_penalty(PG_FUNCTION_ARGS) if (ISARRKEY(newval)) { char *cache = (char *) fcinfo->flinfo->fn_extra; - TRGM *cachedVal = (TRGM *) (cache + MAXALIGN(siglen)); + TRGM *cachedVal = NULL; Size newvalsize = VARSIZE(newval); BITVECP sign; + if (cache != NULL) + cachedVal = (TRGM *) (cache + MAXALIGN(siglen)); + /* * Cache the sign data across multiple calls with the same newval. */ diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c index e28fe47a44..6ddf6c6cf9 100644 --- a/src/backend/access/heap/heaptoast.c +++ b/src/backend/access/heap/heaptoast.c @@ -768,7 +768,7 @@ heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; memcpy(VARDATA(result) + - (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, + curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset + chcpystrt, chunkdata + chcpystrt, (chcpyend - chcpystrt) + 1); diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c index b1942387dc..9548989d78 100644 --- a/src/backend/utils/adt/multirangetypes.c +++ b/src/backend/utils/adt/multirangetypes.c @@ -485,8 +485,9 @@ multirange_canonicalize(TypeCacheEntry *rangetyp, int32 input_range_count, int32 output_range_count = 0; /* Sort the ranges so we can find the ones that overlap/meet. */ - qsort_arg(ranges, input_range_count, sizeof(RangeType *), range_compare, - rangetyp); + if (ranges != NULL) + qsort_arg(ranges, input_range_count, sizeof(RangeType *), + range_compare, rangetyp); /* Now merge where possible: */ for (i = 0; i < input_range_count; i++) diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c index 8f35a25526..04189f708f 100644 --- a/src/backend/utils/sort/sharedtuplestore.c +++ b/src/backend/utils/sort/sharedtuplestore.c @@ -323,7 +323,8 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data, /* Do we have space? */ size = accessor->sts->meta_data_size + tuple->t_len; - if (accessor->write_pointer + size > accessor->write_end) + if (accessor->write_pointer == NULL || + accessor->write_pointer + size > accessor->write_end) { if (accessor->write_chunk == NULL) { From 36ead7123292c2849be9950f3e552325fad7e6b7 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Thu, 5 Feb 2026 00:43:06 +0900 Subject: [PATCH 033/147] Fix logical replication TAP test to read publisher log correctly. Commit 5f13999aa11 added a TAP test for GUC settings passed via the CONNECTION string in logical replication, but the buildfarm member sungazer reported test failures. The test incorrectly used the subscriber's log file position as the starting offset when reading the publisher's log. As a result, the test failed to find the expected log message in the publisher's log and erroneously reported a failure. This commit fixes the test to use the publisher's own log file position when reading the publisher's log. 
Also, to avoid similar confusion in the future, this commit splits the single $log_location variable into $log_location_pub and $log_location_sub, clearly distinguishing publisher and subscriber log positions. Backpatched to v15, where commit 5f13999aa11 introduced the test. Per buildfarm member sungazer. This issue was reported and diagnosed by Alexander Lakhin. Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/966ec3d8-1b6f-4f57-ae59-fc7d55bc9a5a@gmail.com Backpatch-through: 15 --- src/test/subscription/t/001_rep_changes.pl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index d7e62e4d48..7d41715ed8 100644 --- a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -353,7 +353,8 @@ # Note that the current location of the log file is not grabbed immediately # after reloading the configuration, but after sending one SQL command to # the node so as we are sure that the reloading has taken effect. -my $log_location = -s $node_subscriber->logfile; +my $log_location_pub = -s $node_publisher->logfile; +my $log_location_sub = -s $node_subscriber->logfile; $node_publisher->safe_psql('postgres', "UPDATE tab_full_pk SET b = 'quux' WHERE a = 1"); @@ -363,7 +364,7 @@ $node_publisher->wait_for_catchup('tap_sub'); -my $logfile = slurp_file($node_subscriber->logfile, $log_location); +my $logfile = slurp_file($node_subscriber->logfile, $log_location_sub); like( $logfile, qr/conflict detected on relation "public.tab_full_pk": conflict=update_missing.*\n.*DETAIL:.* Could not find the row to be updated: remote row \(1, quux\), replica identity \(a\)=\(1\)/m, @@ -445,11 +446,12 @@ # # First, confirm that no such QUERY STATISTICS message appears before enabling # log_statement_stats. -$logfile = slurp_file($node_publisher->logfile, $log_location); +$logfile = slurp_file($node_publisher->logfile, $log_location_pub); unlike( $logfile, qr/QUERY STATISTICS/, 'log_statement_stats has not been enabled yet'); +$log_location_pub = -s $node_publisher->logfile; # check that change of connection string and/or publication list causes # restart of subscription workers. We check the state along with @@ -476,7 +478,7 @@ # Check that the expected QUERY STATISTICS message appears, # which shows that log_statement_stats=on from the CONNECTION string # was correctly passed through to and honored by the walsender. -$logfile = slurp_file($node_publisher->logfile, $log_location); +$logfile = slurp_file($node_publisher->logfile, $log_location_pub); like( $logfile, qr/QUERY STATISTICS/, @@ -538,13 +540,13 @@ # Note that the current location of the log file is not grabbed immediately # after reloading the configuration, but after sending one SQL command to # the node so that we are sure that the reloading has taken effect. 
-$log_location = -s $node_publisher->logfile; +$log_location_pub = -s $node_publisher->logfile; $node_publisher->safe_psql('postgres', "INSERT INTO tab_notrep VALUES (11)"); $node_publisher->wait_for_catchup('tap_sub'); -$logfile = slurp_file($node_publisher->logfile, $log_location); +$logfile = slurp_file($node_publisher->logfile, $log_location_pub); like( $logfile, qr/skipped replication of an empty transaction with XID/, From 0c8e082fba8d36434552d3d7800abda54acafd57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Wed, 4 Feb 2026 16:56:57 +0100 Subject: [PATCH 034/147] Assign "backend" type earlier during process start-up Instead of assigning the backend type in the Main function of each postmaster child, do it right after fork(), by which time it is already known by postmaster_child_launch(). This reduces the time frame during which MyBackendType is incorrect. Before this commit, ProcessStartupPacket would overwrite MyBackendType to B_BACKEND for dead-end backends, which is quite dubious. Stop that. We may now see MyBackendType == B_BG_WORKER before setting up MyBgworkerEntry. As far as I can see this is only a problem if we try to log a message and %b is in log_line_prefix, so we now have a constant string to cover that case. Previously, it would print "unrecognized", which seems strictly worse. Author: Euler Taveira Discussion: https://postgr.es/m/e85c6671-1600-4112-8887-f97a8a5d07b2@app.fastmail.com --- src/backend/postmaster/autovacuum.c | 2 -- src/backend/postmaster/bgworker.c | 1 - src/backend/postmaster/bgwriter.c | 1 - src/backend/postmaster/checkpointer.c | 1 - src/backend/postmaster/launch_backend.c | 3 +++ src/backend/postmaster/pgarch.c | 1 - src/backend/postmaster/startup.c | 1 - src/backend/postmaster/syslogger.c | 1 - src/backend/postmaster/walsummarizer.c | 1 - src/backend/postmaster/walwriter.c | 1 - src/backend/replication/logical/slotsync.c | 2 -- src/backend/replication/walreceiver.c | 1 - src/backend/storage/aio/method_worker.c | 1 - src/backend/tcop/backend_startup.c | 3 +-- src/backend/utils/error/elog.c | 7 ++++++- 15 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 22379de1e3..6fde740465 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -385,7 +385,6 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len) PostmasterContext = NULL; } - MyBackendType = B_AUTOVAC_LAUNCHER; init_ps_display(NULL); ereport(DEBUG1, @@ -1398,7 +1397,6 @@ AutoVacWorkerMain(const void *startup_data, size_t startup_data_len) PostmasterContext = NULL; } - MyBackendType = B_AUTOVAC_WORKER; init_ps_display(NULL); Assert(GetProcessingMode() == InitProcessing); diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 5187448175..261ccd3f59 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -759,7 +759,6 @@ BackgroundWorkerMain(const void *startup_data, size_t startup_data_len) } MyBgworkerEntry = worker; - MyBackendType = B_BG_WORKER; init_ps_display(worker->bgw_name); Assert(GetProcessingMode() == InitProcessing); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 80e3088fc7..0956bd39a8 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -94,7 +94,6 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = 
B_BG_WRITER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 6482c21b8f..e03c19123b 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -199,7 +199,6 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_CHECKPOINTER; AuxiliaryProcessMainCommon(); CheckpointerShmem->checkpointer_pid = MyProcPid; diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 45690b11c9..926fd6f270 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -223,6 +223,8 @@ postmaster_child_launch(BackendType child_type, int child_slot, pid = fork_process(); if (pid == 0) /* child */ { + MyBackendType = child_type; + /* Capture and transfer timings that may be needed for logging */ if (IsExternalConnectionBackend(child_type)) { @@ -607,6 +609,7 @@ SubPostmasterMain(int argc, char *argv[]) child_type = (BackendType) atoi(child_kind); if (child_type <= B_INVALID || child_type > BACKEND_NUM_TYPES - 1) elog(ERROR, "unknown child kind %s", child_kind); + MyBackendType = child_type; /* Read in the variables file */ read_backend_variables(argv[2], &startup_data, &startup_data_len); diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 1a20387c4b..82731e452f 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -222,7 +222,6 @@ PgArchiverMain(const void *startup_data, size_t startup_data_len) { Assert(startup_data_len == 0); - MyBackendType = B_ARCHIVER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index a1a4f65f9a..cdbe53dd26 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -217,7 +217,6 @@ StartupProcessMain(const void *startup_data, size_t startup_data_len) { Assert(startup_data_len == 0); - MyBackendType = B_STARTUP; AuxiliaryProcessMainCommon(); /* Arrange to clean up at startup process exit */ diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c index 1c443b3d12..86c5e376b4 100644 --- a/src/backend/postmaster/syslogger.c +++ b/src/backend/postmaster/syslogger.c @@ -206,7 +206,6 @@ SysLoggerMain(const void *startup_data, size_t startup_data_len) now = MyStartTime; - MyBackendType = B_LOGGER; init_ps_display(NULL); /* diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index c3d56c866d..2d8f57099f 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -234,7 +234,6 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_WAL_SUMMARIZER; AuxiliaryProcessMainCommon(); ereport(DEBUG1, diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 38ec8a4c8c..23e79a3234 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -94,7 +94,6 @@ WalWriterMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_WAL_WRITER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 1c343d03d2..af5682ce50 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ 
-1541,8 +1541,6 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_SLOTSYNC_WORKER; - init_ps_display(NULL); Assert(GetProcessingMode() == InitProcessing); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 8b99160ed0..10e64a7d1f 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -169,7 +169,6 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_WAL_RECEIVER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index d7c144cd8f..d9617c20e7 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -390,7 +390,6 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) volatile int error_errno = 0; char cmd[128]; - MyBackendType = B_IO_WORKER; AuxiliaryProcessMainCommon(); pqsignal(SIGHUP, SignalHandlerForConfigReload); diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index 94a7b83956..c517115927 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -846,10 +846,9 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) if (strlen(port->user_name) >= NAMEDATALEN) port->user_name[NAMEDATALEN - 1] = '\0'; + Assert(MyBackendType == B_BACKEND || MyBackendType == B_DEAD_END_BACKEND); if (am_walsender) MyBackendType = B_WAL_SENDER; - else - MyBackendType = B_BACKEND; /* * Normal walsender backends, e.g. for streaming replication, are not diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index aa530d3685..e6a4ef9905 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -2779,7 +2779,12 @@ get_backend_type_for_log(void) if (MyProcPid == PostmasterPid) backend_type_str = "postmaster"; else if (MyBackendType == B_BG_WORKER) - backend_type_str = MyBgworkerEntry->bgw_type; + { + if (MyBgworkerEntry) + backend_type_str = MyBgworkerEntry->bgw_type; + else + backend_type_str = "early bgworker"; + } else backend_type_str = GetBackendTypeDesc(MyBackendType); From 3c5ec35dea254892d75d829b5642fc3732c8fcf9 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 5 Feb 2026 09:02:12 +0900 Subject: [PATCH 035/147] oid2name: Add relation path to the information provided by -x/--extended This affects two command patterns, showing information about relations: * oid2name -x -d DBNAME, applying to all relations on a database. * oid2name -x -d DBNAME -t TABNAME [-t ..], applying to a subset of defined relations on a database. The relative path of a relation is added to the information provided, using pg_relation_filepath(). 
Author: David Bidoc Reviewed-by: Laurenz Albe Reviewed-by: Guillaume Lelarge Reviewed-by: Euler Taveira Reviewed-by: Mark Wong Discussion: https://postgr.es/m/CABour1v2CU1wjjoM86wAFyezJQ3-+ncH43zY1f1uXeVojVN8Ow@mail.gmail.com --- contrib/oid2name/oid2name.c | 4 ++-- doc/src/sgml/oid2name.sgml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c index 5180290713..63e6ce2dae 100644 --- a/contrib/oid2name/oid2name.c +++ b/contrib/oid2name/oid2name.c @@ -469,7 +469,7 @@ void sql_exec_dumpalltables(PGconn *conn, struct options *opts) { char todo[1024]; - char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" "; + char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\", pg_relation_filepath(c.oid) as \"Path\" "; snprintf(todo, sizeof(todo), "SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s " @@ -507,7 +507,7 @@ sql_exec_searchtables(PGconn *conn, struct options *opts) *comma_filenumbers, *comma_tables; bool written = false; - char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" "; + char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\", pg_relation_filepath(c.oid) as \"Path\" "; /* get tables qualifiers, whether names, filenumbers, or OIDs */ comma_oids = get_comma_elts(opts->oids); diff --git a/doc/src/sgml/oid2name.sgml b/doc/src/sgml/oid2name.sgml index 54cc9be2b8..9340d7376a 100644 --- a/doc/src/sgml/oid2name.sgml +++ b/doc/src/sgml/oid2name.sgml @@ -118,7 +118,7 @@ display more information about each object shown: tablespace name, - schema name, and OID. + schema name, OID and path. @@ -299,10 +299,10 @@ From database "alvherre": $ # you can mix the options, and get more details with -x $ oid2name -d alvherre -t accounts -f 1155291 -x From database "alvherre": - Filenode Table Name Oid Schema Tablespace ------------------------------------------------------- - 155173 accounts 155173 public pg_default - 1155291 accounts_pkey 1155291 public pg_default + Filenode Table Name Oid Schema Tablespace Path +-------------------------------------------------------------------------- + 155173 accounts 155173 public pg_default base/17228/155173 + 1155291 accounts_pkey 1155291 public pg_default base/17228/1155291 $ # show disk space for every db object $ du [0-9]* | From 7a1f0f8747a7f7491702be88570a8e4d33686d76 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Wed, 4 Feb 2026 17:11:27 -0800 Subject: [PATCH 036/147] pg_upgrade: Optimize logical replication slot caught-up check. Commit 29d0a77fa6 improved pg_upgrade to allow migrating logical slots provided that all logical slots have caught up (i.e., they have no pending decodable WAL records). Previously, this verification was done by checking each slot individually, which could be time-consuming if there were many logical slots to migrate. This commit optimizes the check to avoid reading the same WAL stream multiple times. It performs the check only for the slot with the minimum confirmed_flush_lsn and applies the result to all other slots in the same database. This limits the check to at most one logical slot per database. During the check, we identify the last decodable WAL record's LSN to report any slots with unconsumed records, consistent with the existing error reporting behavior. 
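Reduced to a standalone C sketch, the per-slot decision that one scan
result enables looks like this; XLogRecPtr is simplified to a plain
integer here, and the rule mirrors the caught-up expression in the
query added below.

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t XLogRecPtr;    /* simplified stand-in for a WAL position */
    #define InvalidXLogRecPtr ((XLogRecPtr) 0)

    /*
     * One scan, started from the slot with the minimum
     * confirmed_flush_lsn, yields the LSN of the last decodable record
     * (or Invalid when none was found).  That single result classifies
     * every slot in the database: a slot has caught up iff its
     * confirmed_flush_lsn lies beyond that record.
     */
    static bool
    slot_caught_up(XLogRecPtr confirmed_flush_lsn, XLogRecPtr last_pending_wal)
    {
        return last_pending_wal == InvalidXLogRecPtr ||
               confirmed_flush_lsn > last_pending_wal;
    }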
Additionally, the maximum confirmed_flush_lsn among all logical slots
in the database is used as an early scan cutoff; finding a decodable
WAL record beyond this point implies that no slot has caught up.

Performance testing demonstrated that the execution time remains stable
regardless of the number of slots in the database.

Note that we do not distinguish slots based on their output plugins. A
hypothetical plugin might use a replication origin filter that filters
out changes from a specific origin. In such cases, we might get a false
positive (erroneously considering a slot caught up). However, this is
safe from a data integrity standpoint, such scenarios are rare, and the
impact of a false positive is minimal.

This optimization is applied only when the old cluster is version 19 or
later.

Bump catalog version.

Reviewed-by: Chao Li
Reviewed-by: shveta malik
Reviewed-by: Amit Kapila
Discussion: https://postgr.es/m/CAD21AoBZ0LAcw1OHGEKdW7S5TRJaURdhEk3CLAW69_siqfqyAg@mail.gmail.com
---
 src/backend/replication/logical/logical.c | 38 ++++++--
 src/backend/utils/adt/pg_upgrade_support.c | 14 ++-
 src/bin/pg_upgrade/check.c | 2 +-
 src/bin/pg_upgrade/info.c | 105 +++++++++++++++------
 src/bin/pg_upgrade/t/003_logical_slots.pl | 25 +++--
 src/include/catalog/catversion.h | 2 +-
 src/include/catalog/pg_proc.dat | 6 +-
 src/include/replication/logical.h | 3 +-
 8 files changed, 140 insertions(+), 55 deletions(-)

diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 85060d19a4..603a2b94d0 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -1986,16 +1986,22 @@ UpdateDecodingStats(LogicalDecodingContext *ctx)
 }

 /*
- * Read up to the end of WAL starting from the decoding slot's restart_lsn.
- * Return true if any meaningful/decodable WAL records are encountered,
- * otherwise false.
+ * Read WAL starting from the decoding slot's restart_lsn up to end_of_wal,
+ * in order to check whether any meaningful/decodable WAL records are
+ * encountered.  scan_cutoff_lsn is the LSN past which the scan can be
+ * terminated early once a decodable WAL record is found.
+ *
+ * Returns the last decodable WAL record's LSN if one is found, otherwise
+ * returns InvalidXLogRecPtr.
 */
-bool
-LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
+XLogRecPtr
+LogicalReplicationSlotCheckPendingWal(XLogRecPtr end_of_wal,
+                                      XLogRecPtr scan_cutoff_lsn)
 {
-    bool has_pending_wal = false;
+    XLogRecPtr last_pending_wal = InvalidXLogRecPtr;

     Assert(MyReplicationSlot);
+    Assert(end_of_wal >= scan_cutoff_lsn);

     PG_TRY();
     {
@@ -2023,8 +2029,7 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
         /* Invalidate non-timetravel entries */
         InvalidateSystemCaches();

-        /* Loop until the end of WAL or some changes are processed */
-        while (!has_pending_wal && ctx->reader->EndRecPtr < end_of_wal)
+        while (ctx->reader->EndRecPtr < end_of_wal)
         {
             XLogRecord *record;
             char *errm = NULL;
@@ -2037,7 +2042,20 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
             if (record != NULL)
                 LogicalDecodingProcessRecord(ctx, ctx->reader);

-            has_pending_wal = ctx->processing_required;
+            if (ctx->processing_required)
+            {
+                last_pending_wal = ctx->reader->ReadRecPtr;
+
+                /*
+                 * If we find a decodable WAL record after the
+                 * scan_cutoff_lsn point, we can terminate the scan early.
+ */ + if (last_pending_wal >= scan_cutoff_lsn) + break; + + /* Reset the flag and continue checking */ + ctx->processing_required = false; + } CHECK_FOR_INTERRUPTS(); } @@ -2055,7 +2073,7 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal) } PG_END_TRY(); - return has_pending_wal; + return last_pending_wal; } /* diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index 697143aec4..b505a6b4fe 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -282,11 +282,12 @@ binary_upgrade_set_missing_value(PG_FUNCTION_ARGS) * upgraded without data loss. */ Datum -binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS) +binary_upgrade_check_logical_slot_pending_wal(PG_FUNCTION_ARGS) { Name slot_name; XLogRecPtr end_of_wal; - bool found_pending_wal; + XLogRecPtr scan_cutoff_lsn; + XLogRecPtr last_pending_wal; CHECK_IS_BINARY_UPGRADE; @@ -297,6 +298,7 @@ binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS) Assert(has_rolreplication(GetUserId())); slot_name = PG_GETARG_NAME(0); + scan_cutoff_lsn = PG_GETARG_LSN(1); /* Acquire the given slot */ ReplicationSlotAcquire(NameStr(*slot_name), true, true); @@ -307,12 +309,16 @@ binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS) Assert(MyReplicationSlot->data.invalidated == RS_INVAL_NONE); end_of_wal = GetFlushRecPtr(NULL); - found_pending_wal = LogicalReplicationSlotHasPendingWal(end_of_wal); + last_pending_wal = LogicalReplicationSlotCheckPendingWal(end_of_wal, + scan_cutoff_lsn); /* Clean up */ ReplicationSlotRelease(); - PG_RETURN_BOOL(!found_pending_wal); + if (XLogRecPtrIsValid(last_pending_wal)) + PG_RETURN_LSN(last_pending_wal); + else + PG_RETURN_NULL(); } /* diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index a8d20a92a9..5c73773bf0 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -622,7 +622,7 @@ check_and_dump_old_cluster(void) { /* * Logical replication slots can be migrated since PG17. See comments - * atop get_old_cluster_logical_slot_infos(). + * in get_db_rel_and_slot_infos(). 
*/ check_old_cluster_for_valid_slots(); diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 47e8d1039a..ad4b1530e6 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -29,7 +29,7 @@ static void free_rel_infos(RelInfoArr *rel_arr); static void print_db_infos(DbInfoArr *db_arr); static void print_rel_infos(RelInfoArr *rel_arr); static void print_slot_infos(LogicalSlotInfoArr *slot_arr); -static char *get_old_cluster_logical_slot_infos_query(void); +static const char *get_old_cluster_logical_slot_infos_query(ClusterInfo *cluster); static void process_old_cluster_logical_slot_infos(DbInfo *dbinfo, PGresult *res, void *arg); @@ -281,7 +281,6 @@ get_db_rel_and_slot_infos(ClusterInfo *cluster) { UpgradeTask *task = upgrade_task_create(); char *rel_infos_query = NULL; - char *logical_slot_infos_query = NULL; if (cluster->dbarr.dbs != NULL) free_db_and_rel_infos(&cluster->dbarr); @@ -306,20 +305,15 @@ get_db_rel_and_slot_infos(ClusterInfo *cluster) */ if (cluster == &old_cluster && GET_MAJOR_VERSION(cluster->major_version) > 1600) - { - logical_slot_infos_query = get_old_cluster_logical_slot_infos_query(); upgrade_task_add_step(task, - logical_slot_infos_query, + get_old_cluster_logical_slot_infos_query(cluster), process_old_cluster_logical_slot_infos, true, NULL); - } upgrade_task_run(task, cluster); upgrade_task_free(task); pg_free(rel_infos_query); - if (logical_slot_infos_query) - pg_free(logical_slot_infos_query); if (cluster == &old_cluster) pg_log(PG_VERBOSE, "\nsource databases:"); @@ -681,17 +675,15 @@ process_rel_infos(DbInfo *dbinfo, PGresult *res, void *arg) * get_db_rel_and_slot_infos()'s UpgradeTask. The status of each logical slot * is checked in check_old_cluster_for_valid_slots(). */ -static char * -get_old_cluster_logical_slot_infos_query(void) +static const char * +get_old_cluster_logical_slot_infos_query(ClusterInfo *cluster) { /* * Fetch the logical replication slot information. The check whether the * slot is considered caught up is done by an upgrade function. This * regards the slot as caught up if we don't find any decodable changes. - * See binary_upgrade_logical_slot_has_caught_up(). - * - * Note that we can't ensure whether the slot is caught up during - * live_check as the new WAL records could be generated. + * The implementation of this check varies depending on the server + * version. * * We intentionally skip checking the WALs for invalidated slots as the * corresponding WALs could have been removed for such slots. @@ -701,21 +693,80 @@ get_old_cluster_logical_slot_infos_query(void) * started and stopped several times causing any temporary slots to be * removed. */ - return psprintf("SELECT slot_name, plugin, two_phase, failover, " - "%s as caught_up, invalidation_reason IS NOT NULL as invalid " - "FROM pg_catalog.pg_replication_slots " - "WHERE slot_type = 'logical' AND " - "database = current_database() AND " - "temporary IS FALSE;", - user_opts.live_check ? "FALSE" : - "(CASE WHEN invalidation_reason IS NOT NULL THEN FALSE " - "ELSE (SELECT pg_catalog.binary_upgrade_logical_slot_has_caught_up(slot_name)) " - "END)"); + + if (user_opts.live_check) + { + /* + * We skip the caught-up check during live_check. We cannot verify + * whether the slot is caught up in this mode, as new WAL records + * could be generated concurrently. 
+ */ + return "SELECT slot_name, plugin, two_phase, failover, " + "FALSE as caught_up, " + "invalidation_reason IS NOT NULL as invalid " + "FROM pg_catalog.pg_replication_slots " + "WHERE slot_type = 'logical' AND " + "database = current_database() AND " + "temporary IS FALSE"; + } + else if (GET_MAJOR_VERSION(cluster->major_version) >= 1900) + { + /* + * For PG19 and later, we optimize the slot caught-up check to avoid + * reading the same WAL stream multiple times: execute the caught-up + * check only for the slot with the minimum confirmed_flush_lsn, and + * apply the same result to all other slots in the same database. This + * limits the check to at most one logical slot per database. We also + * use the maximum confirmed_flush_lsn among all logical slots on the + * database as an early scan cutoff; finding a decodable WAL record + * beyond this point implies that no slot has caught up. + * + * Note that we don't distinguish slots based on their output plugin. + * If a plugin applies replication origin filters, we might get a + * false positive (i.e., erroneously considering a slot caught up). + * However, such cases are very rare, and the impact of a false + * positive is minimal. + */ + return "WITH check_caught_up AS ( " + " SELECT pg_catalog.binary_upgrade_check_logical_slot_pending_wal(slot_name, " + " MAX(confirmed_flush_lsn) OVER ()) as last_pending_wal " + " FROM pg_replication_slots " + " WHERE slot_type = 'logical' AND " + " database = current_database() AND " + " temporary IS FALSE AND " + " invalidation_reason IS NULL " + " ORDER BY confirmed_flush_lsn ASC " + " LIMIT 1 " + ") " + "SELECT slot_name, plugin, two_phase, failover, " + "CASE WHEN invalidation_reason IS NOT NULL THEN FALSE " + "ELSE last_pending_wal IS NULL OR " + " confirmed_flush_lsn > last_pending_wal " + "END as caught_up, " + "invalidation_reason IS NOT NULL as invalid " + "FROM pg_catalog.pg_replication_slots, check_caught_up " + "WHERE slot_type = 'logical' AND " + "database = current_database() AND " + "temporary IS FALSE "; + } + + /* + * For PG18 and earlier, we call + * binary_upgrade_logical_slot_has_caught_up() for each logical slot. + */ + return "SELECT slot_name, plugin, two_phase, failover, " + "CASE WHEN invalidation_reason IS NOT NULL THEN FALSE " + "ELSE (SELECT pg_catalog.binary_upgrade_logical_slot_has_caught_up(slot_name)) " + "END as caught_up, " + "invalidation_reason IS NOT NULL as invalid " + "FROM pg_catalog.pg_replication_slots " + "WHERE slot_type = 'logical' AND " + "database = current_database() AND " + "temporary IS FALSE "; } /* - * Callback function for processing results of the query returned by - * get_old_cluster_logical_slot_infos_query(), which is used for + * Callback function for processing results of the query, which is used for * get_db_rel_and_slot_infos()'s UpgradeTask. This function stores the logical * slot information for later use. */ @@ -768,7 +819,7 @@ process_old_cluster_logical_slot_infos(DbInfo *dbinfo, PGresult *res, void *arg) * * Note: this function always returns 0 if the old_cluster is PG16 and prior * because we gather slot information only for cluster versions greater than or - * equal to PG17. See get_old_cluster_logical_slot_infos(). + * equal to PG17. See get_db_rel_and_slot_infos(). 
*/ int count_old_cluster_logical_slots(void) diff --git a/src/bin/pg_upgrade/t/003_logical_slots.pl b/src/bin/pg_upgrade/t/003_logical_slots.pl index b9abc3a2e2..15e6d267f2 100644 --- a/src/bin/pg_upgrade/t/003_logical_slots.pl +++ b/src/bin/pg_upgrade/t/003_logical_slots.pl @@ -64,6 +64,7 @@ 'postgres', qq[ SELECT pg_create_logical_replication_slot('test_slot1', 'test_decoding'); SELECT pg_create_logical_replication_slot('test_slot2', 'test_decoding'); + SELECT pg_create_logical_replication_slot('test_slot3', 'test_decoding'); ]); $oldpub->stop(); @@ -77,7 +78,7 @@ [@pg_upgrade_cmd], 1, [ - qr/"max_replication_slots" \(1\) must be greater than or equal to the number of logical replication slots \(2\) on the old cluster/ + qr/"max_replication_slots" \(1\) must be greater than or equal to the number of logical replication slots \(3\) on the old cluster/ ], [qr//], 'run of pg_upgrade where the new cluster has insufficient "max_replication_slots"' @@ -85,29 +86,31 @@ ok(-d $newpub->data_dir . "/pg_upgrade_output.d", "pg_upgrade_output.d/ not removed after pg_upgrade failure"); -# Set 'max_replication_slots' to match the number of slots (2) present on the +# Set 'max_replication_slots' to match the number of slots (3) present on the # old cluster. Both slots will be used for subsequent tests. -$newpub->append_conf('postgresql.conf', "max_replication_slots = 2"); +$newpub->append_conf('postgresql.conf', "max_replication_slots = 3"); # ------------------------------ # TEST: Confirm pg_upgrade fails when the slot still has unconsumed WAL records # Preparations for the subsequent test: -# 1. Generate extra WAL records. At this point neither test_slot1 nor -# test_slot2 has consumed them. +# 1. Generate extra WAL records. At this point none of the slots has consumed them. # # 2. Advance the slot test_slot2 up to the current WAL location, but test_slot1 # still has unconsumed WAL records. # # 3. Emit a non-transactional message. This will cause test_slot2 to detect the # unconsumed WAL record. +# +# 4. Advance the slot test_slots3 up to the current WAL location. $oldpub->start; $oldpub->safe_psql( 'postgres', qq[ CREATE TABLE tbl AS SELECT generate_series(1, 10) AS a; SELECT pg_replication_slot_advance('test_slot2', pg_current_wal_lsn()); - SELECT count(*) FROM pg_logical_emit_message('false', 'prefix', 'This is a non-transactional message'); + SELECT count(*) FROM pg_logical_emit_message('false', 'prefix', 'This is a non-transactional message', true); + SELECT pg_replication_slot_advance('test_slot3', pg_current_wal_lsn()); ]); $oldpub->stop; @@ -138,8 +141,9 @@ }, $newpub->data_dir . "/pg_upgrade_output.d"); -# Check the file content. Both slots should be reporting that they have -# unconsumed WAL records. +# Check the file content. While both test_slot1 and test_slot2 should be reporting +# that they have unconsumed WAL records, test_slot3 should not be reported as +# it has caught up. 
like( slurp_file($slots_filename), qr/The slot \"test_slot1\" has not consumed the WAL yet/m, @@ -148,6 +152,10 @@ slurp_file($slots_filename), qr/The slot \"test_slot2\" has not consumed the WAL yet/m, 'the previous test failed due to unconsumed WALs'); +unlike( + slurp_file($slots_filename), + qr/test_slot3/m, + 'caught-up slot is not reported'); # ------------------------------ @@ -162,6 +170,7 @@ 'postgres', qq[ SELECT * FROM pg_drop_replication_slot('test_slot1'); SELECT * FROM pg_drop_replication_slot('test_slot2'); + SELECT * FROM pg_drop_replication_slot('test_slot3'); CREATE PUBLICATION regress_pub FOR ALL TABLES; ]); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index fb57702666..a09d8a6c64 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202601261 +#define CATALOG_VERSION_NO 202602051 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 5e5e33f64f..83f6501df3 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -11832,9 +11832,9 @@ proparallel => 'u', prorettype => 'void', proargtypes => 'oid', prosrc => 'binary_upgrade_set_next_pg_tablespace_oid' }, { oid => '6312', descr => 'for use by pg_upgrade', - proname => 'binary_upgrade_logical_slot_has_caught_up', provolatile => 'v', - proparallel => 'u', prorettype => 'bool', proargtypes => 'name', - prosrc => 'binary_upgrade_logical_slot_has_caught_up' }, + proname => 'binary_upgrade_check_logical_slot_pending_wal', provolatile => 'v', + proparallel => 'u', prorettype => 'pg_lsn', proargtypes => 'name pg_lsn', + prosrc => 'binary_upgrade_check_logical_slot_pending_wal' }, { oid => '6319', descr => 'for use by pg_upgrade (relation for pg_subscription_rel)', proname => 'binary_upgrade_add_sub_rel_state', proisstrict => 'f', diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index 7f03537bda..bc9d4ece67 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -148,7 +148,8 @@ extern bool filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, ReplOriginI extern void ResetLogicalStreamingState(void); extern void UpdateDecodingStats(LogicalDecodingContext *ctx); -extern bool LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal); +extern XLogRecPtr LogicalReplicationSlotCheckPendingWal(XLogRecPtr end_of_wal, + XLogRecPtr scan_cutoff_lsn); extern XLogRecPtr LogicalSlotAdvanceAndCheckSnapState(XLogRecPtr moveto, bool *found_consistent_snapshot); From 9476ef206c64207a4fd2ddcb373759c7ede13a3c Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 5 Feb 2026 15:14:53 +0900 Subject: [PATCH 037/147] Fix comment in extended_stats_funcs.c The attribute storing the statistics data for a set of expressions in pg_statistic_ext_data is stxdexpr. stxdexprs does not exist. Extracted from a larger patch by the same author. Incorrect as of efbebb4e8587. 
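
For illustration (hypothetical values, not taken from this change): a
statistics object defined on (col2, col5, expr1, expr2) would store
stxkeys = 2 5 -1 -2, so once the positive attnums are consumed, the count
of remaining entries alone tells us that two expressions follow, in the
order they appear in stxdexpr.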
Author: Corey Huinker Discussion: https://postgr.es/m/CADkLM=fPcci6oPyuyEZ0F4bWqAA7HzaWO+ZPptufuX5_uWt6kw@mail.gmail.com --- src/backend/statistics/extended_stats_funcs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/statistics/extended_stats_funcs.c b/src/backend/statistics/extended_stats_funcs.c index db10768460..b640941a9c 100644 --- a/src/backend/statistics/extended_stats_funcs.c +++ b/src/backend/statistics/extended_stats_funcs.c @@ -539,7 +539,7 @@ extended_statistics_update(FunctionCallInfo fcinfo) /* * After all the positive number attnums in stxkeys come the negative * numbers (if any) which represent expressions in the order that they - * appear in stxdexprs. Because the expressions are always + * appear in stxdexpr. Because the expressions are always * monotonically decreasing from -1, there is no point in looking at * the values in stxkeys, it's enough to know how many of them there * are. From e35add48ccc2e5aa94de360f1a43c6c150bda54a Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 6 Feb 2026 09:40:05 +0900 Subject: [PATCH 038/147] doc: Move synchronized_standby_slots to "Primary Server" section. synchronized_standby_slots is defined in guc_parameter.dat as part of the REPLICATION_PRIMARY group and is listed under the "Primary Server" section in postgresql.conf.sample. However, in the documentation its description was previously placed under the "Sending Servers" section. Since synchronized_standby_slots only takes effect on the primary server, this commit moves its documentation to the "Primary Server" section to match its behavior and other references. Backpatch to v17 where synchronized_standby_slots was added. Author: Fujii Masao Reviewed-by: Shinya Kato Discussion: https://postgr.es/m/CAHGQGwE_LwgXgCrqd08OFteJqdERiF3noqOKu2vt7Kjk4vMiGg@mail.gmail.com Backpatch-through: 17 --- doc/src/sgml/config.sgml | 78 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 5560b95ee6..0cacc062cd 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4722,45 +4722,6 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows - - synchronized_standby_slots (string) - - synchronized_standby_slots configuration parameter - - - - - A comma-separated list of streaming replication standby server slot names - that logical WAL sender processes will wait for. Logical WAL sender processes - will send decoded changes to plugins only after the specified replication - slots confirm receiving WAL. This guarantees that logical replication - failover slots do not consume changes until those changes are received - and flushed to corresponding physical standbys. If a - logical replication connection is meant to switch to a physical standby - after the standby is promoted, the physical replication slot for the - standby should be listed here. Note that logical replication will not - proceed if the slots specified in the - synchronized_standby_slots do not exist or are invalidated. - Additionally, the replication management functions - - pg_replication_slot_advance, - - pg_logical_slot_get_changes, and - - pg_logical_slot_peek_changes, - when used with logical failover slots, will block until all - physical slots specified in synchronized_standby_slots have - confirmed WAL receipt. 
- - - The standbys corresponding to the physical replication slots in - synchronized_standby_slots must configure - sync_replication_slots = true so they can receive - logical failover slot changes from the primary. - - - - @@ -4909,6 +4870,45 @@ ANY num_sync ( + synchronized_standby_slots (string) + + synchronized_standby_slots configuration parameter + + + + + A comma-separated list of streaming replication standby server slot names + that logical WAL sender processes will wait for. Logical WAL sender processes + will send decoded changes to plugins only after the specified replication + slots confirm receiving WAL. This guarantees that logical replication + failover slots do not consume changes until those changes are received + and flushed to corresponding physical standbys. If a + logical replication connection is meant to switch to a physical standby + after the standby is promoted, the physical replication slot for the + standby should be listed here. Note that logical replication will not + proceed if the slots specified in the + synchronized_standby_slots do not exist or are invalidated. + Additionally, the replication management functions + + pg_replication_slot_advance, + + pg_logical_slot_get_changes, and + + pg_logical_slot_peek_changes, + when used with logical failover slots, will block until all + physical slots specified in synchronized_standby_slots have + confirmed WAL receipt. + + + The standbys corresponding to the physical replication slots in + synchronized_standby_slots must configure + sync_replication_slots = true so they can receive + logical failover slot changes from the primary. + + From f94e9141a0bbb365f8194517e142746466ee7014 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sat, 31 May 2025 22:50:22 +1200 Subject: [PATCH 039/147] Add file_extend_method=posix_fallocate,write_zeros. Provide a way to disable the use of posix_fallocate() for relation files. It was introduced by commit 4d330a61bb1. The new setting file_extend_method=write_zeros can be used as a workaround for problems reported from the field: * BTRFS compression is disabled by the use of posix_fallocate() * XFS could produce spurious ENOSPC errors in some Linux kernel versions, though that problem is reported to have been fixed The default is file_extend_method=posix_fallocate if available, as before. The write_zeros option is similar to PostgreSQL < 16, except that now it's multi-block. Backpatch-through: 16 Reviewed-by: Jakub Wartak Reported-by: Dimitrios Apostolou Discussion: https://postgr.es/m/b1843124-fd22-e279-a31f-252dffb6fbf2%40gmx.net --- doc/src/sgml/config.sgml | 37 +++++++++++++++++++ src/backend/storage/file/fd.c | 3 ++ src/backend/storage/smgr/md.c | 21 ++++++++--- src/backend/utils/misc/guc_parameters.dat | 7 ++++ src/backend/utils/misc/guc_tables.c | 9 +++++ src/backend/utils/misc/postgresql.conf.sample | 4 ++ src/include/storage/fd.h | 11 ++++++ 7 files changed, 87 insertions(+), 5 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0cacc062cd..f1af1505cf 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2412,6 +2412,43 @@ include_dir 'conf.d' + + file_extend_method (enum) + + file_extend_method configuration parameter + + + + + Specifies the method used to extend data files during bulk operations + such as COPY. The first available option is used as + the default, depending on the operating system: + + + + posix_fallocate (Unix) uses the standard POSIX + interface for allocating disk space, but is missing on some systems. 
+ If it is present but the underlying file system doesn't support it, + this option silently falls back to write_zeros. + Current versions of BTRFS are known to disable compression when + this option is used. + This is the default on systems that have the function. + + + + + write_zeros extends files by writing out blocks + of zero bytes. This is the default on systems that don't have the + function posix_fallocate. + + + + The write_zeros method is always used when data + files are extended by 8 blocks or fewer. + + + + max_notify_queue_pages (integer) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0f8083651d..5d07b64a1e 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -164,6 +164,9 @@ bool data_sync_retry = false; /* How SyncDataDirectory() should do its job. */ int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +/* How data files should be bulk-extended with zeros. */ +int file_extend_method = DEFAULT_FILE_EXTEND_METHOD; + /* Which kinds of files should be opened with PG_O_DIRECT. */ int io_direct_flags; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index a262587118..443434e4ea 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -602,13 +602,24 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * that decision should be made though? For now just use a cutoff of * 8, anything between 4 and 8 worked OK in some local testing. */ - if (numblocks > 8) + if (numblocks > 8 && + file_extend_method != FILE_EXTEND_METHOD_WRITE_ZEROS) { - int ret; + int ret = 0; - ret = FileFallocate(v->mdfd_vfd, - seekpos, (pgoff_t) BLCKSZ * numblocks, - WAIT_EVENT_DATA_FILE_EXTEND); +#ifdef HAVE_POSIX_FALLOCATE + if (file_extend_method == FILE_EXTEND_METHOD_POSIX_FALLOCATE) + { + ret = FileFallocate(v->mdfd_vfd, + seekpos, (pgoff_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + } + else +#endif + { + elog(ERROR, "unsupported file_extend_method: %d", + file_extend_method); + } if (ret != 0) { ereport(ERROR, diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index f0260e6e41..c1f1603cd3 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1042,6 +1042,13 @@ options => 'file_copy_method_options', }, +{ name => 'file_extend_method', type => 'enum', context => 'PGC_SIGHUP', group => 'RESOURCES_DISK', + short_desc => 'Selects the method used for extending data files.', + variable => 'file_extend_method', + boot_val => 'DEFAULT_FILE_EXTEND_METHOD', + options => 'file_extend_method_options', +}, + { name => 'from_collapse_limit', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', short_desc => 'Sets the FROM-list size beyond which subqueries are not collapsed.', long_desc => 'The planner will merge subqueries into upper queries if the resulting FROM list would have no more than this many items.', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 13c569d879..5df3a36bf6 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -80,6 +80,7 @@ #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/copydir.h" +#include "storage/fd.h" #include "storage/io_worker.h" #include "storage/large_object.h" #include "storage/pg_shmem.h" @@ -491,6 +492,14 @@ static const struct config_enum_entry file_copy_method_options[] = { {NULL, 0, false} }; +static const struct 
config_enum_entry file_extend_method_options[] = {
+#ifdef HAVE_POSIX_FALLOCATE
+	{"posix_fallocate", FILE_EXTEND_METHOD_POSIX_FALLOCATE, false},
+#endif
+	{"write_zeros", FILE_EXTEND_METHOD_WRITE_ZEROS, false},
+	{NULL, 0, false}
+};
+
 /*
  * Options for enum values stored in other modules
  */
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index c4f92fcdac..1ae594af84 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -179,6 +179,10 @@
 					# in kilobytes, or -1 for no limit
 #file_copy_method = copy		# copy, clone (if supported by OS)

+#file_extend_method = posix_fallocate	# the default is the first option supported
+					# by the operating system:
+					#   posix_fallocate (most Unix-like systems)
+					#   write_zeros
+
 #max_notify_queue_pages = 1048576	# limits the number of SLRU pages allocated
 					# for NOTIFY / LISTEN queue
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 413233bcd3..8ac466fd34 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -55,12 +55,23 @@ typedef int File;
 #define IO_DIRECT_WAL			0x02
 #define IO_DIRECT_WAL_INIT		0x04

+enum FileExtendMethod
+{
+#ifdef HAVE_POSIX_FALLOCATE
+	FILE_EXTEND_METHOD_POSIX_FALLOCATE,
+#endif
+	FILE_EXTEND_METHOD_WRITE_ZEROS,
+};
+
+/* Default to the first available file_extend_method. */
+#define DEFAULT_FILE_EXTEND_METHOD 0

 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
 extern PGDLLIMPORT bool data_sync_retry;
 extern PGDLLIMPORT int recovery_init_sync_method;
 extern PGDLLIMPORT int io_direct_flags;
+extern PGDLLIMPORT int file_extend_method;

 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()

From 74a116a79b47631e163c9814f39f5d218834e94c Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 6 Feb 2026 15:38:16 +0900
Subject: [PATCH 040/147] Fix some error message inconsistencies

These errors are very unlikely to show up, but in the event that they
do, some incorrect information would have been provided:
- In pg_rewind, a stat() failure was reported as an open() failure.
- In pg_combinebackup, an error about the new directory of a tablespace
  mapping referred to it as the old directory.
- In pg_combinebackup, a failure in reading a source file when copying
  blocks referred to the destination file.

The changes for pg_combinebackup affect v17 and newer versions.  For
pg_rewind, all the stable branches are affected.
Author: Man Zeng
Discussion: https://postgr.es/m/tencent_1EE1430B1E6C18A663B8990F@qq.com
Backpatch-through: 14
---
 src/bin/pg_combinebackup/copy_file.c        | 2 +-
 src/bin/pg_combinebackup/pg_combinebackup.c | 2 +-
 src/bin/pg_rewind/file_ops.c                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index dd3c0dc1c8..0287d6e87d 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -210,7 +210,7 @@ copy_file_blocks(const char *src, const char *dst,
 	}

 	if (rb < 0)
-		pg_fatal("could not read from file \"%s\": %m", dst);
+		pg_fatal("could not read from file \"%s\": %m", src);

 	pg_free(buffer);
 	close(src_fd);
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 918b8b3564..b9f26ce782 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -501,7 +501,7 @@ add_tablespace_mapping(cb_options *opt, char *arg)
 				 tsmap->old_dir);

 	if (!is_absolute_path(tsmap->new_dir))
-		pg_fatal("old directory is not an absolute path in tablespace mapping: %s",
+		pg_fatal("new directory is not an absolute path in tablespace mapping: %s",
 				 tsmap->new_dir);

 	/* Canonicalize paths to avoid spurious failures when comparing. */
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 356e23a308..7de2195a4f 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -327,7 +327,7 @@ slurpFile(const char *datadir, const char *path, size_t *filesize)
 				 fullpath);

 	if (fstat(fd, &statbuf) < 0)
-		pg_fatal("could not open file \"%s\" for reading: %m",
+		pg_fatal("could not stat file \"%s\" for reading: %m",
 				 fullpath);

 	len = statbuf.st_size;

From 072c8421359730149f4eaf861ce55aa78968ba9d Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 6 Feb 2026 19:57:22 +0900
Subject: [PATCH 041/147] Fix use of proc number in pgstat_create_backend()

This routine's internals directly used MyProcNumber to choose which
object ID to assign for the hash key of a backend's stats entry, while
the value to use is given as an input argument of the function.  The
original intention was to pass MyProcNumber as an argument of
pgstat_create_backend() when called in pgstat_bestart_final(), with
pgstat_beinit() ensuring that MyProcNumber has been set, rather than
to use it directly in the function.  This commit addresses this
inconsistency by using the procnum given by the caller of
pgstat_create_backend(), not MyProcNumber.

This issue is not a cause of bugs currently.  However, let's keep the
code in sync across all the branches where this code exists, as it
could matter in a future backpatch.

Oversight in 4feba03d8b92.
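
As a minimal sketch of the intended call pattern (condensed from the
routines named above, not code beyond what this patch touches):

    /* pgstat_bestart_final() passes the proc number explicitly */
    pgstat_create_backend(MyProcNumber);

    /* ... so the routine must key the stats entry off its argument */
    entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_BACKEND, InvalidOid,
                                            procnum, false);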
Reported-by: Ryo Matsumura Discussion: https://postgr.es/m/TYCPR01MB11316AD8150C8F470319ACCAEE866A@TYCPR01MB11316.jpnprd01.prod.outlook.com Backpatch-through: 18 --- src/backend/utils/activity/pgstat_backend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 1350f5f62f..f2f8d3ff75 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -326,7 +326,7 @@ pgstat_create_backend(ProcNumber procnum) PgStatShared_Backend *shstatent; entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_BACKEND, InvalidOid, - MyProcNumber, false); + procnum, false); shstatent = (PgStatShared_Backend *) entry_ref->shared_stats; /* From e3d37853ecd51c87976df7ea5c5d641f45668370 Mon Sep 17 00:00:00 2001 From: Jacob Champion Date: Fri, 6 Feb 2026 10:25:12 -0800 Subject: [PATCH 042/147] doc: Expand upon protocol versions and extensions First, split the Protocol Versions table in two, and lead with the list of versions that are supported today. Reserved and unsupported version numbers go into the second table. Second, in anticipation of a new (reserved) protocol extension, document the extension negotiation process alongside version negotiation, and add the corresponding tables for future extension parameter registrations. Reviewed-by: Jelte Fennema-Nio Reviewed-by: David G. Johnston Discussion: https://postgr.es/m/DDPR5BPWH1RJ.1LWAK6QAURVAY%40jeltef.nl --- doc/src/sgml/protocol.sgml | 108 +++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index a2b528c481..36fd327d4b 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -223,10 +223,12 @@ shows the currently supported protocol versions. + + documents protocol versions that are unsupported or otherwise reserved. - Protocol Versions + Supported Protocol Versions @@ -247,6 +249,27 @@ message was redefined to have a variable length payload. + + 3.0 + PostgreSQL 7.4 and later + + + +
+ + + Other Protocol Versions + + + + + Version + Supported by + Description + + + + 3.1 - @@ -257,15 +280,78 @@ - 3.0 - PostgreSQL 7.4 and later - - 2.0 up to PostgreSQL 13 - See previous releases of + Obsolete. See previous releases of the PostgreSQL documentation for - details + details. + + + +
+ + + + Protocol Extensions + + + Servers and clients may additionally negotiate individual extensions to the + protocol version in use. These are offered by the client in the startup + message, as specially-named parameters with a _pq_. + prefix. Servers reject any unknown or unsupported extensions by sending a + NegotiateProtocolVersion message containing the list of rejected parameter + names, at which point the client may choose whether to continue with the + connection. and + document the supported + and reserved protocol extension parameters, respectively. + + + + Supported Protocol Extensions + + + + + + + Parameter Name + Values + Supported by + Description + + + + + + + (No supported protocol extensions are currently defined.) + + + + +
+ + + Reserved Protocol Extensions + + + + + Parameter Name + Description + + + + + + _pq_.[name] + Any other parameter names beginning with _pq_., + that are not defined above, are reserved for future protocol expansion. + Servers must reject any that are received from a + client, by sending a NegotiateProtocolVersion message during the + startup flow, and should + otherwise continue the connection. + @@ -295,8 +381,8 @@ To begin a session, a frontend opens a connection to the server and sends a startup message. This message includes the names of the user and of the database the user wants to connect to; it also identifies the particular - protocol version to be used. (Optionally, the startup message can include - additional settings for run-time parameters.) + protocol version to be used. (Optionally, the startup message can request + protocol extensions and include additional settings for run-time parameters.) The server then uses this information and the contents of its configuration files (such as pg_hba.conf) to determine @@ -6151,7 +6237,9 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" In addition to the above, other parameters may be listed. Parameter names beginning with _pq_. are - reserved for use as protocol extensions, while others are + reserved for use as + protocol extensions, + while others are treated as run-time parameters to be set at backend start time. Such settings will be applied during backend start (after parsing the command-line arguments if any) and will From d8d7c5dc8f74506d35c7e8242be997fd5cf388eb Mon Sep 17 00:00:00 2001 From: Jacob Champion Date: Fri, 6 Feb 2026 10:31:45 -0800 Subject: [PATCH 043/147] libpq: Prepare for protocol grease during 19beta The main reason that libpq doesn't request protocol version 3.2 by default is because other proxy/server implementations don't implement the negotiation. This is a bit of a chicken-and-egg problem: We don't bump the default version that libpq requests, but other implementations may not be incentivized to implement version negotiation if their users never run into issues. One established practice to combat this is to flip Postel's Law on its head, by sending parameters that the server cannot possibly support. If the server fails the handshake instead of correctly negotiating, then the problem is surfaced naturally. If the server instead claims to support the bogus parameters, then we fail the connection to make the lie obvious. This is called "grease" (or "greasing"), after the GREASE mechanism in TLS that popularized the concept: https://www.rfc-editor.org/rfc/rfc8701.html This patch reserves 3.9999 as an explicitly unsupported protocol version number and `_pq_.test_protocol_negotiation` as an explicitly unsupported protocol extension. A later commit will send these by default in order to stress-test the ecosystem during the beta period; that commit will then be reverted before 19 RC1, so that we can decide what to do with whatever data has been gathered. The _pq_.test_protocol_negotiation change here is intentionally docs- only: after its implementation is reverted, the parameter should remain reserved. Extracted/adapted from a patch by Jelte Fennema-Nio. 
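
As a rough sketch of the server behavior this greasing relies on (the
helper name below is illustrative, not the actual backend function):

    /*
     * A conforming server treats the grease version like any other
     * too-new minor version: no special case, just downgrade.
     */
    if (PG_PROTOCOL_MAJOR(their_version) == 3 &&
        their_version > PG_PROTOCOL_LATEST)
        send_negotiate_protocol_version(PG_PROTOCOL_LATEST,
                                        unsupported_options);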
Author: Jelte Fennema-Nio Co-authored-by: Jacob Champion Discussion: https://postgr.es/m/DDPR5BPWH1RJ.1LWAK6QAURVAY%40jeltef.nl --- doc/src/sgml/protocol.sgml | 23 +++++++++++++++++++++++ src/include/libpq/pqcomm.h | 10 ++++++++++ src/interfaces/libpq/fe-protocol3.c | 14 +++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index 36fd327d4b..89ac680efd 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -270,6 +270,18 @@ + + 3.9999 + - + Reserved for protocol greasing. libpq may use this version, which + is higher than any minor version the project ever expects to use, to + test that servers and middleware properly implement protocol version + negotiation. Servers must not add special-case + logic for this version; they should simply compare it to their latest + supported version (which will always be smaller) and downgrade via a + NegotiateProtocolVersion message. + + 3.1 - @@ -353,6 +365,17 @@ otherwise continue the connection. + + + _pq_.test_protocol_negotiation + Reserved for protocol greasing. libpq may send this extension to + test that servers and middleware properly implement protocol extension + negotiation. Servers must not add special-case + logic for this parameter; they should simply send the list of all + unsupported options (including this one) via a NegotiateProtocolVersion + message. + +
diff --git a/src/include/libpq/pqcomm.h b/src/include/libpq/pqcomm.h index 1bbe5b9ee4..a29c9c94d7 100644 --- a/src/include/libpq/pqcomm.h +++ b/src/include/libpq/pqcomm.h @@ -104,6 +104,16 @@ is_unixsock_path(const char *path) */ #define PG_PROTOCOL_RESERVED_31 PG_PROTOCOL(3,1) +/* + * PG_PROTOCOL_GREASE is an intentionally unsupported protocol version used + * for "greasing" (the practice of sending valid, but extraneous or otherwise + * unusual, messages to keep peer implementations honest). This helps ensure + * that servers properly implement protocol version negotiation. Version 3.9999 + * was chosen since it is safely within the valid range, it is representable + * via PQfullProtocolVersion, and it is unlikely to ever be needed in practice. + */ +#define PG_PROTOCOL_GREASE PG_PROTOCOL(3,9999) + /* * A client can send a cancel-current-operation request to the postmaster. * This is uglier than sending it directly to the client's backend, but it diff --git a/src/interfaces/libpq/fe-protocol3.c b/src/interfaces/libpq/fe-protocol3.c index 103428033e..90bbb2eba1 100644 --- a/src/interfaces/libpq/fe-protocol3.c +++ b/src/interfaces/libpq/fe-protocol3.c @@ -1451,7 +1451,19 @@ pqGetNegotiateProtocolVersion3(PGconn *conn) if (pqGetInt(&num, 4, conn) != 0) goto eof; - /* Check the protocol version */ + /* + * Check the protocol version. + * + * PG_PROTOCOL_GREASE is intentionally unsupported and reserved. It's + * higher than any real version, so check for that first, to get the most + * specific error message. Then check the upper and lower bounds. + */ + if (their_version == PG_PROTOCOL_GREASE) + { + libpq_append_conn_error(conn, "received invalid protocol negotiation message: server requested \"grease\" protocol version 3.9999"); + goto failure; + } + if (their_version > conn->pversion) { libpq_append_conn_error(conn, "received invalid protocol negotiation message: server requested downgrade to a higher-numbered version"); From ba1e14134a775e56a76c1537936c61102827507f Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Fri, 6 Feb 2026 16:24:21 -0600 Subject: [PATCH 044/147] Adjust style of some debugging macros. This commit adjusts a few debugging macros to match the style of those in pg_config_manual.h. Like commits 123661427b and b4cbc106a6, these were discovered while reviewing Aleksander Alekseev's proposed changes to pgindent. Reviewed-by: Michael Paquier Reviewed-by: Chao Li Discussion: https://postgr.es/m/aP-H6kSsGOxaB21k%40nathan --- src/backend/access/nbtree/nbtsort.c | 2 +- src/backend/utils/adt/numeric.c | 2 +- src/include/executor/execdebug.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 90ab4e91b5..3a45508f62 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -69,8 +69,8 @@ /* * DISABLE_LEADER_PARTICIPATION disables the leader's participation in * parallel index builds. This may be useful as a debugging aid. -#undef DISABLE_LEADER_PARTICIPATION */ +/* #define DISABLE_LEADER_PARTICIPATION */ /* * Status record for spooling/sorting phase. (Note we may have two of diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 891ae6ba7f..3bd3635d98 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -48,8 +48,8 @@ * Uncomment the following to enable compilation of dump_numeric() * and dump_var() and to get a dump of any result produced by make_result(). 
* ---------- -#define NUMERIC_DEBUG */ +/* #define NUMERIC_DEBUG */ /* ---------- diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h index 20ac9be0b9..3e11055191 100644 --- a/src/include/executor/execdebug.h +++ b/src/include/executor/execdebug.h @@ -34,22 +34,22 @@ * EXEC_NESTLOOPDEBUG is a flag which turns on debugging of the * nest loop node by NL_printf() and ENL_printf() in nodeNestloop.c * ---------------- -#undef EXEC_NESTLOOPDEBUG */ +/* #define EXEC_NESTLOOPDEBUG */ /* ---------------- * EXEC_SORTDEBUG is a flag which turns on debugging of * the ExecSort() stuff by SO_printf() in nodeSort.c * ---------------- -#undef EXEC_SORTDEBUG */ +/* #define EXEC_SORTDEBUG */ /* ---------------- * EXEC_MERGEJOINDEBUG is a flag which turns on debugging of * the ExecMergeJoin() stuff by MJ_printf() in nodeMergejoin.c * ---------------- -#undef EXEC_MERGEJOINDEBUG */ +/* #define EXEC_MERGEJOINDEBUG */ /* ---------------------------------------------------------------- * #defines controlled by above definitions From 7cdb633c89da82d4c6fdfba007a9ff05a9dff29e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 6 Feb 2026 20:46:03 -0500 Subject: [PATCH 045/147] Make some minor cleanups in typalign-related code. Commit 7b378237a widened AclMode to 64 bits, which implies that the alignment of AclItem is now determined by an int64 field. That commit correctly set the typalign for SQL type aclitem to 'd', but it missed the hard-wired knowledge about _aclitem in bootstrap.c. This doesn't seem to have caused any ill effects, probably because we never try to fill a non-null value into an aclitem[] column during bootstrap. Nonetheless, it's clearly a gotcha waiting to happen, so fix it up. In passing, also fix a couple of typanalyze functions that were using hard-coded typalign constants when they could just as easily use greppable TYPALIGN_xxx macros. Noticed these while working on a patch to expand the set of typalign values. I doubt we are going to pursue that path, but these fixes still seem worth a quick commit. 
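
For context, a condensed view of the struct whose layout drives the
first fix (abbreviated from src/include/utils/acl.h):

    typedef struct AclItem
    {
        Oid     ai_grantee;     /* ID that this item grants privs to */
        Oid     ai_grantor;     /* grantor of privs */
        AclMode ai_privs;       /* now 64 bits wide, so the struct is
                                 * double-aligned ('d'), not int-aligned */
    } AclItem;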
Discussion: https://postgr.es/m/1127261.1769649624@sss.pgh.pa.us --- src/backend/bootstrap/bootstrap.c | 2 +- src/backend/tsearch/ts_typanalyze.c | 2 +- src/backend/utils/adt/rangetypes_typanalyze.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index dd57624b4f..e8f825b3d6 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -137,7 +137,7 @@ static const struct typinfo TypInfo[] = { F_ARRAY_IN, F_ARRAY_OUT}, {"_char", 1002, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, F_ARRAY_IN, F_ARRAY_OUT}, - {"_aclitem", 1034, ACLITEMOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, + {"_aclitem", 1034, ACLITEMOID, -1, false, TYPALIGN_DOUBLE, TYPSTORAGE_EXTENDED, InvalidOid, F_ARRAY_IN, F_ARRAY_OUT} }; diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 0c513d694e..48ee050e37 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -444,7 +444,7 @@ compute_tsvector_stats(VacAttrStats *stats, stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; - stats->statypalign[0] = 'i'; + stats->statypalign[0] = TYPALIGN_INT; } } else diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c index 38d12dedbc..278d4e6941 100644 --- a/src/backend/utils/adt/rangetypes_typanalyze.c +++ b/src/backend/utils/adt/rangetypes_typanalyze.c @@ -398,7 +398,7 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, stats->statypid[slot_idx] = FLOAT8OID; stats->statyplen[slot_idx] = sizeof(float8); stats->statypbyval[slot_idx] = true; - stats->statypalign[slot_idx] = 'd'; + stats->statypalign[slot_idx] = TYPALIGN_DOUBLE; /* Store the fraction of empty ranges */ emptyfrac = palloc_object(float4); From 0af05b5dbb42387957582e76232dc27138382e5a Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 7 Feb 2026 10:08:38 +0100 Subject: [PATCH 046/147] Revert "Change copyObject() to use typeof_unqual" This reverts commit 4cfce4e62c8f09f5b1f6a7f69760ca46a74406e2. This implementation fails to compile on newer MSVC that support __typeof_unqual__. (Older versions did not support it and compiled fine.) Revert for now and research further. Reported-by: Bryan Green Discussion: https://www.postgresql.org/message-id/b03ddcd4-2a16-49ee-b105-e7f609f3c514%40gmail.com --- config/c-compiler.m4 | 25 ----------------------- configure | 42 -------------------------------------- configure.ac | 1 - meson.build | 24 ---------------------- src/include/nodes/nodes.h | 4 ++-- src/include/pg_config.h.in | 7 ------- 6 files changed, 2 insertions(+), 101 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 7179a73bd2..1509dbfa2a 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -192,31 +192,6 @@ if test "$pgac_cv_c_typeof" != no; then fi])# PGAC_C_TYPEOF -# PGAC_C_TYPEOF_UNQUAL -# -------------------- -# Check if the C compiler understands typeof_unqual or a variant. Define -# HAVE_TYPEOF_UNQUAL if so, and define 'typeof_unqual' to the actual key word. 
-# -AC_DEFUN([PGAC_C_TYPEOF_UNQUAL], -[AC_CACHE_CHECK(for typeof_unqual, pgac_cv_c_typeof_unqual, -[pgac_cv_c_typeof_unqual=no -for pgac_kw in typeof_unqual __typeof_unqual__; do - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], -[int x = 0; -$pgac_kw(x) y; -y = x; -return y;])], -[pgac_cv_c_typeof_unqual=$pgac_kw]) - test "$pgac_cv_c_typeof_unqual" != no && break -done]) -if test "$pgac_cv_c_typeof_unqual" != no; then - AC_DEFINE(HAVE_TYPEOF_UNQUAL, 1, - [Define to 1 if your compiler understands `typeof_unqual' or something similar.]) - if test "$pgac_cv_c_typeof_unqual" != typeof_unqual; then - AC_DEFINE_UNQUOTED(typeof_unqual, $pgac_cv_c_typeof_unqual, [Define to how the compiler spells `typeof_unqual'.]) - fi -fi])# PGAC_C_TYPEOF_UNQUAL - # PGAC_C_TYPES_COMPATIBLE # ----------------------- diff --git a/configure b/configure index ba29393187..a10a2c85c6 100755 --- a/configure +++ b/configure @@ -15010,48 +15010,6 @@ _ACEOF fi fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for typeof_unqual" >&5 -$as_echo_n "checking for typeof_unqual... " >&6; } -if ${pgac_cv_c_typeof_unqual+:} false; then : - $as_echo_n "(cached) " >&6 -else - pgac_cv_c_typeof_unqual=no -for pgac_kw in typeof_unqual __typeof_unqual__; do - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -int x = 0; -$pgac_kw(x) y; -y = x; -return y; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - pgac_cv_c_typeof_unqual=$pgac_kw -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - test "$pgac_cv_c_typeof_unqual" != no && break -done -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_c_typeof_unqual" >&5 -$as_echo "$pgac_cv_c_typeof_unqual" >&6; } -if test "$pgac_cv_c_typeof_unqual" != no; then - -$as_echo "#define HAVE_TYPEOF_UNQUAL 1" >>confdefs.h - - if test "$pgac_cv_c_typeof_unqual" != typeof_unqual; then - -cat >>confdefs.h <<_ACEOF -#define typeof_unqual $pgac_cv_c_typeof_unqual -_ACEOF - - fi -fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_types_compatible_p" >&5 $as_echo_n "checking for __builtin_types_compatible_p... " >&6; } if ${pgac_cv__types_compatible+:} false; then : diff --git a/configure.ac b/configure.ac index 412fe358a2..814e64a967 100644 --- a/configure.ac +++ b/configure.ac @@ -1717,7 +1717,6 @@ PGAC_PRINTF_ARCHETYPE PGAC_CXX_PRINTF_ARCHETYPE PGAC_C_STATEMENT_EXPRESSIONS PGAC_C_TYPEOF -PGAC_C_TYPEOF_UNQUAL PGAC_C_TYPES_COMPATIBLE PGAC_C_BUILTIN_CONSTANT_P PGAC_C_BUILTIN_OP_OVERFLOW diff --git a/meson.build b/meson.build index 0722b16927..df907b62da 100644 --- a/meson.build +++ b/meson.build @@ -2880,30 +2880,6 @@ int main(void) endif endforeach -# Check if the C compiler understands typeof_unqual or a variant. Define -# HAVE_TYPEOF_UNQUAL if so, and define 'typeof_unqual' to the actual key word. 
-foreach kw : ['typeof_unqual', '__typeof_unqual__'] - if cc.compiles(''' -int main(void) -{ - int x = 0; - @0@(x) y; - y = x; - return y; -} -'''.format(kw), - name: kw, - args: test_c_args, include_directories: postgres_inc) - - cdata.set('HAVE_TYPEOF_UNQUAL', 1) - if kw != 'typeof_unqual' - cdata.set('typeof_unqual', kw) - endif - - break - endif -endforeach - # MSVC doesn't cope well with defining restrict to __restrict, the # spelling it understands, because it conflicts with diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index ba6dd7f389..b6ad28618a 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -226,8 +226,8 @@ extern int16 *readAttrNumberCols(int numCols); extern void *copyObjectImpl(const void *from); /* cast result back to argument type, if supported by compiler */ -#ifdef HAVE_TYPEOF_UNQUAL -#define copyObject(obj) ((typeof_unqual(*(obj)) *) copyObjectImpl(obj)) +#ifdef HAVE_TYPEOF +#define copyObject(obj) ((typeof(obj)) copyObjectImpl(obj)) #else #define copyObject(obj) copyObjectImpl(obj) #endif diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c089f2252c..339268dc8e 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -454,10 +454,6 @@ /* Define to 1 if your compiler understands `typeof' or something similar. */ #undef HAVE_TYPEOF -/* Define to 1 if your compiler understands `typeof_unqual' or something - similar. */ -#undef HAVE_TYPEOF_UNQUAL - /* Define to 1 if you have the header file. */ #undef HAVE_UCHAR_H @@ -810,6 +806,3 @@ /* Define to how the compiler spells `typeof'. */ #undef typeof - -/* Define to how the compiler spells `typeof_unqual'. */ -#undef typeof_unqual From 7467041cde9ed1966cb3ea18da8ac119b462c2e4 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sat, 7 Feb 2026 17:02:35 +0700 Subject: [PATCH 047/147] Future-proof sort template against undefined behavior Commit 176dffdf7 added a NULL array pointer check before performing a qsort in order to prevent undefined behavior when passing NULL pointer and zero length. To head off future degenerate cases, check that there are at least two elements to sort before proceeding with insertion sort. This has the added advantage of allowing us to remove four equivalent checks that guarded against recursion/iteration. There might be a tiny performance penalty from unproductive recursions, but we can buy that back by increasing the insertion sort threshold. That is left for future work. Discussion: https://postgr.es/m/CANWCAZZWvds_35nXc4vXD-eBQa_=mxVtqZf-PM_ps=SD7ghhJg@mail.gmail.com --- src/include/lib/sort_template.h | 40 +++++++++++++++++---------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/include/lib/sort_template.h b/src/include/lib/sort_template.h index e02aa73cd4..22b2092d03 100644 --- a/src/include/lib/sort_template.h +++ b/src/include/lib/sort_template.h @@ -311,6 +311,14 @@ ST_SORT(ST_ELEMENT_TYPE * data, size_t n DO_CHECK_FOR_INTERRUPTS(); if (n < 7) { + /* + * Not strictly necessary, but a caller may pass a NULL pointer input + * and zero length, and this silences warnings about applying offsets + * to NULL pointers. 
+ */ + if (n < 2) + return; + for (pm = a + ST_POINTER_STEP; pm < a + n * ST_POINTER_STEP; pm += ST_POINTER_STEP) for (pl = pm; pl > a && DO_COMPARE(pl - ST_POINTER_STEP, pl) > 0; @@ -387,29 +395,23 @@ ST_SORT(ST_ELEMENT_TYPE * data, size_t n if (d1 <= d2) { /* Recurse on left partition, then iterate on right partition */ - if (d1 > ST_POINTER_STEP) - DO_SORT(a, d1 / ST_POINTER_STEP); - if (d2 > ST_POINTER_STEP) - { - /* Iterate rather than recurse to save stack space */ - /* DO_SORT(pn - d2, d2 / ST_POINTER_STEP) */ - a = pn - d2; - n = d2 / ST_POINTER_STEP; - goto loop; - } + DO_SORT(a, d1 / ST_POINTER_STEP); + + /* Iterate rather than recurse to save stack space */ + /* DO_SORT(pn - d2, d2 / ST_POINTER_STEP) */ + a = pn - d2; + n = d2 / ST_POINTER_STEP; + goto loop; } else { /* Recurse on right partition, then iterate on left partition */ - if (d2 > ST_POINTER_STEP) - DO_SORT(pn - d2, d2 / ST_POINTER_STEP); - if (d1 > ST_POINTER_STEP) - { - /* Iterate rather than recurse to save stack space */ - /* DO_SORT(a, d1 / ST_POINTER_STEP) */ - n = d1 / ST_POINTER_STEP; - goto loop; - } + DO_SORT(pn - d2, d2 / ST_POINTER_STEP); + + /* Iterate rather than recurse to save stack space */ + /* DO_SORT(a, d1 / ST_POINTER_STEP) */ + n = d1 / ST_POINTER_STEP; + goto loop; } } #endif From 1653ce5236c4948550e52d15d54e4b6bb66a23b1 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 7 Feb 2026 22:37:02 +0100 Subject: [PATCH 048/147] Further error message fix Further fix of error message changed in commit 74a116a79b4. The initial fix was not quite correct. Discussion: https://www.postgresql.org/message-id/flat/tencent_1EE1430B1E6C18A663B8990F%40qq.com --- src/bin/pg_rewind/file_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 7de2195a4f..5cfb676f41 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -327,7 +327,7 @@ slurpFile(const char *datadir, const char *path, size_t *filesize) fullpath); if (fstat(fd, &statbuf) < 0) - pg_fatal("could not stat file \"%s\" for reading: %m", + pg_fatal("could not stat file \"%s\": %m", fullpath); len = statbuf.st_size; From c0bf15729f461308f54b7d4d46472c1ad43941a3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 7 Feb 2026 20:05:52 -0500 Subject: [PATCH 049/147] meson: host_system value for Solaris is 'sunos' not 'solaris'. This thinko caused us to not substitute our own getopt() code, which results in failing to parse long options for the postmaster since Solaris' getopt() doesn't do what we expect. This can be seen in the results of buildfarm member icarus, which is the only one trying to build via meson on Solaris. Per consultation with pgsql-release, it seems okay to fix this now even though we're in release freeze. The fix visibly won't affect any other platforms, and it can't break Solaris/meson builds any worse than they're already broken. 
Discussion: https://postgr.es/m/2471229.1770499291@sss.pgh.pa.us
Backpatch-through: 16
---
 meson.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/meson.build b/meson.build
index df907b62da..96b3869df8 100644
--- a/meson.build
+++ b/meson.build
@@ -2911,7 +2911,7 @@ gnugetopt_dep = cc.find_library('gnugetopt', required: false)
 # (i.e., allow '-' as a flag character), so use our version on those platforms
 # - We want to use system's getopt_long() only if the system provides struct
 #   option
-always_replace_getopt = host_system in ['windows', 'cygwin', 'openbsd', 'solaris']
+always_replace_getopt = host_system in ['windows', 'cygwin', 'openbsd', 'sunos']

 always_replace_getopt_long = host_system in ['windows', 'cygwin'] or not cdata.has('HAVE_STRUCT_OPTION')

 # Required on BSDs

From 73dd7163c5d19f93b629d1ccd9d2a2de6e9667f6 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Sat, 7 Feb 2026 23:15:20 -0500
Subject: [PATCH 050/147] Replace some hard-wired OID constants with corresponding macros.

Looking again at commit 7cdb633c8, I wondered why we have hard-wired
"1034" for the OID of type aclitem[].  Some other entries in the same
array have numeric type OIDs as well.

This seems to be a hangover from years ago when not every built-in
pg_type entry had an OID macro.  But since we made genbki.pl
responsible for generating these macros, there are macros available
for all these array types, so there's no reason not to follow the
project policy of never writing numeric OID constants in C code.

---
 src/backend/bootstrap/bootstrap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index e8f825b3d6..7d32cd0e15 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -131,13 +131,13 @@ static const struct typinfo TypInfo[] = {
 	 F_OIDVECTORIN, F_OIDVECTOROUT},
 	{"_int4", INT4ARRAYOID, INT4OID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_text", 1009, TEXTOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID,
+	{"_text", TEXTARRAYOID, TEXTOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_oid", 1028, OIDOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
+	{"_oid", OIDARRAYOID, OIDOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_char", 1002, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
+	{"_char", CHARARRAYOID, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT},
-	{"_aclitem", 1034, ACLITEMOID, -1, false, TYPALIGN_DOUBLE, TYPSTORAGE_EXTENDED, InvalidOid,
+	{"_aclitem", ACLITEMARRAYOID, ACLITEMOID, -1, false, TYPALIGN_DOUBLE, TYPSTORAGE_EXTENDED, InvalidOid,
 	 F_ARRAY_IN, F_ARRAY_OUT}
 };

From 379695d3cc70d040b547d912ce4842090d917ece Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Mon, 9 Feb 2026 08:00:59 +0900
Subject: [PATCH 051/147] pgcrypto: Fix buffer overflow in pgp_pub_decrypt_bytea()

pgp_pub_decrypt_bytea() was missing a safeguard for the session key
length read from the message data, which can be given as input to
pgp_pub_decrypt_bytea().  This opens the possibility of a buffer
overflow on the session key data when the specified length is longer
than PGP_MAX_KEY, the maximum size of the buffer that the session data
is copied into.
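
For reference, the layout of the decrypted session-key message that the
fix below computes against (a sketch following RFC 4880, not code taken
from this patch):

    /*
     * msg[0]                  symmetric cipher algorithm
     * msg[1 .. msglen - 3]    session key itself
     * msg[msglen - 2 ..]      two-byte checksum of the key
     *
     * Hence sess_key_len = msglen - 3, which must be capped at
     * PGP_MAX_KEY before it is used as a memcpy() length.
     */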
A script that can rebuild the message and key data triggering the
overflow is included in this commit, based on contents provided by the
reporter, heavily edited by me.  A SQL test is added, based on the data
generated by the script.

Reported-by: Team Xint Code as part of zeroday.cloud
Author: Michael Paquier
Reviewed-by: Noah Misch
Security: CVE-2026-2005
Backpatch-through: 14
---
 contrib/pgcrypto/Makefile                     |   3 +-
 .../pgcrypto/expected/pgp-pubkey-session.out  |  47 ++
 contrib/pgcrypto/meson.build                  |   1 +
 contrib/pgcrypto/pgp-pubdec.c                 |  11 +-
 contrib/pgcrypto/px.c                         |   1 +
 contrib/pgcrypto/px.h                         |   2 +-
 contrib/pgcrypto/scripts/pgp_session_data.py  | 491 ++++++++++++++++++
 contrib/pgcrypto/sql/pgp-pubkey-session.sql   |  46 ++
 8 files changed, 599 insertions(+), 3 deletions(-)
 create mode 100644 contrib/pgcrypto/expected/pgp-pubkey-session.out
 create mode 100644 contrib/pgcrypto/scripts/pgp_session_data.py
 create mode 100644 contrib/pgcrypto/sql/pgp-pubkey-session.sql

diff --git a/contrib/pgcrypto/Makefile b/contrib/pgcrypto/Makefile
index 69afa37501..17d2b0c5ed 100644
--- a/contrib/pgcrypto/Makefile
+++ b/contrib/pgcrypto/Makefile
@@ -44,7 +44,8 @@ REGRESS = init md5 sha1 hmac-md5 hmac-sha1 blowfish rijndael \
 	sha2 des 3des cast5 \
 	crypt-des crypt-md5 crypt-blowfish crypt-xdes \
 	pgp-armor pgp-decrypt pgp-encrypt pgp-encrypt-md5 $(CF_PGP_TESTS) \
-	pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-info crypt-shacrypt
+	pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-pubkey-session \
+	pgp-info crypt-shacrypt

 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/pgcrypto/expected/pgp-pubkey-session.out b/contrib/pgcrypto/expected/pgp-pubkey-session.out
new file mode 100644
index 0000000000..f724d98eb2
--- /dev/null
+++ b/contrib/pgcrypto/expected/pgp-pubkey-session.out
@@ -0,0 +1,47 @@
+-- Test for overflow with session key at decrypt.
+-- Data automatically generated by scripts/pgp_session_data.py.
+-- See this file for details explaining how this data is generated.
+SELECT pgp_pub_decrypt_bytea( +'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 +da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 +94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd +0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 +3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 +a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 +b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d +8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc +0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 +57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 +ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 +67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 +060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 +2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 +5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d +135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, +'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad +9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f +f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 +07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 +23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 +f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c +138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 +c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 +18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 +e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 +de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 +239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 +ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 +9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e +74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c +3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 +58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 +507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd +183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 +25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 +3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 +cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 +ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 +7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 +487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 +9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); +ERROR: Public key too big diff --git a/contrib/pgcrypto/meson.build b/contrib/pgcrypto/meson.build index c9c48f16f9..4f255c8cb0 100644 --- a/contrib/pgcrypto/meson.build +++ b/contrib/pgcrypto/meson.build @@ -52,6 +52,7 @@ pgcrypto_regress = [ 'pgp-encrypt-md5', 'pgp-pubkey-decrypt', 'pgp-pubkey-encrypt', + 'pgp-pubkey-session', 'pgp-info', 'crypt-shacrypt' ] diff --git a/contrib/pgcrypto/pgp-pubdec.c 
b/contrib/pgcrypto/pgp-pubdec.c index a0a5738a40..2a13aa3e6a 100644 --- a/contrib/pgcrypto/pgp-pubdec.c +++ b/contrib/pgcrypto/pgp-pubdec.c @@ -157,6 +157,7 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) uint8 *msg; int msglen; PGP_MPI *m; + unsigned sess_key_len; pk = ctx->pub_key; if (pk == NULL) @@ -220,11 +221,19 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) if (res < 0) goto out; + sess_key_len = msglen - 3; + if (sess_key_len > PGP_MAX_KEY) + { + px_debug("incorrect session key length=%u", sess_key_len); + res = PXE_PGP_KEY_TOO_BIG; + goto out; + } + /* * got sesskey */ ctx->cipher_algo = *msg; - ctx->sess_key_len = msglen - 3; + ctx->sess_key_len = sess_key_len; memcpy(ctx->sess_key, msg + 1, ctx->sess_key_len); out: diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c index 4d668d4e49..d9bf1aae81 100644 --- a/contrib/pgcrypto/px.c +++ b/contrib/pgcrypto/px.c @@ -65,6 +65,7 @@ static const struct error_desc px_err_list[] = { {PXE_PGP_UNEXPECTED_PKT, "Unexpected packet in key data"}, {PXE_PGP_MATH_FAILED, "Math operation failed"}, {PXE_PGP_SHORT_ELGAMAL_KEY, "Elgamal keys must be at least 1024 bits long"}, + {PXE_PGP_KEY_TOO_BIG, "Public key too big"}, {PXE_PGP_UNKNOWN_PUBALGO, "Unknown public-key encryption algorithm"}, {PXE_PGP_WRONG_KEY, "Wrong key"}, {PXE_PGP_MULTIPLE_KEYS, diff --git a/contrib/pgcrypto/px.h b/contrib/pgcrypto/px.h index 4b81fceab8..a09533a358 100644 --- a/contrib/pgcrypto/px.h +++ b/contrib/pgcrypto/px.h @@ -75,7 +75,7 @@ /* -108 is unused */ #define PXE_PGP_MATH_FAILED -109 #define PXE_PGP_SHORT_ELGAMAL_KEY -110 -/* -111 is unused */ +#define PXE_PGP_KEY_TOO_BIG -111 #define PXE_PGP_UNKNOWN_PUBALGO -112 #define PXE_PGP_WRONG_KEY -113 #define PXE_PGP_MULTIPLE_KEYS -114 diff --git a/contrib/pgcrypto/scripts/pgp_session_data.py b/contrib/pgcrypto/scripts/pgp_session_data.py new file mode 100644 index 0000000000..999350bb2b --- /dev/null +++ b/contrib/pgcrypto/scripts/pgp_session_data.py @@ -0,0 +1,491 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# Generate PGP data to check the session key length of the input data provided +# to pgp_pub_decrypt_bytea(). +# +# First, the crafted data is generated from valid RSA data, freshly generated +# by this script each time it is run, see generate_rsa_keypair(). +# Second, the crafted PGP data is built, see build_message_data() and +# build_key_data(). Finally, the resulting SQL script is generated. +# +# This script generates in stdout the SQL file that is used in the regression +# tests of pgcrypto. The following command can be used to regenerate the file +# which should never be manually manipulated: +# python3 scripts/pgp_session_data.py > sql/pgp-pubkey-session.sql + +import os +import re +import struct +import secrets +import sys +import time + +# pwn for binary manipulation (p32, p64) +from pwn import * + +# Cryptographic libraries, to craft the PGP data. +from Crypto.Cipher import AES +from Crypto.PublicKey import RSA +from Crypto.Util.number import inverse + +# AES key used for session key encryption (16 bytes for AES-128) +AES_KEY = b'\x01' * 16 + +def generate_rsa_keypair(key_size: int = 2048) -> dict: + """ + Generate a fresh RSA key pair. 
+
+    The generated key includes all components needed for PGP operations:
+    - n: public modulus (p * q)
+    - e: public exponent (typically 65537)
+    - d: private exponent (e^-1 mod phi(n))
+    - p, q: prime factors of n
+    - u: coefficient (p^-1 mod q) for CRT optimization
+
+    The caller can pass the wanted key size as input; the default is 2048
+    bits.  This function returns the RSA key components, after performing
+    some validation on them.
+    """
+
+    start_time = time.time()
+
+    # Generate RSA key
+    key = RSA.generate(key_size)
+
+    # Extract all key components
+    rsa_components = {
+        'n': key.n,  # Public modulus (p * q)
+        'e': key.e,  # Public exponent (typically 65537)
+        'd': key.d,  # Private exponent (e^-1 mod phi(n))
+        'p': key.p,  # First prime factor
+        'q': key.q,  # Second prime factor
+        'u': inverse(key.p, key.q)  # Coefficient for CRT: p^-1 mod q
+    }
+
+    # Validate key components for correctness
+    validate_rsa_key(rsa_components)
+
+    return rsa_components
+
+def validate_rsa_key(rsa: dict) -> None:
+    """
+    Validate a generated RSA key.
+
+    This function performs basic validation to ensure the RSA key is properly
+    constructed and all components are consistent, at least mathematically.
+
+    Validations performed:
+    1. n = p * q (modulus is product of primes)
+    2. gcd(e, phi(n)) = 1 (public exponent is coprime to phi(n))
+    3. (d * e) mod phi(n) = 1 (private exponent is multiplicative inverse)
+    4. (u * p) mod q = 1 (coefficient is correct for CRT)
+    """
+
+    n, e, d, p, q, u = rsa['n'], rsa['e'], rsa['d'], rsa['p'], rsa['q'], rsa['u']
+
+    # Check that n = p * q
+    if n != p * q:
+        raise ValueError("RSA validation failed: n <> p * q")
+
+    # Check that p and q are different
+    if p == q:
+        raise ValueError("RSA validation failed: p = q (not allowed)")
+
+    # Calculate phi(n) = (p-1)(q-1)
+    phi_n = (p - 1) * (q - 1)
+
+    # Check that gcd(e, phi(n)) = 1
+    def gcd(a, b):
+        while b:
+            a, b = b, a % b
+        return a
+
+    if gcd(e, phi_n) != 1:
+        raise ValueError("RSA validation failed: gcd(e, phi(n)) <> 1")
+
+    # Check that (d * e) mod phi(n) = 1
+    if (d * e) % phi_n != 1:
+        raise ValueError("RSA validation failed: d * e <> 1 (mod phi(n))")
+
+    # Check that (u * p) mod q = 1
+    if (u * p) % q != 1:
+        raise ValueError("RSA validation failed: u * p <> 1 (mod q)")
+
+def mpi_encode(x: int) -> bytes:
+    """
+    Encode an integer as an OpenPGP Multi-Precision Integer (MPI).
+
+    Format (RFC 4880, Section 3.2):
+    - 2 bytes: bit length of the integer (big-endian)
+    - N bytes: the integer in big-endian format
+
+    This is used to encode RSA key components (n, e, d, p, q, u) in PGP
+    packets.
+
+    The integer to encode is given as input; an MPI-encoded integer is
+    returned.
+
+    For example:
+        mpi_encode(65537) -> b'\x00\x11\x01\x00\x01'
+        (17 bits, value 0x010001)
+    """
+    if x < 0:
+        raise ValueError("MPI cannot encode negative integers")
+
+    if x == 0:
+        # Special case: zero has 0 bits and empty magnitude
+        bits = 0
+        mag = b""
+    else:
+        # Calculate bit length and convert to bytes
+        bits = x.bit_length()
+        mag = x.to_bytes((bits + 7) // 8, 'big')
+
+    # Pack: 2-byte bit length + magnitude bytes
+    return struct.pack('>H', bits) + mag
+
+def new_packet(tag: int, payload: bytes) -> bytes:
+    """
+    Create a new OpenPGP packet with a proper header.
+ + OpenPGP packet format (RFC 4880, Section 4.2): + - New packet format: 0xC0 | tag + - Length encoding depends on payload size: + * 0-191: single byte + * 192-8383: two bytes (192 + ((length - 192) >> 8), (length - 192) & 0xFF) + * 8384+: five bytes (0xFF + 4-byte big-endian length) + + The packet is built from a "tag" (1-63) and some "payload" data. The + result generated is a complete OpenPGP packet. + + For example: + new_packet(1, b'data') -> b'\xC1\x04data' + (Tag 1, length 4, payload 'data') + """ + # New packet format: set bit 7 and 6, clear bit 5, tag in bits 0-5 + first = 0xC0 | (tag & 0x3F) + ln = len(payload) + + # Encode length according to OpenPGP specification + if ln <= 191: + # Single byte length for small packets + llen = bytes([ln]) + elif ln <= 8383: + # Two-byte length for medium packets + ln2 = ln - 192 + llen = bytes([192 + (ln2 >> 8), ln2 & 0xFF]) + else: + # Five-byte length for large packets + llen = bytes([255]) + struct.pack('>I', ln) + + return bytes([first]) + llen + payload + +def build_key_data(rsa: dict) -> bytes: + """ + Build the key data, containing an RSA private key. + + The RSA contents should have been generated previously. + + Format (see RFC 4880, Section 5.5.3): + - 1 byte: version (4) + - 4 bytes: creation time (current Unix timestamp) + - 1 byte: public key algorithm (2 = RSA encrypt) + - MPI: RSA public modulus n + - MPI: RSA public exponent e + - 1 byte: string-to-key usage (0 = no encryption) + - MPI: RSA private exponent d + - MPI: RSA prime p + - MPI: RSA prime q + - MPI: RSA coefficient u = p^-1 mod q + - 2 bytes: checksum of private key material + + This function takes a set of RSA key components in input (n, e, d, p, q, u) + and returns a secret key packet. + """ + + # Public key portion + ver = bytes([4]) # Version 4 key + ctime = struct.pack('>I', int(time.time())) # Current Unix timestamp + algo = bytes([2]) # RSA encrypt algorithm + n_mpi = mpi_encode(rsa['n']) # Public modulus + e_mpi = mpi_encode(rsa['e']) # Public exponent + pub = ver + ctime + algo + n_mpi + e_mpi + + # Private key portion + hide_type = bytes([0]) # No string-to-key encryption + d_mpi = mpi_encode(rsa['d']) # Private exponent + p_mpi = mpi_encode(rsa['p']) # Prime p + q_mpi = mpi_encode(rsa['q']) # Prime q + u_mpi = mpi_encode(rsa['u']) # Coefficient u = p^-1 mod q + + # Calculate checksum of private key material (simple sum mod 65536) + private_data = d_mpi + p_mpi + q_mpi + u_mpi + cksum = sum(private_data) & 0xFFFF + + secret = hide_type + private_data + struct.pack('>H', cksum) + payload = pub + secret + + return new_packet(7, payload) + +def pgp_cfb_encrypt_resync(key, plaintext): + """ + Implement OpenPGP CFB mode with resync. + + OpenPGP CFB mode is a variant of standard CFB with a resync operation + after the first two blocks. + + Algorithm (RFC 4880, Section 13.9): + 1. Block 1: FR=zeros, encrypt full block_size bytes + 2. Block 2: FR=block1, encrypt only 2 bytes + 3. Resync: FR = block1[2:] + block2 + 4. 
Remaining blocks: standard CFB mode
+
+    This function uses the following arguments:
+    - key: AES encryption key (16 bytes for AES-128)
+    - plaintext: Data to encrypt
+    """
+    block_size = 16  # AES block size
+    cipher = AES.new(key[:16], AES.MODE_ECB)  # Use ECB for manual CFB
+    ciphertext = b''
+
+    # Block 1: FR=zeros, encrypt full 16 bytes
+    FR = b'\x00' * block_size
+    FRE = cipher.encrypt(FR)  # Encrypt the feedback register
+    block1 = bytes(a ^ b for a, b in zip(FRE, plaintext[0:16]))
+    ciphertext += block1
+
+    # Block 2: FR=block1, encrypt only 2 bytes
+    FR = block1
+    FRE = cipher.encrypt(FR)
+    block2 = bytes(a ^ b for a, b in zip(FRE[0:2], plaintext[16:18]))
+    ciphertext += block2
+
+    # Resync: FR = block1[2:16] + block2[0:2]
+    # This is the key difference from standard CFB mode
+    FR = block1[2:] + block2
+
+    # Block 3+: Continue with standard CFB mode
+    pos = 18
+    while pos < len(plaintext):
+        FRE = cipher.encrypt(FR)
+        chunk_len = min(block_size, len(plaintext) - pos)
+        chunk = plaintext[pos:pos+chunk_len]
+        enc_chunk = bytes(a ^ b for a, b in zip(FRE[:chunk_len], chunk))
+        ciphertext += enc_chunk
+
+        # Update feedback register for next iteration
+        if chunk_len == block_size:
+            FR = enc_chunk
+        else:
+            # Partial block: pad with old FR bytes
+            FR = enc_chunk + FR[chunk_len:]
+        pos += chunk_len
+
+    return ciphertext
+
+def build_literal_data_packet(data: bytes) -> bytes:
+    """
+    Build a literal data packet containing a message.
+
+    Format (RFC 4880, Section 5.9):
+    - 1 byte: data format ('b' = binary, 't' = text, 'u' = UTF-8 text)
+    - 1 byte: filename length (0 = no filename)
+    - N bytes: filename (empty in this case)
+    - 4 bytes: date (current Unix timestamp)
+    - M bytes: literal data
+
+    Takes the literal data as input and returns the generated packet.
+    """
+    body = bytes([
+        ord('b'),  # Binary data format
+        0,         # Filename length (0 = no filename)
+    ]) + struct.pack('>I', int(time.time())) + data  # Current timestamp + data
+
+    return new_packet(11, body)
+
+def build_symenc_data_packet(sess_key: bytes, cipher_algo: int, payload: bytes) -> bytes:
+    """
+    Build a symmetrically-encrypted data packet using AES-128-CFB.
+
+    This packet contains encrypted data using the session key.  The format
+    includes a random prefix, for security (see RFC 4880, Section 5.7).
+
+    Packet structure:
+    - Random prefix (block_size bytes)
+    - Prefix repeat (last 2 bytes of prefix repeated)
+    - Encrypted literal data packet
+
+    This function uses the following set of arguments:
+    - sess_key: Session key for encryption
+    - cipher_algo: Cipher algorithm identifier (7 = AES-128); informational
+      only, as this implementation always uses AES-128
+    - payload: Data to encrypt (wrapped in literal data packet)
+    """
+    block_size = 16  # AES-128 block size
+    key = sess_key[:16]  # Use first 16 bytes for AES-128
+
+    # Create random prefix + repeat last 2 bytes (total 18 bytes)
+    # OpenPGP uses the repeated bytes as a quick sanity check at decryption
+    prefix_random = secrets.token_bytes(block_size)
+    prefix = prefix_random + prefix_random[-2:]  # 18 bytes total
+
+    # Wrap payload in literal data packet
+    literal_pkt = build_literal_data_packet(payload)
+
+    # Plaintext = prefix + literal data packet
+    plaintext = prefix + literal_pkt
+
+    # Encrypt using OpenPGP CFB mode with resync
+    ciphertext = pgp_cfb_encrypt_resync(key, plaintext)
+
+    return new_packet(9, ciphertext)
+
+def build_tag1_packet(rsa: dict, sess_key: bytes) -> bytes:
+    """
+    Build a public-key encrypted session key packet.
+
+    This is the central function of this script, as it is able to create
+    the packet triggering the overflow check.
This function can also be used to create
+    "legit" packet data.
+
+    Format (RFC 4880, Section 5.1):
+    - 1 byte: version (3)
+    - 8 bytes: key ID (0 = any key accepted)
+    - 1 byte: public key algorithm (2 = RSA encrypt)
+    - MPI: RSA-encrypted session key
+
+    Its arguments are the generated RSA key pair and the session key to
+    encrypt.  The latter is manipulated to trigger the overflow.
+
+    This function returns a complete packet containing the encrypted
+    session key.
+    """
+
+    # Calculate RSA modulus size in bytes
+    n_bytes = (rsa['n'].bit_length() + 7) // 8
+
+    # Session key message format:
+    # - 1 byte: symmetric cipher algorithm (7 = AES-128)
+    # - N bytes: session key
+    # - 2 bytes: checksum (simple sum of session key bytes)
+    algo_byte = bytes([7])  # AES-128 algorithm identifier
+    cksum = sum(sess_key) & 0xFFFF  # 16-bit checksum
+    M = algo_byte + sess_key + struct.pack('>H', cksum)
+
+    # PKCS#1 v1.5 padding construction
+    # Format: 0x02 || PS || 0x00 || M
+    # Total padded message must be exactly n_bytes long.
+    total_len = n_bytes  # Total length must equal modulus size in bytes
+    ps_len = total_len - len(M) - 2  # Subtract 2 for 0x02 and 0x00 bytes
+
+    if ps_len < 8:
+        raise ValueError(f"Padding string too short ({ps_len} bytes); need at least 8 bytes. "
+                         f"Message length: {len(M)}, Modulus size: {n_bytes} bytes")
+
+    # Create the padding string PS with all bytes set to 0xFF.  PS must not
+    # contain any zero byte, so that the 0x00 separator added below stays
+    # unambiguous.
+    PS = bytes([0xFF]) * ps_len
+
+    # Construct the complete padded message
+    # Normal PKCS#1 v1.5 padding: 0x02 || PS || 0x00 || M
+    padded = bytes([0x02]) + PS + bytes([0x00]) + M
+
+    # Verify padding construction
+    if len(padded) != n_bytes:
+        raise ValueError(f"Padded message length ({len(padded)}) doesn't match RSA modulus size ({n_bytes})")
+
+    # Convert padded message to integer and encrypt with RSA
+    m_int = int.from_bytes(padded, 'big')
+
+    # Ensure message is smaller than modulus (required for RSA)
+    if m_int >= rsa['n']:
+        raise ValueError("Padded message is larger than RSA modulus")
+
+    # RSA encryption: c = m^e mod n
+    c_int = pow(m_int, rsa['e'], rsa['n'])
+
+    # Encode encrypted result as MPI
+    c_mpi = mpi_encode(c_int)
+
+    # Build complete packet
+    ver = bytes([3])  # Version 3 packet
+    key_id = b"\x00" * 8  # Key ID (0 = any key accepted)
+    algo = bytes([2])  # RSA encrypt algorithm
+    payload = ver + key_id + algo + c_mpi
+
+    return new_packet(1, payload)
+
+def build_message_data(rsa: dict) -> bytes:
+    """
+    This function creates a crafted message, with an overlong session key
+    length.
+
+    This takes the previously generated RSA key components as input and
+    returns a concatenated set of PGP packets crafted for the purpose of
+    this test.
+    """
+
+    # Base prefix for session key (AES key + padding + size).
+    # Note that the crafted size is the important part for this test.
+    prefix = AES_KEY + b"\x00" * 16 + p32(0x10)
+
+    # Build encrypted data packet, legit.
+    sedata = build_symenc_data_packet(AES_KEY, cipher_algo=7, payload=b"\x0a\x00")
+
+    # Build multiple packets
+    packets = [
+        # First session key packet, legit.
+        build_tag1_packet(rsa, prefix),
+
+        # Encrypted data packet, legit.
+        sedata,
+
+        # Second session key packet: the crafted payload.
+        #
+        # This packet contains a crafted, overlong session key, able to
+        # trigger the overflow check in pgcrypto.  This is the critical
+        # part, and it deserves close attention if you are reading this
+        # code.
+        build_tag1_packet(rsa, prefix)
+    ]
+
+    return b"".join(packets)
+
+def main():
+    # Default key size.
+ # This number can be set to a higher number if wanted, like 4096. We + # just do not need to do that here. + key_size = 2048 + + # Generate fresh RSA key pair + rsa = generate_rsa_keypair(key_size) + + # Generate the message data. + print("### Building message data", file=sys.stderr) + message_data = build_message_data(rsa) + + # Build the key containing the RSA private key + print("### Building key data", file=sys.stderr) + key_data = build_key_data(rsa) + + # Convert to hexadecimal, for the bytea used in the SQL file. + message_data = message_data.hex() + key_data = key_data.hex() + + # Split each value into lines of 72 characters, for readability. + message_data = re.sub("(.{72})", "\\1\n", message_data, 0, re.DOTALL) + key_data = re.sub("(.{72})", "\\1\n", key_data, 0, re.DOTALL) + + # Get the script filename for documentation + file_basename = os.path.basename(__file__) + + # Output the SQL test case + print(f'''-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/{file_basename}. +-- See this file for details explaining how this data is generated. +SELECT pgp_pub_decrypt_bytea( +'\\x{message_data}'::bytea, +'\\x{key_data}'::bytea);''', + file=sys.stdout) + +if __name__ == "__main__": + main() diff --git a/contrib/pgcrypto/sql/pgp-pubkey-session.sql b/contrib/pgcrypto/sql/pgp-pubkey-session.sql new file mode 100644 index 0000000000..51792f1f4d --- /dev/null +++ b/contrib/pgcrypto/sql/pgp-pubkey-session.sql @@ -0,0 +1,46 @@ +-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/pgp_session_data.py. +-- See this file for details explaining how this data is generated. +SELECT pgp_pub_decrypt_bytea( +'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 +da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 +94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd +0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 +3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 +a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 +b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d +8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc +0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 +57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 +ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 +67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 +060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 +2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 +5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d +135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, +'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad +9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f +f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 +07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 +23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 +f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c +138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 +c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 
+18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847
+e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9
+de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0
+239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0
+ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9
+9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e
+74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c
+3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8
+58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549
+507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd
+183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302
+25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45
+3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103
+cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03
+ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8
+7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8
+487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75
+9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea);

From 54598670fe0a191f49848d1a1a8ab09d99616e71 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Tue, 20 Jan 2026 14:34:32 +0200
Subject: [PATCH 052/147] Remove 'charlen' argument from make_trigrams()

The function assumed that if charlen == bytelen, there are no multibyte
characters in the string.  That's sensible, but the callers were a
little careless in how they calculated the lengths.  The callers
converted the string to lowercase before calling make_trigrams(), and
the 'charlen' value was calculated *before* the conversion to lowercase
while 'bytelen' was calculated after the conversion.  If the lowercased
string had a different number of characters than the original,
make_trigrams() might incorrectly apply the fast path and treat all the
bytes as single-byte characters, or fail to apply the fast path (which
is harmless), or it might hit the "Assert(bytelen == charlen)"
assertion.

I'm not aware of any locale / character combinations where you could
hit that assertion in practice, i.e. where a string converted to
lowercase would have fewer characters than the original, but it seems
best to avoid making that assumption.

To fix, remove the 'charlen' argument.  To keep the performance when
there are no multibyte characters, always try the fast path first, but
check the input for multibyte characters as we go.  The check on each
byte adds some overhead, but the performance is close enough.  To
compensate, the find_word() function no longer needs to count the
characters.

This fixes one small bug in make_trigrams(): in the multibyte codepath,
it peeked at the byte just after the end of the input string.  When
compiled with IGNORECASE, that was harmless because there is always a
NUL byte or blank after the input string.  But with !IGNORECASE, the
call from generate_wildcard_trgm() doesn't guarantee that.

Backpatch to v18, but no further.  In previous versions lower-casing
was done character by character, and thus the assumption that
lower-casing doesn't change the character length was valid.  That was
changed in v18, commit fb1a18810f.
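To illustrate the shape of the new fast path (a standalone sketch, not
the committed function; the helper name is made up for this note), the
scan simply watches for high-bit bytes as it advances, handing off to
the multibyte-aware loop on the first one it sees:

    /*
     * Sketch: count trigrams over 3-byte windows for as long as only
     * single-byte (ASCII) characters are in view.  Returns -1 as soon
     * as a high-bit byte appears, telling the caller to use the
     * multibyte-aware slow path instead.
     */
    static int
    count_ascii_trigrams(const unsigned char *s, int bytelen)
    {
    	int			n = 0;

    	for (int i = 0; i + 2 < bytelen; i++)
    	{
    		if ((s[i] | s[i + 1] | s[i + 2]) & 0x80)
    			return -1;		/* multibyte character: use slow path */
    		n++;				/* trigram at s[i .. i + 2] */
    	}
    	return n;
    }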
Security: CVE-2026-2007 Reviewed-by: Noah Misch --- contrib/pg_trgm/trgm_op.c | 116 ++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 81182a15e0..581ca48dd5 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -220,7 +220,7 @@ comp_trgm(const void *a, const void *b) * endword points to the character after word */ static char * -find_word(char *str, int lenstr, char **endword, int *charlen) +find_word(char *str, int lenstr, char **endword) { char *beginword = str; @@ -231,12 +231,8 @@ find_word(char *str, int lenstr, char **endword, int *charlen) return NULL; *endword = beginword; - *charlen = 0; while (*endword - str < lenstr && ISWORDCHR(*endword)) - { *endword += pg_mblen(*endword); - (*charlen)++; - } return beginword; } @@ -269,45 +265,82 @@ compact_trigram(trgm *tptr, char *str, int bytelen) } /* - * Adds trigrams from words (already padded). + * Adds trigrams from the word in 'str' (already padded if necessary). */ static trgm * -make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) +make_trigrams(trgm *tptr, char *str, int bytelen) { char *ptr = str; - if (charlen < 3) + if (bytelen < 3) return tptr; - if (bytelen > charlen) + if (pg_encoding_max_length(GetDatabaseEncoding()) == 1) { - /* Find multibyte character boundaries and apply compact_trigram */ - int lenfirst = pg_mblen(str), - lenmiddle = pg_mblen(str + lenfirst), - lenlast = pg_mblen(str + lenfirst + lenmiddle); - - while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) + while (ptr < str + bytelen - 2) { - compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast); - - ptr += lenfirst; + CPTRGM(tptr, ptr); + ptr++; tptr++; - - lenfirst = lenmiddle; - lenmiddle = lenlast; - lenlast = pg_mblen(ptr + lenfirst + lenmiddle); } } else { - /* Fast path when there are no multibyte characters */ - Assert(bytelen == charlen); + int lenfirst, + lenmiddle, + lenlast; + char *endptr; - while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ ) + /* + * Fast path as long as there are no multibyte characters + */ + if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1])) { - CPTRGM(tptr, ptr); - ptr++; + while (!IS_HIGHBIT_SET(ptr[2])) + { + CPTRGM(tptr, ptr); + ptr++; + tptr++; + + if (ptr == str + bytelen - 2) + return tptr; + } + + lenfirst = 1; + lenmiddle = 1; + lenlast = pg_mblen(ptr + 2); + } + else + { + lenfirst = pg_mblen(ptr); + if (ptr + lenfirst >= str + bytelen) + return tptr; + lenmiddle = pg_mblen(ptr + lenfirst); + if (ptr + lenfirst + lenmiddle >= str + bytelen) + return tptr; + lenlast = pg_mblen(ptr + lenfirst + lenmiddle); + } + + /* + * Slow path to handle any remaining multibyte characters + * + * As we go, 'ptr' points to the beginning of the current + * three-character string and 'endptr' points to just past it. 
+	 */
+	endptr = ptr + lenfirst + lenmiddle + lenlast;
+	while (endptr <= str + bytelen)
+	{
+		compact_trigram(tptr, ptr, endptr - ptr);
 			tptr++;
+
+		/* Advance to the next character */
+		if (endptr == str + bytelen)
+			break;
+		ptr += lenfirst;
+		lenfirst = lenmiddle;
+		lenmiddle = lenlast;
+		lenlast = pg_mblen(endptr);
+		endptr += lenlast;
 		}
 	}
@@ -328,8 +361,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 {
 	trgm	   *tptr;
 	char	   *buf;
-	int			charlen,
-				bytelen;
+	int			bytelen;
 	char	   *bword,
 			   *eword;
 
@@ -349,7 +381,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 	}
 
 	eword = str;
-	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
+	while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
 	{
 #ifdef IGNORECASE
 		bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
@@ -370,8 +402,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
 		/* Calculate trigrams marking their bounds if needed */
 		if (bounds)
 			bounds[tptr - trg] |= TRGM_BOUND_LEFT;
-		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
-							 charlen + LPADDING + RPADDING);
+		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING);
 		if (bounds)
 			bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
 	}
@@ -761,17 +792,16 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
 * str: source string, of length lenstr bytes (need not be null-terminated)
 * buf: where to return the substring (must be long enough)
 * *bytelen: receives byte length of the found substring
- * *charlen: receives character length of the found substring
 *
 * Returns pointer to end+1 of the found substring in the source string.
- * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ * Returns NULL if no word found (in which case buf and bytelen are not set)
 *
 * If the found word is bounded by non-word characters or string boundaries
 * then this function will include corresponding padding spaces into buf.
 */
static const char *
get_wildcard_part(const char *str, int lenstr,
-				  char *buf, int *bytelen, int *charlen)
+				  char *buf, int *bytelen)
 {
 	const char *beginword = str;
 	const char *endword;
@@ -820,18 +850,13 @@ get_wildcard_part(const char *str, int lenstr,
 	 * Add left padding spaces if preceding character wasn't wildcard
 	 * meta-character.
*/ - *charlen = 0; if (!in_leading_wildcard_meta) { if (LPADDING > 0) { *s++ = ' '; - (*charlen)++; if (LPADDING > 1) - { *s++ = ' '; - (*charlen)++; - } } } @@ -848,7 +873,6 @@ get_wildcard_part(const char *str, int lenstr, if (ISWORDCHR(endword)) { memcpy(s, endword, clen); - (*charlen)++; s += clen; } else @@ -876,7 +900,6 @@ get_wildcard_part(const char *str, int lenstr, else if (ISWORDCHR(endword)) { memcpy(s, endword, clen); - (*charlen)++; s += clen; } else @@ -894,12 +917,8 @@ get_wildcard_part(const char *str, int lenstr, if (RPADDING > 0) { *s++ = ' '; - (*charlen)++; if (RPADDING > 1) - { *s++ = ' '; - (*charlen)++; - } } } @@ -922,7 +941,6 @@ generate_wildcard_trgm(const char *str, int slen) *buf2; trgm *tptr; int len, - charlen, bytelen; const char *eword; @@ -945,7 +963,7 @@ generate_wildcard_trgm(const char *str, int slen) */ eword = str; while ((eword = get_wildcard_part(eword, slen - (eword - str), - buf, &bytelen, &charlen)) != NULL) + buf, &bytelen)) != NULL) { #ifdef IGNORECASE buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID); @@ -957,7 +975,7 @@ generate_wildcard_trgm(const char *str, int slen) /* * count trigrams */ - tptr = make_trigrams(tptr, buf2, bytelen, charlen); + tptr = make_trigrams(tptr, buf2, bytelen); #ifdef IGNORECASE pfree(buf2); From 00896ddaf41fa7b725991120678d544c18c6af70 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Jan 2026 11:53:28 +0200 Subject: [PATCH 053/147] Fix buffer overflows in pg_trgm due to lower-casing The code made a subtle assumption that the lower-cased version of a string never has more characters than the original. That is not always true. For example, in a database with the latin9 encoding: latin9db=# select lower(U&'\00CC' COLLATE "lt-x-icu"); lower ----------- i\x1A\x1A (1 row) In this example, lower-casing expands the single input character into three characters. The generate_trgm_only() function relied on that assumption in two ways: - It used "slen * pg_database_encoding_max_length() + 4" to allocate the buffer to hold the lowercased and blank-padded string. That formula accounts for expansion if the lower-case characters are longer (in bytes) than the originals, but it's still not enough if the lower-cased string contains more *characters* than the original. - Its callers sized the output array to hold the trigrams extracted from the input string with the formula "(slen / 2 + 1) * 3", where 'slen' is the input string length in bytes. (The formula was generous to account for the possibility that RPADDING was set to 2.) That's also not enough if one input byte can turn into multiple characters. To fix, introduce a growable trigram array and give up on trying to choose the correct max buffer sizes ahead of time. Backpatch to v18, but no further. In previous versions lower-casing was done character by character, and thus the assumption that lower-casing doesn't change the character length was valid. That was changed in v18, commit fb1a18810f. 
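The idea can be sketched as follows (illustration only, with made-up
names and plain C, assuming <stdlib.h> and <stdbool.h>; the committed
code uses palloc/repalloc and PostgreSQL's MaxAllocSize): every append
first reserves space through an overflow-checked helper instead of
trusting a precomputed worst case:

    /*
     * Sketch: make room for 'extra' more items of 'item_size' bytes,
     * refusing any request that would exceed 'max_alloc'.  Error
     * handling is simplified for brevity.
     */
    static bool
    reserve_items(char **buf, size_t *allocated, size_t used,
    			  size_t extra, size_t item_size, size_t max_alloc)
    {
    	size_t		needed = used + extra;

    	if (needed > max_alloc / item_size)
    		return false;		/* would overflow the allocation cap */
    	if (needed > *allocated)
    	{
    		*buf = realloc(*buf, needed * item_size);
    		if (*buf == NULL)
    			return false;
    		*allocated = needed;
    	}
    	return true;
    }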
Security: CVE-2026-2007 Reviewed-by: Noah Misch Reviewed-by: Jeff Davis --- contrib/pg_trgm/trgm_op.c | 275 +++++++++++++++++++++---------- src/tools/pgindent/typedefs.list | 1 + 2 files changed, 185 insertions(+), 91 deletions(-) diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 581ca48dd5..4bb5506647 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -66,6 +66,78 @@ typedef uint8 TrgmBound; #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match * word bounds */ +/* + * A growable array of trigrams + * + * The actual array of trigrams is in 'datum'. Note that the other fields in + * 'datum', i.e. datum->flags and the varlena length, are not kept up to date + * when items are added to the growable array. We merely reserve the space + * for them here. You must fill those other fields before using 'datum' as a + * proper TRGM datum. + */ +typedef struct +{ + TRGM *datum; /* trigram array */ + int length; /* number of trigrams in the array */ + int allocated; /* allocated size of 'datum' (# of trigrams) */ +} growable_trgm_array; + +/* + * Allocate a new growable array. + * + * 'slen' is the size of the source string that we're extracting the trigrams + * from. It is used to choose the initial size of the array. + */ +static void +init_trgm_array(growable_trgm_array *arr, int slen) +{ + size_t init_size; + + /* + * In the extreme case, the input string consists entirely of one + * character words, like "a b c", where each word is expanded to two + * trigrams. This is not a strict upper bound though, because when + * IGNORECASE is defined, we convert the input string to lowercase before + * extracting the trigrams, which in rare cases can expand one input + * character into multiple characters. + */ + init_size = (size_t) slen + 1; + + /* + * Guard against possible overflow in the palloc request. (We don't worry + * about the additive constants, since palloc can detect requests that are + * a little above MaxAllocSize --- we just need to prevent integer + * overflow in the multiplications.) + */ + if (init_size > MaxAllocSize / sizeof(trgm)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"))); + + arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size)); + arr->allocated = init_size; + arr->length = 0; +} + +/* Make sure the array can hold at least 'needed' more trigrams */ +static void +enlarge_trgm_array(growable_trgm_array *arr, int needed) +{ + size_t new_needed = (size_t) arr->length + needed; + + if (new_needed > arr->allocated) + { + /* Guard against possible overflow, like in init_trgm_array */ + if (new_needed > MaxAllocSize / sizeof(trgm)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"))); + + arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed)); + arr->allocated = new_needed; + } +} + /* * Module load callback */ @@ -267,13 +339,18 @@ compact_trigram(trgm *tptr, char *str, int bytelen) /* * Adds trigrams from the word in 'str' (already padded if necessary). 
*/ -static trgm * -make_trigrams(trgm *tptr, char *str, int bytelen) +static void +make_trigrams(growable_trgm_array *dst, char *str, int bytelen) { + trgm *tptr; char *ptr = str; if (bytelen < 3) - return tptr; + return; + + /* max number of trigrams = strlen - 2 */ + enlarge_trgm_array(dst, bytelen - 2); + tptr = GETARR(dst->datum) + dst->length; if (pg_encoding_max_length(GetDatabaseEncoding()) == 1) { @@ -303,7 +380,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen) tptr++; if (ptr == str + bytelen - 2) - return tptr; + goto done; } lenfirst = 1; @@ -314,10 +391,10 @@ make_trigrams(trgm *tptr, char *str, int bytelen) { lenfirst = pg_mblen(ptr); if (ptr + lenfirst >= str + bytelen) - return tptr; + goto done; lenmiddle = pg_mblen(ptr + lenfirst); if (ptr + lenfirst + lenmiddle >= str + bytelen) - return tptr; + goto done; lenlast = pg_mblen(ptr + lenfirst + lenmiddle); } @@ -344,35 +421,54 @@ make_trigrams(trgm *tptr, char *str, int bytelen) } } - return tptr; +done: + dst->length = tptr - GETARR(dst->datum); + Assert(dst->length <= dst->allocated); } /* * Make array of trigrams without sorting and removing duplicate items. * - * trg: where to return the array of trigrams. + * dst: where to return the array of trigrams. * str: source string, of length slen bytes. - * bounds: where to return bounds of trigrams (if needed). - * - * Returns length of the generated array. + * bounds_p: where to return bounds of trigrams (if needed). */ -static int -generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) +static void +generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p) { - trgm *tptr; + size_t buflen; char *buf; int bytelen; char *bword, *eword; + TrgmBound *bounds = NULL; + int bounds_allocated = 0; - if (slen + LPADDING + RPADDING < 3 || slen == 0) - return 0; + init_trgm_array(dst, slen); - tptr = trg; + /* + * If requested, allocate an array for the bounds, with the same size as + * the trigram array. + */ + if (bounds_p) + { + bounds_allocated = dst->allocated; + bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated); + } - /* Allocate a buffer for case-folded, blank-padded words */ - buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4); + if (slen + LPADDING + RPADDING < 3 || slen == 0) + return; + /* + * Allocate a buffer for case-folded, blank-padded words. + * + * As an initial guess, allocate a buffer large enough to hold the + * original string with padding, which is always enough when compiled with + * !IGNORECASE. If the case-folding produces a string longer than the + * original, we'll grow the buffer. 
+ */ + buflen = (size_t) slen + 4; + buf = (char *) palloc(buflen); if (LPADDING > 0) { *buf = ' '; @@ -383,49 +479,57 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) eword = str; while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL) { + int oldlen; + + /* Convert word to lower case before extracting trigrams from it */ #ifdef IGNORECASE - bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID); - bytelen = strlen(bword); + { + char *lowered; + + lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID); + bytelen = strlen(lowered); + + /* grow the buffer if necessary */ + if (bytelen > buflen - 4) + { + pfree(buf); + buflen = (size_t) bytelen + 4; + buf = (char *) palloc(buflen); + if (LPADDING > 0) + { + *buf = ' '; + if (LPADDING > 1) + *(buf + 1) = ' '; + } + } + memcpy(buf + LPADDING, lowered, bytelen); + pfree(lowered); + } #else bytelen = eword - bword; -#endif - memcpy(buf + LPADDING, bword, bytelen); - -#ifdef IGNORECASE - pfree(bword); #endif buf[LPADDING + bytelen] = ' '; buf[LPADDING + bytelen + 1] = ' '; /* Calculate trigrams marking their bounds if needed */ + oldlen = dst->length; + make_trigrams(dst, buf, bytelen + LPADDING + RPADDING); if (bounds) - bounds[tptr - trg] |= TRGM_BOUND_LEFT; - tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING); - if (bounds) - bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT; + { + if (bounds_allocated < dst->length) + { + bounds = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated); + bounds_allocated = dst->allocated; + } + + bounds[oldlen] |= TRGM_BOUND_LEFT; + bounds[dst->length - 1] |= TRGM_BOUND_RIGHT; + } } pfree(buf); - - return tptr - trg; -} - -/* - * Guard against possible overflow in the palloc requests below. (We - * don't worry about the additive constants, since palloc can detect - * requests that are a little above MaxAllocSize --- we just need to - * prevent integer overflow in the multiplications.) - */ -static void -protect_out_of_mem(int slen) -{ - if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) || - (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length())) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of memory"))); } /* @@ -439,19 +543,14 @@ TRGM * generate_trgm(char *str, int slen) { TRGM *trg; + growable_trgm_array arr; int len; - protect_out_of_mem(slen); - - trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); + generate_trgm_only(&arr, str, slen, NULL); + len = arr.length; + trg = arr.datum; trg->flag = ARRKEY; - len = generate_trgm_only(GETARR(trg), str, slen, NULL); - SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); - - if (len == 0) - return trg; - /* * Make trigrams unique. 
*/ @@ -706,8 +805,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, { bool *found; pos_trgm *ptrg; - trgm *trg1; - trgm *trg2; + growable_trgm_array trg1; + growable_trgm_array trg2; int len1, len2, len, @@ -716,27 +815,21 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, ulen1; int *trg2indexes; float4 result; - TrgmBound *bounds; - - protect_out_of_mem(slen1 + slen2); + TrgmBound *bounds = NULL; /* Make positional trigrams */ - trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3); - trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3); - if (flags & WORD_SIMILARITY_STRICT) - bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3); - else - bounds = NULL; - len1 = generate_trgm_only(trg1, str1, slen1, NULL); - len2 = generate_trgm_only(trg2, str2, slen2, bounds); + generate_trgm_only(&trg1, str1, slen1, NULL); + len1 = trg1.length; + generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL); + len2 = trg2.length; - ptrg = make_positional_trgm(trg1, len1, trg2, len2); + ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2); len = len1 + len2; qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm); - pfree(trg1); - pfree(trg2); + pfree(trg1.datum); + pfree(trg2.datum); /* * Merge positional trigrams array: enumerate each trigram and find its @@ -937,23 +1030,21 @@ TRGM * generate_wildcard_trgm(const char *str, int slen) { TRGM *trg; - char *buf, - *buf2; - trgm *tptr; + growable_trgm_array arr; + char *buf; int len, bytelen; const char *eword; - protect_out_of_mem(slen); - - trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); - trg->flag = ARRKEY; - SET_VARSIZE(trg, TRGMHDRSIZE); - if (slen + LPADDING + RPADDING < 3 || slen == 0) + { + trg = (TRGM *) palloc(TRGMHDRSIZE); + trg->flag = ARRKEY; + SET_VARSIZE(trg, TRGMHDRSIZE); return trg; + } - tptr = GETARR(trg); + init_trgm_array(&arr, slen); /* Allocate a buffer for blank-padded, but not yet case-folded, words */ buf = palloc_array(char, slen + 4); @@ -965,37 +1056,39 @@ generate_wildcard_trgm(const char *str, int slen) while ((eword = get_wildcard_part(eword, slen - (eword - str), buf, &bytelen)) != NULL) { + char *word; + #ifdef IGNORECASE - buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID); - bytelen = strlen(buf2); + word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID); + bytelen = strlen(word); #else - buf2 = buf; + word = buf; #endif /* * count trigrams */ - tptr = make_trigrams(tptr, buf2, bytelen); + make_trigrams(&arr, word, bytelen); #ifdef IGNORECASE - pfree(buf2); + pfree(word); #endif } pfree(buf); - if ((len = tptr - GETARR(trg)) == 0) - return trg; - /* * Make trigrams unique. */ + trg = arr.datum; + len = arr.length; if (len > 1) { qsort(GETARR(trg), len, sizeof(trgm), comp_trgm); len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm); } + trg->flag = ARRKEY; SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); return trg; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9f5ee8fd48..7751848941 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3737,6 +3737,7 @@ gistxlogPageReuse gistxlogPageSplit gistxlogPageUpdate grouping_sets_data +growable_trgm_array gseg_picksplit_item gss_OID_set gss_buffer_desc From af79c30dc3e5369cd6d2bfdccd2c2c0ffbd60ef3 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Thu, 5 Feb 2026 01:04:24 +1300 Subject: [PATCH 054/147] Fix encoding length for EUC_CN. 
While EUC_CN supports only 1- and 2-byte sequences (CS0, CS1), the mb<->wchar conversion functions allow 3-byte sequences beginning SS2, SS3. Change pg_encoding_max_length() to return 3, not 2, to close a hypothesized buffer overrun if a corrupted string is converted to wchar and back again in a newly allocated buffer. We might reconsider that in master (ie harmonizing in a different direction), but this change seems better for the back-branches. Also change pg_euccn_mblen() to report SS2 and SS3 characters as having length 3 (following the example of EUC_KR). Even though such characters would not pass verification, it's remotely possible that invalid bytes could be used to compute a buffer size for use in wchar conversion. Security: CVE-2026-2006 Backpatch-through: 14 Author: Thomas Munro Reviewed-by: Noah Misch Reviewed-by: Heikki Linnakangas --- src/common/wchar.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 5631e2c936..191f3552ed 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -267,12 +267,22 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) return cnt; } +/* + * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for + * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that + * relies on agreement between mb2wchar_with_len and mblen. Invalid text + * datums (e.g. from shared catalogs) reach this. + */ static int pg_euccn_mblen(const unsigned char *s) { int len; - if (IS_HIGHBIT_SET(*s)) + if (*s == SS2) + len = 3; + else if (*s == SS3) + len = 3; + else if (IS_HIGHBIT_SET(*s)) len = 2; else len = 1; @@ -2064,7 +2074,7 @@ pg_encoding_set_invalid(int encoding, char *dst) const pg_wchar_tbl pg_wchar_table[] = { [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, - [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2}, + [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3}, [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3}, [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4}, [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, From 74ee636cc93d919c845e3e3ad3642e6366ce1802 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 26 Jan 2026 11:22:32 +1300 Subject: [PATCH 055/147] Fix mb2wchar functions on short input. When converting multibyte to pg_wchar, the UTF-8 implementation would silently ignore an incomplete final character, while the other implementations would cast a single byte to pg_wchar, and then repeat for the remaining byte sequence. While it didn't overrun the buffer, it was surely garbage output. Make all encodings behave like the UTF-8 implementation. A later change for master only will convert this to an error, but we choose not to back-patch that behavior change on the off-chance that someone is relying on the existing UTF-8 behavior. 
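Schematically, the corrected loop shape for a hypothetical 2-byte
high-bit encoding looks like this (a sketch, not one of the patched
functions; IS_HIGHBIT_SET is the usual PostgreSQL macro):

    while (len > 0 && *from)
    {
    	if (IS_HIGHBIT_SET(*from))
    	{
    		if (len < 2)
    			break;			/* incomplete trailing character: drop it */
    		*to++ = (from[0] << 8) | from[1];
    		from += 2;
    		len -= 2;
    	}
    	else
    	{
    		*to++ = *from++;	/* plain single-byte character */
    		len--;
    	}
    }

Previously, the non-UTF-8 implementations fell through to the
single-byte branch when too few bytes remained, emitting the dangling
lead byte (and whatever followed) as individual pg_wchar values.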
Security: CVE-2026-2006 Backpatch-through: 14 Author: Thomas Munro Reported-by: Noah Misch Reviewed-by: Noah Misch Reviewed-by: Heikki Linnakangas --- src/common/wchar.c | 52 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 191f3552ed..eb15ee5949 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -63,6 +63,9 @@ * subset to the ASCII routines to ensure consistency. */ +/* No error-reporting facility. Ignore incomplete trailing byte sequence. */ +#define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break + /* * SQL/ASCII */ @@ -108,22 +111,24 @@ pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte - * KANA") */ + if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */ { + MB2CHAR_NEED_AT_LEAST(len, 2); from++; *to = (SS2 << 8) | *from++; len -= 2; } - else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */ + else if (*from == SS3) /* JIS X 0212 KANJI */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS3 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */ + else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */ { + MB2CHAR_NEED_AT_LEAST(len, 2); *to = *from++ << 8; *to |= *from++; len -= 2; @@ -235,22 +240,25 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (*from == SS2 && len >= 3) /* code set 2 (unused?) */ + if (*from == SS2) /* code set 2 (unused?) */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS2 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */ + else if (*from == SS3) /* code set 3 (unused ?) */ { + MB2CHAR_NEED_AT_LEAST(len, 3); from++; *to = (SS3 << 16) | (*from++ << 8); *to |= *from++; len -= 3; } - else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */ + else if (IS_HIGHBIT_SET(*from)) /* code set 1 */ { + MB2CHAR_NEED_AT_LEAST(len, 2); *to = *from++ << 8; *to |= *from++; len -= 2; @@ -312,23 +320,26 @@ pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) while (len > 0 && *from) { - if (*from == SS2 && len >= 4) /* code set 2 */ + if (*from == SS2) /* code set 2 */ { + MB2CHAR_NEED_AT_LEAST(len, 4); from++; *to = (((uint32) SS2) << 24) | (*from++ << 16); *to |= *from++ << 8; *to |= *from++; len -= 4; } - else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */ + else if (*from == SS3) /* code set 3 (unused?) 
*/
 		{
+			MB2CHAR_NEED_AT_LEAST(len, 3);
 			from++;
 			*to = (SS3 << 16) | (*from++ << 8);
 			*to |= *from++;
 			len -= 3;
 		}
-		else if (IS_HIGHBIT_SET(*from) && len >= 2)	/* code set 1 */
+		else if (IS_HIGHBIT_SET(*from))	/* code set 1 */
 		{
+			MB2CHAR_NEED_AT_LEAST(len, 2);
 			*to = *from++ << 8;
 			*to |= *from++;
 			len -= 2;
 		}
@@ -465,8 +476,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 		}
 		else if ((*from & 0xe0) == 0xc0)
 		{
-			if (len < 2)
-				break;			/* drop trailing incomplete char */
+			MB2CHAR_NEED_AT_LEAST(len, 2);
 			c1 = *from++ & 0x1f;
 			c2 = *from++ & 0x3f;
 			*to = (c1 << 6) | c2;
@@ -474,8 +484,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 		}
 		else if ((*from & 0xf0) == 0xe0)
 		{
-			if (len < 3)
-				break;			/* drop trailing incomplete char */
+			MB2CHAR_NEED_AT_LEAST(len, 3);
 			c1 = *from++ & 0x0f;
 			c2 = *from++ & 0x3f;
 			c3 = *from++ & 0x3f;
@@ -484,8 +493,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 		}
 		else if ((*from & 0xf8) == 0xf0)
 		{
-			if (len < 4)
-				break;			/* drop trailing incomplete char */
+			MB2CHAR_NEED_AT_LEAST(len, 4);
 			c1 = *from++ & 0x07;
 			c2 = *from++ & 0x3f;
 			c3 = *from++ & 0x3f;
@@ -687,28 +695,32 @@ pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 	while (len > 0 && *from)
 	{
-		if (IS_LC1(*from) && len >= 2)
+		if (IS_LC1(*from))
 		{
+			MB2CHAR_NEED_AT_LEAST(len, 2);
 			*to = *from++ << 16;
 			*to |= *from++;
 			len -= 2;
 		}
-		else if (IS_LCPRV1(*from) && len >= 3)
+		else if (IS_LCPRV1(*from))
 		{
+			MB2CHAR_NEED_AT_LEAST(len, 3);
 			from++;
 			*to = *from++ << 16;
 			*to |= *from++;
 			len -= 3;
 		}
-		else if (IS_LC2(*from) && len >= 3)
+		else if (IS_LC2(*from))
 		{
+			MB2CHAR_NEED_AT_LEAST(len, 3);
 			*to = *from++ << 16;
 			*to |= *from++ << 8;
 			*to |= *from++;
 			len -= 3;
 		}
-		else if (IS_LCPRV2(*from) && len >= 4)
+		else if (IS_LCPRV2(*from))
 		{
+			MB2CHAR_NEED_AT_LEAST(len, 4);
 			from++;
 			*to = *from++ << 16;
 			*to |= *from++ << 8;

From 1e7fe06c10c0a8da9dd6261a6be8d405dc17c728 Mon Sep 17 00:00:00 2001
From: Thomas Munro
Date: Wed, 7 Jan 2026 22:14:31 +1300
Subject: [PATCH 056/147] Replace pg_mblen() with bounds-checked versions.

A corrupted string could cause code that iterates with pg_mblen() to
overrun its buffer.  Fix by converting all callers to one of the
following:

1. Callers with a null-terminated string now use pg_mblen_cstr(), which
raises an "illegal byte sequence" error if it finds a terminator in the
middle of the sequence.

2. Callers with a length or end pointer now use either
pg_mblen_with_len() or pg_mblen_range(), for the same effect, depending
on which of the two seems more convenient at each site.

3. A small number of cases pre-validate a string, and can use
pg_mblen_unbounded().

The traditional pg_mblen() function and COPYCHAR macro still exist for
backward compatibility, but are no longer used by core code and are
hereby deprecated.  The same applies to the t_isXXX() functions.
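As an illustration of category 2, a typical caller conversion has this
shape (schematic before/after, not a specific hunk from this patch;
do_something() stands in for whatever per-character work the caller
performs):

    /* Before: a truncated multibyte character at the end of the buffer
     * can push 'p' beyond 'end'. */
    for (p = str; p < end; p += pg_mblen(p))
    	do_something(p);

    /* After: pg_mblen_range() raises an "illegal byte sequence" error
     * if the character would cross 'end', so 'p' cannot overrun. */
    for (p = str; p < end; p += pg_mblen_range(p, end))
    	do_something(p);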
Security: CVE-2026-2006 Backpatch-through: 14 Co-authored-by: Thomas Munro Co-authored-by: Noah Misch Reviewed-by: Heikki Linnakangas Reported-by: Paul Gerste (as part of zeroday.cloud) Reported-by: Moritz Sanft (as part of zeroday.cloud) --- contrib/btree_gist/btree_utils_var.c | 21 +++- contrib/dict_xsyn/dict_xsyn.c | 4 +- contrib/hstore/hstore_io.c | 2 +- contrib/ltree/crc32.c | 3 +- contrib/ltree/lquery_op.c | 4 +- contrib/ltree/ltree.h | 2 +- contrib/ltree/ltree_io.c | 8 +- contrib/ltree/ltxtquery_io.c | 2 +- contrib/pageinspect/heapfuncs.c | 2 +- contrib/pg_trgm/trgm.h | 2 +- contrib/pg_trgm/trgm_op.c | 52 +++++--- contrib/pg_trgm/trgm_regexp.c | 23 ++-- contrib/pgcrypto/crypt-sha.c | 2 +- contrib/unaccent/unaccent.c | 5 +- src/backend/catalog/pg_proc.c | 2 +- src/backend/tsearch/dict_synonym.c | 4 +- src/backend/tsearch/dict_thesaurus.c | 8 +- src/backend/tsearch/regis.c | 37 +++--- src/backend/tsearch/spell.c | 81 ++++++------ src/backend/tsearch/ts_locale.c | 56 +++++---- src/backend/tsearch/ts_utils.c | 2 +- src/backend/tsearch/wparser_def.c | 3 +- src/backend/utils/adt/encode.c | 6 +- src/backend/utils/adt/formatting.c | 22 ++-- src/backend/utils/adt/jsonfuncs.c | 2 +- src/backend/utils/adt/jsonpath_gram.y | 3 +- src/backend/utils/adt/levenshtein.c | 14 ++- src/backend/utils/adt/like.c | 18 +-- src/backend/utils/adt/like_match.c | 3 +- src/backend/utils/adt/oracle_compat.c | 33 +++-- src/backend/utils/adt/regexp.c | 9 +- src/backend/utils/adt/tsquery.c | 13 +- src/backend/utils/adt/tsvector.c | 11 +- src/backend/utils/adt/tsvector_op.c | 10 +- src/backend/utils/adt/tsvector_parser.c | 19 ++- src/backend/utils/adt/varbit.c | 8 +- src/backend/utils/adt/varlena.c | 38 +++--- src/backend/utils/adt/xml.c | 11 +- src/backend/utils/mb/mbutils.c | 150 +++++++++++++++++++++-- src/include/mb/pg_wchar.h | 7 ++ src/include/tsearch/ts_locale.h | 30 ++++- src/include/tsearch/ts_utils.h | 14 +-- src/test/modules/test_regex/test_regex.c | 3 +- 43 files changed, 485 insertions(+), 264 deletions(-) diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c index 6847e4e54d..f6ba1c0c82 100644 --- a/contrib/btree_gist/btree_utils_var.c +++ b/contrib/btree_gist/btree_utils_var.c @@ -115,36 +115,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo) /* * returns the common prefix length of a node key + * + * If the underlying type is character data, the prefix length may point in + * the middle of a multibyte character. 
*/ static int32 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) { GBT_VARKEY_R r = gbt_var_key_readable(node); int32 i = 0; - int32 l = 0; + int32 l_left_to_match = 0; + int32 l_total = 0; int32 t1len = VARSIZE(r.lower) - VARHDRSZ; int32 t2len = VARSIZE(r.upper) - VARHDRSZ; int32 ml = Min(t1len, t2len); char *p1 = VARDATA(r.lower); char *p2 = VARDATA(r.upper); + const char *end1 = p1 + t1len; + const char *end2 = p2 + t2len; if (ml == 0) return 0; while (i < ml) { - if (tinfo->eml > 1 && l == 0) + if (tinfo->eml > 1 && l_left_to_match == 0) { - if ((l = pg_mblen(p1)) != pg_mblen(p2)) + l_total = pg_mblen_range(p1, end1); + if (l_total != pg_mblen_range(p2, end2)) { return i; } + l_left_to_match = l_total; } if (*p1 != *p2) { if (tinfo->eml > 1) { - return (i - l + 1); + int32 l_matched_subset = l_total - l_left_to_match; + + /* end common prefix at final byte of last matching char */ + return i - l_matched_subset; } else { @@ -154,7 +165,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) p1++; p2++; - l--; + l_left_to_match--; i++; } return ml; /* lower == upper */ diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index 5c4917ce1f..9e3784e0f4 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -54,14 +54,14 @@ find_word(char *in, char **end) *end = NULL; while (*in && isspace((unsigned char) *in)) - in += pg_mblen(in); + in += pg_mblen_cstr(in); if (!*in || *in == '#') return NULL; start = in; while (*in && !isspace((unsigned char) *in)) - in += pg_mblen(in); + in += pg_mblen_cstr(in); *end = in; diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 34e3918811..9cdfcb5daa 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -67,7 +67,7 @@ prssyntaxerror(HSParser *state) errsave(state->escontext, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in hstore, near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int) (state->ptr - state->begin)))); /* In soft error situation, return false as convenience for caller */ return false; diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c index 3918d4a0ec..d21bed31fd 100644 --- a/contrib/ltree/crc32.c +++ b/contrib/ltree/crc32.c @@ -23,6 +23,7 @@ ltree_crc32_sz(const char *buf, int size) { pg_crc32 crc; const char *p = buf; + const char *end = buf + size; static pg_locale_t locale = NULL; if (!locale) @@ -32,7 +33,7 @@ ltree_crc32_sz(const char *buf, int size) while (size > 0) { char foldstr[UNICODE_CASEMAP_BUFSZ]; - int srclen = pg_mblen(p); + int srclen = pg_mblen_range(p, end); size_t foldlen; /* fold one codepoint at a time */ diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index a28ddbf40d..0adcdd8ff2 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -27,14 +27,14 @@ getlexeme(char *start, char *end, int *len) char *ptr; while (start < end && t_iseq(start, '_')) - start += pg_mblen(start); + start += pg_mblen_range(start, end); ptr = start; if (ptr >= end) return NULL; while (ptr < end && !t_iseq(ptr, '_')) - ptr += pg_mblen(ptr); + ptr += pg_mblen_range(ptr, end); *len = ptr - start; return start; diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h index 78478dec17..b0ded40eba 100644 --- a/contrib/ltree/ltree.h +++ b/contrib/ltree/ltree.h @@ -127,7 +127,7 @@ typedef struct #define LQUERY_HASNOT 0x01 /* valid label chars are alphanumerics, underscores and hyphens */ -#define ISLABEL(x) ( 
t_isalnum(x) || t_iseq(x, '_') || t_iseq(x, '-') ) +#define ISLABEL(x) ( t_isalnum_cstr(x) || t_iseq(x, '_') || t_iseq(x, '-') ) /* full text query */ diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c index 59c4462df8..54c4ca3c5c 100644 --- a/contrib/ltree/ltree_io.c +++ b/contrib/ltree/ltree_io.c @@ -54,7 +54,7 @@ parse_ltree(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; ptr += charlen; @@ -69,7 +69,7 @@ parse_ltree(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { @@ -291,7 +291,7 @@ parse_lquery(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; @@ -311,7 +311,7 @@ parse_lquery(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index 91a2222eaa..d15f323539 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -64,7 +64,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint for (;;) { - charlen = pg_mblen(state->buf); + charlen = pg_mblen_cstr(state->buf); switch (state->state) { diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 8277fa256c..2f0dfff175 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -101,7 +101,7 @@ text_to_bits(char *str, int len) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid character \"%.*s\" in t_bits string", - pg_mblen(str + off), str + off))); + pg_mblen_cstr(str + off), str + off))); if (off % 8 == 7) bits[off / 8] = byte; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index ca01758536..ca23aad4dd 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -47,7 +47,7 @@ typedef char trgm[3]; } while(0) extern int (*CMPTRGM) (const void *a, const void *b); -#define ISWORDCHR(c) (t_isalnum(c)) +#define ISWORDCHR(c, len) (t_isalnum_with_len(c, len)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) #define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 4bb5506647..5fba594b61 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -295,16 +295,29 @@ static char * find_word(char *str, int lenstr, char **endword) { char *beginword = str; + const char *endstr = str + lenstr; - while (beginword - str < lenstr && !ISWORDCHR(beginword)) - beginword += pg_mblen(beginword); + while (beginword < endstr) + { + int clen = pg_mblen_range(beginword, endstr); - if (beginword - str >= lenstr) + if (ISWORDCHR(beginword, clen)) + break; + beginword += clen; + } + + if (beginword >= endstr) return NULL; *endword = beginword; - while (*endword - str < lenstr && ISWORDCHR(*endword)) - *endword += pg_mblen(*endword); + while (*endword < endstr) + { + int clen = pg_mblen_range(*endword, endstr); + + if (!ISWORDCHR(*endword, clen)) + break; + *endword += clen; + } return beginword; } @@ -385,17 +398,17 @@ make_trigrams(growable_trgm_array *dst, char *str, int bytelen) lenfirst = 1; lenmiddle = 1; - 
lenlast = pg_mblen(ptr + 2); + lenlast = pg_mblen_unbounded(ptr + 2); } else { - lenfirst = pg_mblen(ptr); + lenfirst = pg_mblen_unbounded(ptr); if (ptr + lenfirst >= str + bytelen) goto done; - lenmiddle = pg_mblen(ptr + lenfirst); + lenmiddle = pg_mblen_unbounded(ptr + lenfirst); if (ptr + lenfirst + lenmiddle >= str + bytelen) goto done; - lenlast = pg_mblen(ptr + lenfirst + lenmiddle); + lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle); } /* @@ -416,7 +429,7 @@ make_trigrams(growable_trgm_array *dst, char *str, int bytelen) ptr += lenfirst; lenfirst = lenmiddle; lenmiddle = lenlast; - lenlast = pg_mblen(endptr); + lenlast = pg_mblen_unbounded(endptr); endptr += lenlast; } } @@ -898,6 +911,7 @@ get_wildcard_part(const char *str, int lenstr, { const char *beginword = str; const char *endword; + const char *endstr = str + lenstr; char *s = buf; bool in_leading_wildcard_meta = false; bool in_trailing_wildcard_meta = false; @@ -910,11 +924,13 @@ get_wildcard_part(const char *str, int lenstr, * from this loop to the next one, since we may exit at a word character * that is in_escape. */ - while (beginword - str < lenstr) + while (beginword < endstr) { + clen = pg_mblen_range(beginword, endstr); + if (in_escape) { - if (ISWORDCHR(beginword)) + if (ISWORDCHR(beginword, clen)) break; in_escape = false; in_leading_wildcard_meta = false; @@ -925,12 +941,12 @@ get_wildcard_part(const char *str, int lenstr, in_escape = true; else if (ISWILDCARDCHAR(beginword)) in_leading_wildcard_meta = true; - else if (ISWORDCHR(beginword)) + else if (ISWORDCHR(beginword, clen)) break; else in_leading_wildcard_meta = false; } - beginword += pg_mblen(beginword); + beginword += clen; } /* @@ -958,12 +974,12 @@ get_wildcard_part(const char *str, int lenstr, * string boundary. Strip escapes during copy. */ endword = beginword; - while (endword - str < lenstr) + while (endword < endstr) { - clen = pg_mblen(endword); + clen = pg_mblen_range(endword, endstr); if (in_escape) { - if (ISWORDCHR(endword)) + if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); s += clen; @@ -990,7 +1006,7 @@ get_wildcard_part(const char *str, int lenstr, in_trailing_wildcard_meta = true; break; } - else if (ISWORDCHR(endword)) + else if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); s += clen; diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 1d1b5fe304..efee4cf5fb 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -483,7 +483,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation); static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); -static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); +static int convertPgWchar(pg_wchar c, trgm_mb_char *result); static void transformGraph(TrgmNFA *trgmNFA); static void processState(TrgmNFA *trgmNFA, TrgmState *state); static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); @@ -807,10 +807,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) for (j = 0; j < charsCount; j++) { trgm_mb_char c; + int clen = convertPgWchar(chars[j], &c); - if (!convertPgWchar(chars[j], &c)) + if (!clen) continue; /* ok to ignore it altogether */ - if (ISWORDCHR(c.bytes)) + if (ISWORDCHR(c.bytes, clen)) colorInfo->wordChars[colorInfo->wordCharsCount++] = c; else colorInfo->containsNonWord = true; @@ -822,13 +823,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) /* * Convert pg_wchar to multibyte format. 
- * Returns false if the character should be ignored completely. + * Returns 0 if the character should be ignored completely, else returns its + * byte length. */ -static bool +static int convertPgWchar(pg_wchar c, trgm_mb_char *result) { /* "s" has enough space for a multibyte character and a trailing NUL */ char s[MAX_MULTIBYTE_CHAR_LEN + 1]; + int clen; /* * We can ignore the NUL character, since it can never appear in a PG text @@ -836,11 +839,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) * reconstructing trigrams. */ if (c == 0) - return false; + return 0; /* Do the conversion, making sure the result is NUL-terminated */ memset(s, 0, sizeof(s)); - pg_wchar2mb_with_len(&c, s, 1); + clen = pg_wchar2mb_with_len(&c, s, 1); /* * In IGNORECASE mode, we can ignore uppercase characters. We assume that @@ -857,12 +860,12 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) */ #ifdef IGNORECASE { - char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID); + char *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID); if (strcmp(lowerCased, s) != 0) { pfree(lowerCased); - return false; + return 0; } pfree(lowerCased); } @@ -870,7 +873,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); - return true; + return clen; } diff --git a/contrib/pgcrypto/crypt-sha.c b/contrib/pgcrypto/crypt-sha.c index 7ec21771a8..e8f32bc389 100644 --- a/contrib/pgcrypto/crypt-sha.c +++ b/contrib/pgcrypto/crypt-sha.c @@ -328,7 +328,7 @@ px_crypt_shacrypt(const char *pw, const char *salt, char *passwd, unsigned dstle ereport(ERROR, errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid character in salt string: \"%.*s\"", - pg_mblen(ep), ep)); + pg_mblen_cstr(ep), ep)); } else { diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index e25c8a5aa2..69b173e449 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -156,7 +156,7 @@ initTrie(const char *filename) state = 0; for (ptr = line; *ptr; ptr += ptrlen) { - ptrlen = pg_mblen(ptr); + ptrlen = pg_mblen_cstr(ptr); /* ignore whitespace, but end src or trg */ if (isspace((unsigned char) *ptr)) { @@ -382,6 +382,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) char *srcchar = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); char *srcstart = srcchar; + const char *srcend = srcstart + len; TSLexeme *res; StringInfoData buf; @@ -409,7 +410,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) } else { - matchlen = pg_mblen(srcchar); + matchlen = pg_mblen_range(srcchar, srcend); if (buf.data != NULL) appendBinaryStringInfo(&buf, srcchar, matchlen); } diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index acff7a0096..5df4b3f7a9 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -1206,7 +1206,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal, if (cursorpos > 0) newcp++; } - chlen = pg_mblen(prosrc); + chlen = pg_mblen_cstr(prosrc); if (strncmp(prosrc, literal, chlen) != 0) goto fail; prosrc += chlen; diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 6dee28ae52..3937f25bcc 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -50,7 +50,7 @@ findwrd(char *in, char **end, uint16 *flags) /* Skip leading spaces */ while (*in && isspace((unsigned char) *in)) - in += pg_mblen(in); + in += pg_mblen_cstr(in); /* Return NULL on empty lines */ if (*in == '\0') @@ -65,7 
+65,7 @@ findwrd(char *in, char **end, uint16 *flags) while (*in && !isspace((unsigned char) *in)) { lastchar = in; - in += pg_mblen(in); + in += pg_mblen_cstr(in); } if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags) diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 7253f64e5f..0fd4cf3dfa 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -191,7 +191,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) /* is it a comment? */ while (*ptr && isspace((unsigned char) *ptr)) - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) @@ -237,13 +237,13 @@ thesaurusRead(const char *filename, DictThesaurus *d) { useasis = true; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } else if (t_iseq(ptr, '\\')) { useasis = false; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } else if (!isspace((unsigned char) *ptr)) { @@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) else elog(ERROR, "unrecognized thesaurus state: %d", state); - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); } if (state == TR_INSUBS) diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c index 1c7d5c361f..51ba78fabb 100644 --- a/src/backend/tsearch/regis.c +++ b/src/backend/tsearch/regis.c @@ -37,7 +37,7 @@ RS_isRegis(const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, '[')) state = RS_IN_ONEOF; @@ -48,14 +48,14 @@ RS_isRegis(const char *str) { if (t_iseq(c, '^')) state = RS_IN_NONEOF; - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) state = RS_IN_ONEOF_IN; else return false; } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, ']')) state = RS_IN_WAIT; @@ -64,7 +64,7 @@ RS_isRegis(const char *str) } else elog(ERROR, "internal error in RS_isRegis: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return (state == RS_IN_WAIT); @@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) { if (ptr) ptr = newRegisNode(ptr, len); else ptr = r->node = newRegisNode(NULL, len); - COPYCHAR(ptr->data, c); ptr->type = RSF_ONEOF; - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); } else if (t_iseq(c, '[')) { @@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str) ptr->type = RSF_NONEOF; state = RS_IN_NONEOF; } - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) { - COPYCHAR(ptr->data, c); - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); state = RS_IN_ONEOF_IN; } else /* shouldn't get here */ @@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) - { - COPYCHAR(ptr->data + ptr->len, c); - ptr->len += pg_mblen(c); - } + if (t_isalpha_cstr(c)) + ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c); else if (t_iseq(c, ']')) state = RS_IN_WAIT; else /* shouldn't get here */ @@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else elog(ERROR, "internal error in RS_compile: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (state != RS_IN_WAIT) /* shouldn't get here */ @@ -187,10 
+182,10 @@ mb_strchr(char *str, char *c) char *ptr = str; bool res = false; - clen = pg_mblen(c); + clen = pg_mblen_cstr(c); while (*ptr && !res) { - plen = pg_mblen(ptr); + plen = pg_mblen_cstr(ptr); if (plen == clen) { i = plen; @@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str) while (*c) { len++; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (len < r->nchar) @@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str) { len -= r->nchar; while (len-- > 0) - c += pg_mblen(c); + c += pg_mblen_cstr(c); } @@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str) elog(ERROR, "unrecognized regis node type: %d", ptr->type); } ptr = ptr->next; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return true; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index ad0ceec37b..a1bfd2a9f9 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -233,7 +233,7 @@ findchar(char *str, int c) { if (t_iseq(str, c)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -246,7 +246,7 @@ findchar2(char *str, int c1, int c2) { if (t_iseq(str, c1) || t_iseq(str, c2)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -353,6 +353,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag) char *next; const char *sbuf = *sflagset; int maxstep; + int clen; bool stop = false; bool met_comma = false; @@ -364,11 +365,11 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag) { case FM_LONG: case FM_CHAR: - COPYCHAR(sflag, *sflagset); - sflag += pg_mblen(*sflagset); + clen = ts_copychar_cstr(sflag, *sflagset); + sflag += clen; /* Go to start of the next flag */ - *sflagset += pg_mblen(*sflagset); + *sflagset += clen; /* Check if we get all characters of flag */ maxstep--; @@ -418,7 +419,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag) *sflagset))); } - *sflagset += pg_mblen(*sflagset); + *sflagset += pg_mblen_cstr(*sflagset); } stop = true; break; @@ -544,7 +545,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) while (*s) { /* we allow only single encoded flags for faster works */ - if (pg_mblen(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s)) + if (pg_mblen_cstr(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s)) s++; else { @@ -565,7 +566,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) *s = '\0'; break; } - s += pg_mblen(s); + s += pg_mblen_cstr(s); } pstr = lowerstr_ctx(Conf, line); @@ -797,17 +798,17 @@ get_nextfield(char **str, char *next) while (**str) { + int clen = pg_mblen_cstr(*str); + if (state == PAE_WAIT_MASK) { if (t_iseq(*str, '#')) return false; else if (!isspace((unsigned char) **str)) { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } @@ -823,17 +824,15 @@ get_nextfield(char **str, char *next) } else { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } } } - *str += pg_mblen(*str); + *str += clen; } *next = '\0'; @@ -923,14 +922,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl) while (*str) { + int clen = pg_mblen_cstr(str); + if (state == PAE_WAIT_MASK) { if (t_iseq(str, '#')) return false; else if (!isspace((unsigned char) *str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); state = PAE_INMASK; } } 
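The recurring rewrite in spell.c replaces each COPYCHAR()-plus-pg_mblen() pair with a single ts_copychar_with_len() call, so the now bounds-checked byte length is computed once per character and then drives both the copy and the pointer advances. A minimal sketch of the pattern, with hypothetical dst/src pointers (not any one call site):

	int		clen = pg_mblen_cstr(src);	/* bounded length of next character */

	dst += ts_copychar_with_len(dst, src, clen);	/* copy, advance destination */
	src += clen;			/* advance source by the same length */
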
@@ -943,8 +943,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) } else if (!isspace((unsigned char) *str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); } } else if (state == PAE_WAIT_FIND) @@ -953,10 +952,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { state = PAE_INFIND; } - else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) + else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ ) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } else if (!isspace((unsigned char) *str)) @@ -971,10 +969,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pfind = '\0'; state = PAE_WAIT_REPL; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(pfind, str); - pfind += pg_mblen(str); + pfind += ts_copychar_with_len(pfind, str, clen); } else if (!isspace((unsigned char) *str)) ereport(ERROR, @@ -987,10 +984,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { break; /* void repl */ } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } else if (!isspace((unsigned char) *str)) @@ -1005,10 +1001,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *prepl = '\0'; break; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); } else if (!isspace((unsigned char) *str)) ereport(ERROR, @@ -1018,7 +1013,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) else elog(ERROR, "unrecognized state in parse_affentry: %d", state); - str += pg_mblen(str); + str += clen; } *pmask = *pfind = *prepl = '\0'; @@ -1071,10 +1066,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) CompoundAffixFlag *newValue; char sbuf[BUFSIZ]; char *sflag; - int clen; while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); if (!*s) ereport(ERROR, @@ -1085,8 +1079,8 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) sflag = sbuf; while (*s && !isspace((unsigned char) *s) && *s != '\n') { - clen = pg_mblen(s); - COPYCHAR(sflag, s); + int clen = ts_copychar_cstr(sflag, s); + sflag += clen; s += clen; } @@ -1267,7 +1261,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) char *s = recoded + strlen("FLAG"); while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); if (*s) { @@ -1466,11 +1460,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename) if (s) { while (*s && !isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG); Conf->usecompound = true; @@ -1499,7 +1493,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename) flagflags = 0; while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); if (*s == '*') { @@ -1520,12 +1514,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename) * be followed by EOL, whitespace, or ':'. Otherwise this is a * new-format flag command. 
*/ - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { - COPYCHAR(flag, s); + flag[0] = *s++; flag[1] = '\0'; - s++; if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' || isspace((unsigned char) *s)) { diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index 1e98f32195..df02ffb12f 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -23,32 +23,40 @@ static void tsearch_readline_callback(void *arg); /* space for a single character plus a trailing NUL */ #define WC_BUF_LEN 2 -int -t_isalpha(const char *ptr) -{ - pg_wchar wstr[WC_BUF_LEN]; - int wlen pg_attribute_unused(); - - wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); - Assert(wlen <= 1); - - /* pass single character, or NUL if empty */ - return pg_iswalpha(wstr[0], pg_database_locale()); -} - -int -t_isalnum(const char *ptr) -{ - pg_wchar wstr[WC_BUF_LEN]; - int wlen pg_attribute_unused(); - - wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); - Assert(wlen <= 1); - - /* pass single character, or NUL if empty */ - return pg_iswalnum(wstr[0], pg_database_locale()); +#define GENERATE_T_ISCLASS_DEF(character_class) \ +/* mblen shall be that of the first character */ \ +int \ +t_is##character_class##_with_len(const char *ptr, int mblen) \ +{ \ + pg_wchar wstr[WC_BUF_LEN]; \ + int wlen pg_attribute_unused(); \ + wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \ + Assert(wlen <= 1); \ + /* pass single character, or NUL if empty */ \ + return pg_isw##character_class(wstr[0], pg_database_locale()); \ +} \ +\ +/* ptr shall point to a NUL-terminated string */ \ +int \ +t_is##character_class##_cstr(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \ +} \ +/* ptr shall point to a string with pre-validated encoding */ \ +int \ +t_is##character_class##_unbounded(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \ +} \ +/* historical name for _unbounded */ \ +int \ +t_is##character_class(const char *ptr) \ +{ \ + return t_is##character_class##_unbounded(ptr); \ } +GENERATE_T_ISCLASS_DEF(alnum) +GENERATE_T_ISCLASS_DEF(alpha) /* * Set up to read a file using tsearch_readline(). This facility is diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index 9072d22423..52cf65533e 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -90,7 +90,7 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *, size /* Trim trailing space */ while (*pbuf && !isspace((unsigned char) *pbuf)) - pbuf += pg_mblen(pbuf); + pbuf += pg_mblen_cstr(pbuf); *pbuf = '\0'; /* Skip empty lines */ diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index bfe8aa7fbc..8b9b34e762 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1683,7 +1683,8 @@ TParserGet(TParser *prs) prs->state->charlen = 0; else prs->state->charlen = (prs->charmaxlen == 1) ? 
prs->charmaxlen : - pg_mblen(prs->str + prs->state->posbyte); + pg_mblen_range(prs->str + prs->state->posbyte, + prs->str + prs->lenstr); Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr); Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null); diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 3c7f54f263..f5f835e944 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -290,7 +290,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext) ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(s), s))); + pg_mblen_range(s, srcend), s))); s++; if (s >= srcend) ereturn(escontext, 0, @@ -300,7 +300,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext) ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(s), s))); + pg_mblen_range(s, srcend), s))); s++; *p++ = (v1 << 4) | v2; } @@ -564,7 +564,7 @@ pg_base64_decode_internal(const char *src, size_t len, char *dst, bool url) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid symbol \"%.*s\" found while decoding %s sequence", - pg_mblen(s - 1), s - 1, + pg_mblen_range(s - 1, srcend), s - 1, url ? "base64url" : "base64"))); } } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index cf580c63c7..7720911a6a 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1438,7 +1438,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ereport(ERROR, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("invalid datetime format separator: \"%s\"", - pnstrdup(str, pg_mblen(str))))); + pnstrdup(str, pg_mblen_cstr(str))))); if (*str == ' ') n->type = NODE_TYPE_SPACE; @@ -1468,7 +1468,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, /* backslash quotes the next character, if any */ if (*str == '\\' && *(str + 1)) str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); n->type = NODE_TYPE_CHAR; memcpy(n->character, str, chlen); n->character[chlen] = '\0'; @@ -1486,7 +1486,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, */ if (*str == '\\' && *(str + 1) == '"') str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); if ((flags & DCH_FLAG) && is_separator_char(str)) n->type = NODE_TYPE_SEPARATOR; @@ -1992,8 +1992,8 @@ asc_toupper_z(const char *buff) do { \ if (IS_SUFFIX_THth(_suf)) \ { \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ } \ } while (0) @@ -3183,7 +3183,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, * insist that the consumed character match the format's * character. */ - s += pg_mblen(s); + s += pg_mblen_cstr(s); } continue; } @@ -3205,11 +3205,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, if (extra_skip > 0) extra_skip--; else - s += pg_mblen(s); + s += pg_mblen_cstr(s); } else { - int chlen = pg_mblen(s); + int chlen = pg_mblen_cstr(s); /* * Standard mode requires strict match of format characters. 
@@ -5724,13 +5724,15 @@ NUM_numpart_to_char(NUMProc *Np, int id) static void NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len) { + const char *end = Np->inout + input_len; + while (n-- > 0) { if (OVERLOAD_TEST) break; /* end of input */ if (strchr("0123456789.,+-", *Np->inout_p) != NULL) break; /* it's a data character */ - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, end); } } @@ -6167,7 +6169,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, } else { - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len); } continue; } diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index 1e5b60801e..d5b64d7fca 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -695,7 +695,7 @@ report_json_context(JsonLexContext *lex) { /* Advance to next multibyte character */ if (IS_HIGHBIT_SET(*context_start)) - context_start += pg_mblen(context_start); + context_start += pg_mblen_range(context_start, context_end); else context_start++; } diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y index 4543626ffc..87070235d1 100644 --- a/src/backend/utils/adt/jsonpath_gram.y +++ b/src/backend/utils/adt/jsonpath_gram.y @@ -599,7 +599,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid input syntax for type %s", "jsonpath"), errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.", - pg_mblen(flags->val + i), flags->val + i))); + pg_mblen_range(flags->val + i, flags->val + flags->len), + flags->val + i))); break; } } diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index fb2ba591ac..5b3d84029f 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -83,6 +83,8 @@ varstr_levenshtein(const char *source, int slen, int *s_char_len = NULL; int j; const char *y; + const char *send = source + slen; + const char *tend = target + tlen; /* * For varstr_levenshtein_less_equal, we have real variables called @@ -183,10 +185,10 @@ varstr_levenshtein(const char *source, int slen, #endif /* - * In order to avoid calling pg_mblen() repeatedly on each character in s, - * we cache all the lengths before starting the main loop -- but if all - * the characters in both strings are single byte, then we skip this and - * use a fast-path in the main loop. If only one string contains + * In order to avoid calling pg_mblen_range() repeatedly on each character + * in s, we cache all the lengths before starting the main loop -- but if + * all the characters in both strings are single byte, then we skip this + * and use a fast-path in the main loop. If only one string contains * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. */ @@ -198,7 +200,7 @@ varstr_levenshtein(const char *source, int slen, s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) { - s_char_len[i] = pg_mblen(cp); + s_char_len[i] = pg_mblen_range(cp, send); cp += s_char_len[i]; } s_char_len[i] = 0; @@ -224,7 +226,7 @@ varstr_levenshtein(const char *source, int slen, { int *temp; const char *x = source; - int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; + int y_char_len = n != tlen + 1 ? 
pg_mblen_range(y, tend) : 1; int i; #ifdef LEVENSHTEIN_LESS_EQUAL diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 2143d8658e..350bc07f21 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -55,20 +55,20 @@ static int Generic_Text_IC_like(text *str, text *pat, Oid collation); *-------------------- */ static inline int -wchareq(const char *p1, const char *p2) +wchareq(const char *p1, int p1len, const char *p2, int p2len) { - int p1_len; + int p1clen; /* Optimization: quickly compare the first byte. */ if (*p1 != *p2) return 0; - p1_len = pg_mblen(p1); - if (pg_mblen(p2) != p1_len) + p1clen = pg_mblen_with_len(p1, p1len); + if (pg_mblen_with_len(p2, p2len) != p1clen) return 0; /* They are the same length */ - while (p1_len--) + while (p1clen--) { if (*p1++ != *p2++) return 0; @@ -93,11 +93,11 @@ wchareq(const char *p1, const char *p2) #define NextByte(p, plen) ((p)++, (plen)--) /* Set up to compile like_match.c for multibyte characters */ -#define CHAREQ(p1, p2) wchareq((p1), (p2)) +#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len)) #define NextChar(p, plen) \ - do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) + do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ - do { int __l = pg_mblen(src); \ + do { int __l = pg_mblen_with_len((src), (srclen)); \ (srclen) -= __l; \ while (__l-- > 0) \ *(dst)++ = *(src)++; \ @@ -109,7 +109,7 @@ wchareq(const char *p1, const char *p2) #include "like_match.c" /* Set up to compile like_match.c for single-byte characters */ -#define CHAREQ(p1, p2) (*(p1) == *(p2)) +#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2)) #define NextChar(p, plen) NextByte((p), (plen)) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index 02990ca9a1..f5f72b82e2 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -442,6 +442,7 @@ do_like_escape(text *pat, text *esc) errhint("Escape string must be empty or one character."))); e = VARDATA_ANY(esc); + elen = VARSIZE_ANY_EXHDR(esc); /* * If specified escape is '\', just copy the pattern as-is. 
@@ -460,7 +461,7 @@ do_like_escape(text *pat, text *esc) afterescape = false; while (plen > 0) { - if (CHAREQ(p, e) && !afterescape) + if (CHAREQ(p, plen, e, elen) && !afterescape) { *r++ = '\\'; NextChar(p, plen); diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index a003f90066..5b0d098bd0 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -169,8 +169,8 @@ lpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -215,7 +215,7 @@ lpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -228,7 +228,7 @@ lpad(PG_FUNCTION_ARGS) while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -267,8 +267,8 @@ rpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -308,11 +308,12 @@ rpad(PG_FUNCTION_ARGS) m = len - s1len; ptr1 = VARDATA_ANY(string1); + ptr_ret = VARDATA(ret); while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -324,7 +325,7 @@ rpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -409,6 +410,7 @@ dotrim(const char *string, int stringlen, */ const char **stringchars; const char **setchars; + const char *setend; int *stringmblen; int *setmblen; int stringnchars; @@ -416,6 +418,7 @@ dotrim(const char *string, int stringlen, int resultndx; int resultnchars; const char *p; + const char *pend; int len; int mblen; const char *str_pos; @@ -426,10 +429,11 @@ dotrim(const char *string, int stringlen, stringnchars = 0; p = string; len = stringlen; + pend = p + len; while (len > 0) { stringchars[stringnchars] = p; - stringmblen[stringnchars] = mblen = pg_mblen(p); + stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend); stringnchars++; p += mblen; len -= mblen; @@ -440,10 +444,11 @@ dotrim(const char *string, int stringlen, setnchars = 0; p = set; len = setlen; + setend = set + setlen; while (len > 0) { setchars[setnchars] = p; - setmblen[setnchars] = mblen = pg_mblen(p); + setmblen[setnchars] = mblen = pg_mblen_range(p, setend); setnchars++; p += mblen; len -= mblen; @@ -821,6 +826,8 @@ translate(PG_FUNCTION_ARGS) *to_end; char *source, *target; + const char *source_end; + const char *from_end; int m, fromlen, tolen, @@ -835,9 +842,11 @@ translate(PG_FUNCTION_ARGS) if (m <= 0) PG_RETURN_TEXT_P(string); source = VARDATA_ANY(string); + source_end = source + m; fromlen = VARSIZE_ANY_EXHDR(from); from_ptr = VARDATA_ANY(from); + from_end = from_ptr + fromlen; tolen = VARSIZE_ANY_EXHDR(to); to_ptr = VARDATA_ANY(to); to_end = to_ptr + tolen; @@ -861,12 +870,12 @@ translate(PG_FUNCTION_ARGS) while (m > 0) { - source_len = pg_mblen(source); + source_len = pg_mblen_range(source, source_end); from_index = 0; for (i = 0; i < fromlen; i += len) { - len = pg_mblen(&from_ptr[i]); + len = pg_mblen_range(&from_ptr[i], from_end); if (len == source_len && memcmp(source, &from_ptr[i], len) == 0) break; @@ -882,11 +891,11 @@ translate(PG_FUNCTION_ARGS) { if (p >= to_end) break; - p += pg_mblen(p); + p += pg_mblen_range(p, to_end); } if (p < to_end) { - len = pg_mblen(p); + len = pg_mblen_range(p, to_end); memcpy(target, p, len); 
target += len; retlen += len; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 94cd15bbab..311b9877bb 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -443,7 +443,7 @@ parse_re_flags(pg_re_flags *flags, text *opts) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i))); break; } } @@ -673,12 +673,13 @@ textregexreplace(PG_FUNCTION_ARGS) if (VARSIZE_ANY_EXHDR(opt) > 0) { char *opt_p = VARDATA_ANY(opt); + const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt); if (*opt_p >= '0' && *opt_p <= '9') ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p), opt_p), + pg_mblen_range(opt_p, end_p), opt_p), errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly."))); } @@ -772,6 +773,7 @@ similar_escape_internal(text *pat_text, text *esc_text) *r; int plen, elen; + const char *pend; bool afterescape = false; int nquotes = 0; int bracket_depth = 0; /* square bracket nesting level */ @@ -779,6 +781,7 @@ similar_escape_internal(text *pat_text, text *esc_text) p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); + pend = p + plen; if (esc_text == NULL) { /* No ESCAPE clause provided; default to backslash as escape */ @@ -878,7 +881,7 @@ similar_escape_internal(text *pat_text, text *esc_text) if (elen > 1) { - int mblen = pg_mblen(p); + int mblen = pg_mblen_range(p, pend); if (mblen > 1) { diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index e3bf1fbbfd..7e54f36c2a 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -120,7 +120,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) return buf; buf++; - while (*buf && pg_mblen(buf) == 1) + while (*buf && pg_mblen_cstr(buf) == 1) { switch (*buf) { @@ -259,12 +259,12 @@ parse_or_operator(TSQueryParserState pstate) return false; /* it shouldn't be a part of any word */ - if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr)) + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum_cstr(ptr)) return false; for (;;) { - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); if (*ptr == '\0') /* got end of string without operand */ return false; @@ -390,7 +390,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -502,7 +502,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -1014,9 +1014,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp) *(in->cur) = '\\'; in->cur++; } - COPYCHAR(in->cur, op); - clen = pg_mblen(op); + clen = ts_copychar_cstr(in->cur, op); op += clen; in->cur += clen; } diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 38342298a5..024f5160cd 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -319,9 +319,9 @@ tsvectorout(PG_FUNCTION_ARGS) lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); - char *curbegin, - *curin, + char *curin, *curout; + const char *curend; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) @@ -334,13 +334,14 @@ 
tsvectorout(PG_FUNCTION_ARGS) curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { - curbegin = curin = STRPTR(out) + ptr->pos; + curin = STRPTR(out) + ptr->pos; + curend = curin + ptr->len; if (i != 0) *curout++ = ' '; *curout++ = '\''; - while (curin - curbegin < ptr->len) + while (curin < curend) { - int len = pg_mblen(curin); + int len = pg_mblen_range(curin, curend); if (t_iseq(curin, '\'')) *curout++ = '\''; diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 94e0fed830..71c7c7d3b3 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -2604,11 +2604,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) if (ws) { char *buf; + const char *end; buf = VARDATA_ANY(ws); - while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) + end = buf + VARSIZE_ANY_EXHDR(ws); + while (buf < end) { - if (pg_mblen(buf) == 1) + int len = pg_mblen_range(buf, end); + + if (len == 1) { switch (*buf) { @@ -2632,7 +2636,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) stat->weight |= 0; } } - buf += pg_mblen(buf); + buf += len; } } diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index b3c04f6344..efeaeb5533 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -208,8 +208,7 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; else if (!isspace((unsigned char) *state->prsbuf)) { - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDWORD; } } @@ -223,8 +222,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); Assert(oldstate != 0); statecode = oldstate; } @@ -259,8 +257,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITENDCMPLX) @@ -279,8 +276,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITCHARCMPLX) @@ -288,8 +284,7 @@ gettoken_tsvector(TSVectorParseState state, if (!state->is_web && t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDCMPLX; } else @@ -300,7 +295,7 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; if (state->oprisdelim) { - /* state->prsbuf+=pg_mblen(state->prsbuf); */ + /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */ RETURN_TOKEN; } else @@ -383,6 +378,6 @@ gettoken_tsvector(TSVectorParseState state, statecode); /* get next char */ - state->prsbuf += pg_mblen(state->prsbuf); + state->prsbuf += pg_mblen_cstr(state->prsbuf); } } diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c index 50ffee679b..65ad1bfe18 100644 --- a/src/backend/utils/adt/varbit.c +++ b/src/backend/utils/adt/varbit.c @@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); 
x >>= 1; if (x == 0) @@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { @@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 552ac0c61d..6bb14620a6 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -494,8 +494,11 @@ text_catenate(text *t1, text *t2) * charlen_to_bytelen() * Compute the number of bytes occupied by n characters starting at *p * - * It is caller's responsibility that there actually are n characters; - * the string need not be null-terminated. + * The caller shall ensure there are n complete characters. Callers achieve + * this by deriving "n" from regmatch_t findings from searching a wchar array. + * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex + * matches will end no later than the last complete character. (The string + * need not be null-terminated.) */ static int charlen_to_bytelen(const char *p, int n) @@ -510,7 +513,7 @@ charlen_to_bytelen(const char *p, int n) const char *s; for (s = p; n > 0; n--) - s += pg_mblen(s); + s += pg_mblen_unbounded(s); /* caller verified encoding */ return s - p; } @@ -644,6 +647,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 slice_start; int32 slice_size; int32 slice_strlen; + int32 slice_len; text *slice; int32 E1; int32 i; @@ -713,7 +717,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) slice = (text *) DatumGetPointer(str); /* see if we got back an empty string */ - if (VARSIZE_ANY_EXHDR(slice) == 0) + slice_len = VARSIZE_ANY_EXHDR(slice); + if (slice_len == 0) { if (slice != (text *) DatumGetPointer(str)) pfree(slice); @@ -722,7 +727,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) /* Now we can get the actual length of the slice in MB characters */ slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - VARSIZE_ANY_EXHDR(slice)); + slice_len); /* * Check that the start position wasn't > slice_strlen. If so, SQL99 @@ -749,7 +754,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) */ p = VARDATA_ANY(slice); for (i = 0; i < S1 - 1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); /* hang onto a pointer to our start position */ s = p; @@ -759,7 +764,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) * length. */ for (i = S1; i < E1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); ret = (text *) palloc(VARHDRSZ + (p - s)); SET_VARSIZE(ret, VARHDRSZ + (p - s)); @@ -1064,6 +1069,8 @@ text_position_next(TextPositionState *state) */ if (state->is_multibyte_char_in_char && state->locale->deterministic) { + const char *haystack_end = state->str1 + state->len1; + /* Walk one character at a time, until we reach the match. */ /* the search should never move backwards. 
*/ @@ -1072,7 +1079,7 @@ text_position_next(TextPositionState *state) while (state->refpoint < matchptr) { /* step to next character. */ - state->refpoint += pg_mblen(state->refpoint); + state->refpoint += pg_mblen_range(state->refpoint, haystack_end); state->refpos++; /* @@ -1160,7 +1167,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) test_end = hptr; do { - test_end += pg_mblen(test_end); + test_end += pg_mblen_range(test_end, haystack_end); if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0) { state->last_match_len_tmp = (test_end - hptr); @@ -1173,7 +1180,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) if (result_hptr) break; - hptr += pg_mblen(hptr); + hptr += pg_mblen_range(hptr, haystack_end); } return (char *) result_hptr; @@ -3767,6 +3774,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) } else { + const char *end_ptr; + /* * When fldsep is NULL, each character in the input string becomes a * separate element in the result set. The separator is effectively @@ -3775,10 +3784,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) inputstring_len = VARSIZE_ANY_EXHDR(inputstring); start_ptr = VARDATA_ANY(inputstring); + end_ptr = start_ptr + inputstring_len; while (inputstring_len > 0) { - int chunk_len = pg_mblen(start_ptr); + int chunk_len = pg_mblen_range(start_ptr, end_ptr); CHECK_FOR_INTERRUPTS(); @@ -4684,7 +4694,7 @@ text_reverse(PG_FUNCTION_ARGS) { int sz; - sz = pg_mblen(p); + sz = pg_mblen_range(p, endp); dst -= sz; memcpy(dst, p, sz); p += sz; @@ -4845,7 +4855,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); /* If indirect width was specified, get its value */ @@ -4966,7 +4976,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); break; } diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index f69dc68286..fcb13e7c0a 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -2376,8 +2376,7 @@ sqlchar_to_unicode(const char *s) char *utf8string; pg_wchar ret[2]; /* need space for trailing zero */ - /* note we're not assuming s is null-terminated */ - utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8); + utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8); pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret, pg_encoding_mblen(PG_UTF8, utf8string)); @@ -2430,7 +2429,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, initStringInfo(&buf); - for (p = ident; *p; p += pg_mblen(p)) + for (p = ident; *p; p += pg_mblen_cstr(p)) { if (*p == ':' && (p == ident || fully_escaped)) appendStringInfoString(&buf, "_x003A_"); @@ -2455,7 +2454,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, : !is_valid_xml_namechar(u)) appendStringInfo(&buf, "_x%04X_", (unsigned int) u); else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } } @@ -2478,7 +2477,7 @@ map_xml_name_to_sql_identifier(const char *name) initStringInfo(&buf); - for (p = name; *p; p += pg_mblen(p)) + for (p = name; *p; p += pg_mblen_cstr(p)) { if (*p == '_' && *(p 
+ 1) == 'x' && isxdigit((unsigned char) *(p + 2)) @@ -2496,7 +2495,7 @@ map_xml_name_to_sql_identifier(const char *name) p += 6; } else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } return buf.data; diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 6950e743d0..a5a734839a 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -38,6 +38,7 @@ #include "catalog/namespace.h" #include "mb/pg_wchar.h" #include "utils/fmgrprotos.h" +#include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/relcache.h" #include "varatt.h" @@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server); static int cliplen(const char *str, int len, int limit); +pg_noreturn +static void report_invalid_encoding_int(int encoding, const char *mbstr, + int mblen, int len); + +pg_noreturn +static void report_invalid_encoding_db(const char *mbstr, int mblen, int len); + /* * Prepare for a future call to SetClientEncoding. Success should mean @@ -1021,11 +1029,126 @@ pg_encoding_wchar2mb_with_len(int encoding, return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len); } -/* returns the byte length of a multibyte character */ +/* + * Returns the byte length of a multibyte character sequence in a + * null-terminated string. Raises an illegal byte sequence error if the + * sequence would hit a null terminator. + * + * The caller is expected to have checked for a terminator at *mbstr == 0 + * before calling, but some callers want 1 in that case, so this function + * continues that tradition. + * + * This must only be used for strings that have a null-terminator to enable + * bounds detection. + */ +int +pg_mblen_cstr(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + /* + * The .mblen functions return 1 when given a pointer to a terminator. + * Some callers depend on that, so we tolerate it for now. Well-behaved + * callers check the leading byte for a terminator *before* calling. + */ + for (int i = 1; i < length; ++i) + if (unlikely(mbstr[i] == 0)) + report_invalid_encoding_db(mbstr, length, i); + + /* + * String should be NUL-terminated, but checking that would make typical + * callers O(N^2), tripling Valgrind check-world time. Unless + * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we + * found a character, not a terminator, the next byte must be a terminator + * or the start of the next character.) If the caller iterates the whole + * string, the last call will diagnose a missing terminator. + */ + if (mbstr[0] != '\0') + { +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr)); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1); +#endif + } + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence + * error if the sequence would exceed the range. 
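+ *
+ * A typical caller walks the string with a precomputed end pointer, along
+ * these lines (hypothetical sketch, not any one call site):
+ *
+ *		const char *end = str + len;
+ *
+ *		while (str < end)
+ *			str += pg_mblen_range(str, end);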
+ */ +int +pg_mblen_range(const char *mbstr, const char *end) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(end > mbstr); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(mbstr + length > end)) + report_invalid_encoding_db(mbstr, length, end - mbstr); + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * extending for 'limit' bytes, which must be at least one. Raises an illegal + * byte sequence error if the sequence would exceed the range. + */ +int +pg_mblen_with_len(const char *mbstr, int limit) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(limit >= 1); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(length > limit)) + report_invalid_encoding_db(mbstr, length, limit); + + return length; +} + + +/* + * Returns the length of a multibyte character sequence, without any + * validation of bounds. + * + * PLEASE NOTE: This function can only be used safely if the caller has + * already verified the input string, since otherwise there is a risk of + * overrunning the buffer if the string is invalid. A prior call to a + * pg_mbstrlen* function suffices. + */ +int +pg_mblen_unbounded(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); + + return length; +} + +/* + * Historical name for pg_mblen_unbounded(). Should not be used and will be + * removed in a later version. 
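+ * New code should instead call pg_mblen_cstr(), pg_mblen_range(),
+ * pg_mblen_with_len(), or pg_mblen_unbounded(), according to how the
+ * string's bounds are known.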
+ */ int pg_mblen(const char *mbstr) { - return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + return pg_mblen_unbounded(mbstr); } /* returns the display length of a multibyte character */ @@ -1047,14 +1170,14 @@ pg_mbstrlen(const char *mbstr) while (*mbstr) { - mbstr += pg_mblen(mbstr); + mbstr += pg_mblen_cstr(mbstr); len++; } return len; } /* returns the length (counted in wchars) of a multibyte string - * (not necessarily NULL terminated) + * (stops at the first of "limit" or a NUL) */ int pg_mbstrlen_with_len(const char *mbstr, int limit) @@ -1067,7 +1190,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit) while (limit > 0 && *mbstr) { - int l = pg_mblen(mbstr); + int l = pg_mblen_with_len(mbstr, limit); limit -= l; mbstr += l; @@ -1137,7 +1260,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit) while (len > 0 && *mbstr) { - l = pg_mblen(mbstr); + l = pg_mblen_with_len(mbstr, len); nch++; if (nch > limit) break; @@ -1701,12 +1824,19 @@ void report_invalid_encoding(int encoding, const char *mbstr, int len) { int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len); + + report_invalid_encoding_int(encoding, mbstr, l, len); +} + +static void +report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len) +{ char buf[8 * 5 + 1]; char *p = buf; int j, jlimit; - jlimit = Min(l, len); + jlimit = Min(mblen, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) @@ -1723,6 +1853,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len) buf))); } +static void +report_invalid_encoding_db(const char *mbstr, int mblen, int len) +{ + report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len); +} + /* * report_untranslatable_char: complain about untranslatable character * diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index a5b7b49e4b..e1655fe61d 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -695,7 +695,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); extern size_t pg_wchar_strlen(const pg_wchar *str); +extern int pg_mblen_cstr(const char *mbstr); +extern int pg_mblen_range(const char *mbstr, const char *end); +extern int pg_mblen_with_len(const char *mbstr, int limit); +extern int pg_mblen_unbounded(const char *mbstr); + +/* deprecated */ extern int pg_mblen(const char *mbstr); + extern int pg_dsplen(const char *mbstr); extern int pg_mbstrlen(const char *mbstr); extern int pg_mbstrlen_with_len(const char *mbstr, int limit); diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index cea417a91b..6e2d67ee4a 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -37,10 +37,34 @@ typedef struct /* The second argument of t_iseq() must be a plain ASCII character */ #define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) -#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s)) +/* Copy multibyte character of known byte length, return byte length. */ +static inline int +ts_copychar_with_len(void *dest, const void *src, int length) +{ + memcpy(dest, src, length); + return length; +} + +/* Copy multibyte character from null-terminated string, return byte length. 
*/ +static inline int +ts_copychar_cstr(void *dest, const void *src) +{ + return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src)); +} + +/* Historical macro for the above. */ +#define COPYCHAR ts_copychar_cstr + +#define GENERATE_T_ISCLASS_DECL(character_class) \ +extern int t_is##character_class##_with_len(const char *ptr, int len); \ +extern int t_is##character_class##_cstr(const char *ptr); \ +extern int t_is##character_class##_unbounded(const char *ptr); \ +\ +/* deprecated */ \ +extern int t_is##character_class(const char *ptr); -extern int t_isalpha(const char *ptr); -extern int t_isalnum(const char *ptr); +GENERATE_T_ISCLASS_DECL(alnum); +GENERATE_T_ISCLASS_DECL(alpha); extern bool tsearch_readline_begin(tsearch_readline_state *stp, const char *filename); diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index b0d1dbab6d..3eb0770f9c 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -40,14 +40,12 @@ extern bool gettoken_tsvector(TSVectorParseState state, extern void close_tsvector_parser(TSVectorParseState state); /* phrase operator begins with '<' */ -#define ISOPERATOR(x) \ - ( pg_mblen(x) == 1 && ( *(x) == '!' || \ - *(x) == '&' || \ - *(x) == '|' || \ - *(x) == '(' || \ - *(x) == ')' || \ - *(x) == '<' \ - ) ) +#define ISOPERATOR(x) (*(x) == '!' || \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<') /* parse_tsquery */ diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c index 070464a341..4e97cde65a 100644 --- a/src/test/modules/test_regex/test_regex.c +++ b/src/test/modules/test_regex/test_regex.c @@ -411,7 +411,8 @@ parse_test_flags(test_re_flags *flags, text *opts) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression test option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), + opt_p + i))); break; } } From c67bef3f3252a3a38bf347f9f119944176a796ce Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 12 Jan 2026 10:20:06 +1300 Subject: [PATCH 057/147] Code coverage for most pg_mblen* calls. A security patch changed them today, so close the coverage gap now. Test that buffer overrun is avoided when pg_mblen*() requires more than the number of bytes remaining. This does not cover the calls in dict_thesaurus.c or in dict_synonym.c. That code is straightforward. To change that code's input, one must have access to modify installed OS files, so low-privilege users are not a threat. Testing this would likewise require changing installed share/postgresql/tsearch_data, which was enough of an obstacle to not bother. 
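For illustration, the core of the new encoding test is a string whose last
character is a truncated multibyte sequence; every bounded pg_mblen* code
path must then report the bad sequence instead of reading past the end of
the buffer.  A sketch of the pattern (test_bytea_to_text() is one of the C
helpers this patch adds to regress.c):

  -- 0xc3 starts a two-byte UTF-8 sequence, but its second byte is missing
  SELECT reverse('caf' || test_bytea_to_text('\xc3'));
  ERROR:  invalid byte sequence for encoding "UTF8": 0xc3
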
Security: CVE-2026-2006 Backpatch-through: 14 Co-authored-by: Thomas Munro Co-authored-by: Noah Misch Reviewed-by: Heikki Linnakangas --- contrib/pg_trgm/Makefile | 2 +- contrib/pg_trgm/data/trgm_utf8.data | 50 +++ contrib/pg_trgm/expected/pg_utf8_trgm.out | 8 + contrib/pg_trgm/expected/pg_utf8_trgm_1.out | 3 + contrib/pg_trgm/meson.build | 1 + contrib/pg_trgm/sql/pg_utf8_trgm.sql | 9 + src/backend/utils/adt/arrayfuncs.c | 6 + src/test/regress/expected/copyencoding.out | 7 + src/test/regress/expected/encoding.out | 401 ++++++++++++++++++++ src/test/regress/expected/encoding_1.out | 4 + src/test/regress/expected/euc_kr.out | 16 + src/test/regress/expected/euc_kr_1.out | 6 + src/test/regress/parallel_schedule | 2 +- src/test/regress/regress.c | 139 +++++++ src/test/regress/sql/copyencoding.sql | 7 + src/test/regress/sql/encoding.sql | 228 +++++++++++ src/test/regress/sql/euc_kr.sql | 12 + 17 files changed, 899 insertions(+), 2 deletions(-) create mode 100644 contrib/pg_trgm/data/trgm_utf8.data create mode 100644 contrib/pg_trgm/expected/pg_utf8_trgm.out create mode 100644 contrib/pg_trgm/expected/pg_utf8_trgm_1.out create mode 100644 contrib/pg_trgm/sql/pg_utf8_trgm.sql create mode 100644 src/test/regress/expected/encoding.out create mode 100644 src/test/regress/expected/encoding_1.out create mode 100644 src/test/regress/expected/euc_kr.out create mode 100644 src/test/regress/expected/euc_kr_1.out create mode 100644 src/test/regress/sql/encoding.sql create mode 100644 src/test/regress/sql/euc_kr.sql diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile index 1fbdc9ec1e..c1756993ec 100644 --- a/contrib/pg_trgm/Makefile +++ b/contrib/pg_trgm/Makefile @@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \ pg_trgm--1.0--1.1.sql PGFILEDESC = "pg_trgm - trigram matching" -REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm +REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_trgm/data/trgm_utf8.data b/contrib/pg_trgm/data/trgm_utf8.data new file mode 100644 index 0000000000..713856e76a --- /dev/null +++ b/contrib/pg_trgm/data/trgm_utf8.data @@ -0,0 +1,50 @@ +Mathematics +数学 +गणित +Matemáticas +رياضيات +Mathématiques +গণিত +Matemática +Математика +ریاضی +Matematika +Mathematik +数学 +Mathematics +गणित +గణితం +Matematik +கணிதம் +數學 +Toán học +Matematika +数学 +수학 +ریاضی +Lissafi +Hisabati +Matematika +Matematica +ریاضی +ಗಣಿತ +ગણિત +คณิตศาสตร์ +ሂሳብ +गणित +ਗਣਿਤ +數學 +数学 +Iṣiro +數學 +သင်္ချာ +Herrega +رياضي +गणित +Математика +Matematyka +ഗണിതം +Matematika +رياضي +Matematika +Matematică diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm.out b/contrib/pg_trgm/expected/pg_utf8_trgm.out new file mode 100644 index 0000000000..0768e7d6a8 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm.out @@ -0,0 +1,8 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out new file mode 100644 index 0000000000..8505c4fa55 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out @@ -0,0 +1,3 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build index 3cc299d5ea..3ecf95ba86 100644 --- 
a/contrib/pg_trgm/meson.build +++ b/contrib/pg_trgm/meson.build @@ -39,6 +39,7 @@ tests += { 'regress': { 'sql': [ 'pg_trgm', + 'pg_utf8_trgm', 'pg_word_trgm', 'pg_strict_word_trgm', ], diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql new file mode 100644 index 0000000000..0dd962ced8 --- /dev/null +++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql @@ -0,0 +1,9 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index da68915ee2..734e5fea45 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -3736,6 +3736,12 @@ deconstruct_array_builtin(const ArrayType *array, elmalign = TYPALIGN_SHORT; break; + case INT4OID: + elmlen = sizeof(int32); + elmbyval = true; + elmalign = TYPALIGN_INT; + break; + case OIDOID: elmlen = sizeof(Oid); elmbyval = true; diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out index cfa2ed6df0..76ea0e7cf0 100644 --- a/src/test/regress/expected/copyencoding.out +++ b/src/test/regress/expected/copyencoding.out @@ -17,6 +17,13 @@ CREATE TABLE copy_encoding_tab (t text); COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8'); -- Read UTF8 data as LATIN1: no error COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1'); +-- Non-server encodings have distinct code paths. +\set fname :abs_builddir '/results/copyencoding_gb18030.csv' +COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT csv, ENCODING 'GB18030'); +COPY copy_encoding_tab FROM :'fname' WITH (FORMAT csv, ENCODING 'GB18030'); +\set fname :abs_builddir '/results/copyencoding_gb18030.data' +COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT text, ENCODING 'GB18030'); +COPY copy_encoding_tab FROM :'fname' WITH (FORMAT text, ENCODING 'GB18030'); -- Use client_encoding SET client_encoding TO UTF8; -- U+3042 HIRAGANA LETTER A diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out new file mode 100644 index 0000000000..ea1f38cff4 --- /dev/null +++ b/src/test/regress/expected/encoding.out @@ -0,0 +1,401 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX +\set regresslib :libdir '/regress' :dlsuffix +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean + AS :'regresslib' LANGUAGE C STRICT; +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); +INSERT INTO regress_encoding +VALUES ('café', + 'caf' || test_bytea_to_text('\xc3'), + 'café' || test_bytea_to_text('\x00') || 'dcba', + 'caf' || test_bytea_to_text('\xc300') || 'dcba'); +SELECT good, truncated, with_nul 
FROM regress_encoding; + good | truncated | with_nul +------+-----------+---------- + café | caf | café +(1 row) + +SELECT length(good) FROM regress_encoding; + length +-------- + 4 +(1 row) + +SELECT substring(good, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(good, 4, 1) FROM regress_encoding; + substring +----------- + é +(1 row) + +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +SELECT reverse(good) FROM regress_encoding; + reverse +--------- + éfac +(1 row) + +-- invalid short mb character = error +SELECT length(truncated) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT substring(truncated, 1, 1) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT reverse(truncated) FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +-- invalid short mb character = silently dropped +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + caf +(1 row) + +-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string +-- contains NUL at a character boundary position, some functions treat it as a +-- character while others treat it as a terminator, as implementation details. +-- NUL = terminator +SELECT length(with_nul) FROM regress_encoding; + length +-------- + 4 +(1 row) + +SELECT substring(with_nul, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(with_nul, 4, 1) FROM regress_encoding; + substring +----------- + é +(1 row) + +SELECT substring(with_nul, 5, 1) FROM regress_encoding; + substring +----------- + +(1 row) + +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; + convert_to +------------ + \x +(1 row) + +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +-- NUL = character +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; + with_nul | reverse | reverse +----------+---------+--------- + café | abcd | café +(1 row) + +-- If a corrupted string contains NUL in the tail bytes of a multibyte +-- character (invalid in all encodings), it is considered part of the +-- character for length purposes. An error will only be raised in code paths +-- that convert or verify encodings. +SELECT length(truncated_with_nul) FROM regress_encoding; + length +-------- + 8 +(1 row) + +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; + substring +----------- + f +(1 row) + +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; + substring +----------- + +(1 row) + +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00 +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; + substring +----------- + d +(1 row) + +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; + ?column? +---------- + t +(1 row) + +SELECT reverse(truncated_with_nul) FROM regress_encoding; + reverse +--------- + abcd +(1 row) + +-- unbounded: sequence would overrun the string! 
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +-- condition detected when using the length/range variants +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +-- unbounded: sequence would overrun the string, if the terminator were really +-- the end of it +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) +FROM regress_encoding; + test_mblen_func +----------------- + 2 +(1 row) + +-- condition detected when using the cstr variants +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; +ERROR: invalid byte sequence for encoding "UTF8": 0xc3 +DROP TABLE regress_encoding; +-- mb<->wchar conversions +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) +RETURNS VOID LANGUAGE plpgsql AS +$$ +DECLARE + prefix text; + len int; + wchars int[]; + round_trip bytea; + result text; +BEGIN + prefix := rpad(encoding || ' ' || description || ':', 28); + + -- XXX could also test validation, length functions and include client + -- only encodings with these test cases + + IF test_valid_server_encoding(encoding) THEN + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); + round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); + if input = round_trip then + result := 'OK'; + elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then + result := 'truncated'; + else + result := 'failed'; + end if; + RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; + END IF; +END; +$$; +-- No validation is done on the encoding itself, just the length to avoid +-- overruns, so some of the byte sequences below are bogus. They cover +-- all code branches, server encodings only for now. 
+CREATE TABLE encoding_tests (encoding text, description text, input bytea);
+INSERT INTO encoding_tests VALUES
+ -- LATIN1, other single-byte encodings
+ ('LATIN1', 'ASCII', 'a'),
+ ('LATIN1', 'extended', '\xe9'),
+ -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
+ -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 2 80..ff (CS1)
+ ('EUC_JP', 'ASCII', 'a'),
+ ('EUC_JP', 'CS1, short', '\x80'),
+ ('EUC_JP', 'CS1', '\x8002'),
+ ('EUC_JP', 'CS2, short', '\x8e'),
+ ('EUC_JP', 'CS2', '\x8e02'),
+ ('EUC_JP', 'CS3, short', '\x8f'),
+ ('EUC_JP', 'CS3, short', '\x8f02'),
+ ('EUC_JP', 'CS3', '\x8f0203'),
+ -- EUC_CN
+ -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_CN', 'ASCII', 'a'),
+ ('EUC_CN', 'CS1, short', '\x80'),
+ ('EUC_CN', 'CS1', '\x8002'),
+ ('EUC_CN', 'CS2, short', '\x8e'),
+ ('EUC_CN', 'CS2, short', '\x8e02'),
+ ('EUC_CN', 'CS2', '\x8e0203'),
+ ('EUC_CN', 'CS3, short', '\x8f'),
+ ('EUC_CN', 'CS3, short', '\x8f02'),
+ ('EUC_CN', 'CS3', '\x8f0203'),
+ -- EUC_TW:
+ -- 4 8e (CS2)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_TW', 'ASCII', 'a'),
+ ('EUC_TW', 'CS1, short', '\x80'),
+ ('EUC_TW', 'CS1', '\x8002'),
+ ('EUC_TW', 'CS2, short', '\x8e'),
+ ('EUC_TW', 'CS2, short', '\x8e02'),
+ ('EUC_TW', 'CS2, short', '\x8e0203'),
+ ('EUC_TW', 'CS2', '\x8e020304'),
+ ('EUC_TW', 'CS3, short', '\x8f'),
+ ('EUC_TW', 'CS3, short', '\x8f02'),
+ ('EUC_TW', 'CS3', '\x8f0203'),
+ -- UTF8
+ -- 2 c0..df
+ -- 3 e0..ef
+ -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
+ -- 5 f8..fb (not supported)
+ -- 6 fc..fd (not supported)
+ ('UTF8', 'ASCII', 'a'),
+ ('UTF8', '2 byte, short', '\xdf'),
+ ('UTF8', '2 byte', '\xdf82'),
+ ('UTF8', '3 byte, short', '\xef'),
+ ('UTF8', '3 byte, short', '\xef82'),
+ ('UTF8', '3 byte', '\xef8283'),
+ ('UTF8', '4 byte, short', '\xf7'),
+ ('UTF8', '4 byte, short', '\xf782'),
+ ('UTF8', '4 byte, short', '\xf78283'),
+ ('UTF8', '4 byte', '\xf7828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb'),
+ ('UTF8', '5 byte, unsupported', '\xfb82'),
+ ('UTF8', '5 byte, unsupported', '\xfb8283'),
+ ('UTF8', '5 byte, unsupported', '\xfb828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd'),
+ ('UTF8', '6 byte, unsupported', '\xfd82'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283'),
+ ('UTF8', '6 byte, unsupported', '\xfd828384'),
+ ('UTF8', '6 byte, unsupported', '\xfd82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283848586'),
+ -- MULE_INTERNAL
+ -- 2 81..8d LC1
+ -- 3 90..99 LC2
+ ('MULE_INTERNAL', 'ASCII', 'a'),
+ ('MULE_INTERNAL', 'LC1, short', '\x81'),
+ ('MULE_INTERNAL', 'LC1', '\x8182'),
+ ('MULE_INTERNAL', 'LC2, short', '\x90'),
+ ('MULE_INTERNAL', 'LC2, short', '\x9082'),
+ ('MULE_INTERNAL', 'LC2', '\x908283');
+SELECT COUNT(test_encoding(encoding, description, input)) > 0
+FROM encoding_tests;
+NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK
+NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated
+NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK
+NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated
+NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK
+NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated
+NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_CN CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated +NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK +NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated +NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK +NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated +NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated +NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated +NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK +NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated +NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated +NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK +NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated +NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK +NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated +NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated +NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK +NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated +NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated +NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated +NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK +NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed +NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed +NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed +NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed +NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed +NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK +NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK +NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated +NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK + ?column? +---------- + t +(1 row) + +DROP TABLE encoding_tests; +DROP FUNCTION test_encoding; +DROP FUNCTION test_text_to_wchars; +DROP FUNCTION test_mblen_func; +DROP FUNCTION test_bytea_to_text; +DROP FUNCTION test_text_to_bytea; +-- substring slow path: multi-byte escape char vs. multi-byte pattern char. 
+SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); + substring +----------- + +(1 row) + +-- Levenshtein distance metric: exercise character length cache. +SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); +ERROR: column "real§_name" does not exist +LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); + ^ +HINT: Perhaps you meant to reference the column "x.real_name". +-- JSON errcontext: truncate long data. +SELECT repeat(U&'\00A7', 30)::json; +ERROR: invalid input syntax for type json +DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid. +CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§ diff --git a/src/test/regress/expected/encoding_1.out b/src/test/regress/expected/encoding_1.out new file mode 100644 index 0000000000..a5b0209090 --- /dev/null +++ b/src/test/regress/expected/encoding_1.out @@ -0,0 +1,4 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/expected/euc_kr.out b/src/test/regress/expected/euc_kr.out new file mode 100644 index 0000000000..7a61c89a43 --- /dev/null +++ b/src/test/regress/expected/euc_kr.out @@ -0,0 +1,16 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit +\endif +-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. +SELECT POSITION( + convert_from('\xbcf6c7d0', 'EUC_KR') IN + convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); + position +---------- + 5 +(1 row) + diff --git a/src/test/regress/expected/euc_kr_1.out b/src/test/regress/expected/euc_kr_1.out new file mode 100644 index 0000000000..faaac5d635 --- /dev/null +++ b/src/test/regress/expected/euc_kr_1.out @@ -0,0 +1,6 @@ +-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent +-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all +-- of EUC_KR, also run the test in UTF8. +SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 021d57f66b..549e9b2d7b 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t # geometry depends on point, lseg, line, box, path, polygon, circle # horology depends on date, time, timetz, timestamp, timestamptz, interval # ---------- -test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 +test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr # ---------- # Load huge amounts of data diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index ce5f5f9eb1..bea858f03c 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -1115,6 +1115,145 @@ test_enc_conversion(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); } +/* Convert bytea to text without validation for corruption tests from SQL. 
*/
+PG_FUNCTION_INFO_V1(test_bytea_to_text);
+Datum
+test_bytea_to_text(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0));
+}
+
+/* And the reverse. */
+PG_FUNCTION_INFO_V1(test_text_to_bytea);
+Datum
+test_text_to_bytea(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0));
+}
+
+/* Corruption tests in C. */
+PG_FUNCTION_INFO_V1(test_mblen_func);
+Datum
+test_mblen_func(PG_FUNCTION_ARGS)
+{
+ const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0));
+ const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1));
+ text *string = PG_GETARG_BYTEA_PP(2);
+ int offset = PG_GETARG_INT32(3);
+ const char *data = VARDATA_ANY(string);
+ size_t size = VARSIZE_ANY_EXHDR(string);
+ int result = 0;
+
+ if (strcmp(func, "pg_mblen_unbounded") == 0)
+ result = pg_mblen_unbounded(data + offset);
+ else if (strcmp(func, "pg_mblen_cstr") == 0)
+ result = pg_mblen_cstr(data + offset);
+ else if (strcmp(func, "pg_mblen_with_len") == 0)
+ result = pg_mblen_with_len(data + offset, size - offset);
+ else if (strcmp(func, "pg_mblen_range") == 0)
+ result = pg_mblen_range(data + offset, data + size);
+ else if (strcmp(func, "pg_encoding_mblen") == 0)
+ result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset);
+ else
+ elog(ERROR, "unknown function");
+
+ PG_RETURN_INT32(result);
+}
+
+PG_FUNCTION_INFO_V1(test_text_to_wchars);
+Datum
+test_text_to_wchars(PG_FUNCTION_ARGS)
+{
+ const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
+ text *string = PG_GETARG_TEXT_PP(1);
+ const char *data = VARDATA_ANY(string);
+ size_t size = VARSIZE_ANY_EXHDR(string);
+ pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1));
+ Datum *datums;
+ int wlen;
+ int encoding;
+
+ encoding = pg_char_to_encoding(encoding_name);
+ if (encoding < 0)
+ elog(ERROR, "unknown encoding name: %s", encoding_name);
+
+ if (size > 0)
+ {
+ datums = palloc(sizeof(Datum) * size);
+ wlen = pg_encoding_mb2wchar_with_len(encoding,
+ data,
+ wchars,
+ size);
+ Assert(wlen >= 0);
+ Assert(wlen <= size);
+ Assert(wchars[wlen] == 0);
+
+ for (int i = 0; i < wlen; ++i)
+ datums[i] = UInt32GetDatum(wchars[i]);
+ }
+ else
+ {
+ datums = NULL;
+ wlen = 0;
+ }
+
+ PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID));
+}
+
+PG_FUNCTION_INFO_V1(test_wchars_to_text);
+Datum
+test_wchars_to_text(PG_FUNCTION_ARGS)
+{
+ const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P(1);
+ Datum *datums;
+ bool *nulls;
+ char *mb;
+ text *result;
+ int wlen;
+ int bytes;
+ int encoding;
+
+ encoding = pg_char_to_encoding(encoding_name);
+ if (encoding < 0)
+ elog(ERROR, "unknown encoding name: %s", encoding_name);
+
+ deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen);
+
+ if (wlen > 0)
+ {
+ pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen);
+
+ for (int i = 0; i < wlen; ++i)
+ {
+ if (nulls[i])
+ elog(ERROR, "unexpected NULL in array");
+ wchars[i] = DatumGetInt32(datums[i]);
+ }
+
+ mb = palloc(pg_encoding_max_length(encoding) * wlen + 1);
+ bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen);
+ }
+ else
+ {
+ mb = "";
+ bytes = 0;
+ }
+
+ result = palloc(bytes + VARHDRSZ);
+ SET_VARSIZE(result, bytes + VARHDRSZ);
+ memcpy(VARDATA(result), mb, bytes);
+
+ PG_RETURN_TEXT_P(result);
+}
+
+PG_FUNCTION_INFO_V1(test_valid_server_encoding);
+Datum
+test_valid_server_encoding(PG_FUNCTION_ARGS)
+{
+ /* pg_valid_server_encoding returns an encoding ID, or -1 if invalid */
+ PG_RETURN_BOOL(pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0))) >= 0);
+}
+
 /* Provide SQL access to IsBinaryCoercible() */
PG_FUNCTION_INFO_V1(binary_coercible); Datum diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql index 4e96a4d650..64718245b9 100644 --- a/src/test/regress/sql/copyencoding.sql +++ b/src/test/regress/sql/copyencoding.sql @@ -23,6 +23,13 @@ CREATE TABLE copy_encoding_tab (t text); COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8'); -- Read UTF8 data as LATIN1: no error COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1'); +-- Non-server encodings have distinct code paths. +\set fname :abs_builddir '/results/copyencoding_gb18030.csv' +COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT csv, ENCODING 'GB18030'); +COPY copy_encoding_tab FROM :'fname' WITH (FORMAT csv, ENCODING 'GB18030'); +\set fname :abs_builddir '/results/copyencoding_gb18030.data' +COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT text, ENCODING 'GB18030'); +COPY copy_encoding_tab FROM :'fname' WITH (FORMAT text, ENCODING 'GB18030'); -- Use client_encoding SET client_encoding TO UTF8; diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql new file mode 100644 index 0000000000..b9543c0cb3 --- /dev/null +++ b/src/test/regress/sql/encoding.sql @@ -0,0 +1,228 @@ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX + +\set regresslib :libdir '/regress' :dlsuffix + +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text + AS :'regresslib' LANGUAGE C STRICT; +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean + AS :'regresslib' LANGUAGE C STRICT; + + +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); +INSERT INTO regress_encoding +VALUES ('café', + 'caf' || test_bytea_to_text('\xc3'), + 'café' || test_bytea_to_text('\x00') || 'dcba', + 'caf' || test_bytea_to_text('\xc300') || 'dcba'); + +SELECT good, truncated, with_nul FROM regress_encoding; + +SELECT length(good) FROM regress_encoding; +SELECT substring(good, 3, 1) FROM regress_encoding; +SELECT substring(good, 4, 1) FROM regress_encoding; +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; +SELECT reverse(good) FROM regress_encoding; + +-- invalid short mb character = error +SELECT length(truncated) FROM regress_encoding; +SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT reverse(truncated) FROM regress_encoding; +-- invalid short mb character = silently dropped +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; + +-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string +-- contains NUL at a character boundary position, some functions treat it as a +-- character while others treat it as a terminator, as implementation details. 
+ +-- NUL = terminator +SELECT length(with_nul) FROM regress_encoding; +SELECT substring(with_nul, 3, 1) FROM regress_encoding; +SELECT substring(with_nul, 4, 1) FROM regress_encoding; +SELECT substring(with_nul, 5, 1) FROM regress_encoding; +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; +-- NUL = character +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; + +-- If a corrupted string contains NUL in the tail bytes of a multibyte +-- character (invalid in all encodings), it is considered part of the +-- character for length purposes. An error will only be raised in code paths +-- that convert or verify encodings. + +SELECT length(truncated_with_nul) FROM regress_encoding; +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; +SELECT reverse(truncated_with_nul) FROM regress_encoding; + +-- unbounded: sequence would overrun the string! +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) +FROM regress_encoding; + +-- condition detected when using the length/range variants +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) +FROM regress_encoding; +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) +FROM regress_encoding; + +-- unbounded: sequence would overrun the string, if the terminator were really +-- the end of it +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) +FROM regress_encoding; + +-- condition detected when using the cstr variants +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) +FROM regress_encoding; + +DROP TABLE regress_encoding; + +-- mb<->wchar conversions +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) +RETURNS VOID LANGUAGE plpgsql AS +$$ +DECLARE + prefix text; + len int; + wchars int[]; + round_trip bytea; + result text; +BEGIN + prefix := rpad(encoding || ' ' || description || ':', 28); + + -- XXX could also test validation, length functions and include client + -- only encodings with these test cases + + IF test_valid_server_encoding(encoding) THEN + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); + round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); + if input = round_trip then + result := 'OK'; + elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then + result := 'truncated'; + else + result := 'failed'; + end if; + RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; + END IF; +END; +$$; +-- No validation is done on the encoding itself, just the length to avoid +-- overruns, so some of the byte sequences below are bogus. They cover +-- all code branches, server encodings only for now. 
+CREATE TABLE encoding_tests (encoding text, description text, input bytea);
+INSERT INTO encoding_tests VALUES
+ -- LATIN1, other single-byte encodings
+ ('LATIN1', 'ASCII', 'a'),
+ ('LATIN1', 'extended', '\xe9'),
+ -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
+ -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 2 80..ff (CS1)
+ ('EUC_JP', 'ASCII', 'a'),
+ ('EUC_JP', 'CS1, short', '\x80'),
+ ('EUC_JP', 'CS1', '\x8002'),
+ ('EUC_JP', 'CS2, short', '\x8e'),
+ ('EUC_JP', 'CS2', '\x8e02'),
+ ('EUC_JP', 'CS3, short', '\x8f'),
+ ('EUC_JP', 'CS3, short', '\x8f02'),
+ ('EUC_JP', 'CS3', '\x8f0203'),
+ -- EUC_CN
+ -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_CN', 'ASCII', 'a'),
+ ('EUC_CN', 'CS1, short', '\x80'),
+ ('EUC_CN', 'CS1', '\x8002'),
+ ('EUC_CN', 'CS2, short', '\x8e'),
+ ('EUC_CN', 'CS2, short', '\x8e02'),
+ ('EUC_CN', 'CS2', '\x8e0203'),
+ ('EUC_CN', 'CS3, short', '\x8f'),
+ ('EUC_CN', 'CS3, short', '\x8f02'),
+ ('EUC_CN', 'CS3', '\x8f0203'),
+ -- EUC_TW:
+ -- 4 8e (CS2)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_TW', 'ASCII', 'a'),
+ ('EUC_TW', 'CS1, short', '\x80'),
+ ('EUC_TW', 'CS1', '\x8002'),
+ ('EUC_TW', 'CS2, short', '\x8e'),
+ ('EUC_TW', 'CS2, short', '\x8e02'),
+ ('EUC_TW', 'CS2, short', '\x8e0203'),
+ ('EUC_TW', 'CS2', '\x8e020304'),
+ ('EUC_TW', 'CS3, short', '\x8f'),
+ ('EUC_TW', 'CS3, short', '\x8f02'),
+ ('EUC_TW', 'CS3', '\x8f0203'),
+ -- UTF8
+ -- 2 c0..df
+ -- 3 e0..ef
+ -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
+ -- 5 f8..fb (not supported)
+ -- 6 fc..fd (not supported)
+ ('UTF8', 'ASCII', 'a'),
+ ('UTF8', '2 byte, short', '\xdf'),
+ ('UTF8', '2 byte', '\xdf82'),
+ ('UTF8', '3 byte, short', '\xef'),
+ ('UTF8', '3 byte, short', '\xef82'),
+ ('UTF8', '3 byte', '\xef8283'),
+ ('UTF8', '4 byte, short', '\xf7'),
+ ('UTF8', '4 byte, short', '\xf782'),
+ ('UTF8', '4 byte, short', '\xf78283'),
+ ('UTF8', '4 byte', '\xf7828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb'),
+ ('UTF8', '5 byte, unsupported', '\xfb82'),
+ ('UTF8', '5 byte, unsupported', '\xfb8283'),
+ ('UTF8', '5 byte, unsupported', '\xfb828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd'),
+ ('UTF8', '6 byte, unsupported', '\xfd82'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283'),
+ ('UTF8', '6 byte, unsupported', '\xfd828384'),
+ ('UTF8', '6 byte, unsupported', '\xfd82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283848586'),
+ -- MULE_INTERNAL
+ -- 2 81..8d LC1
+ -- 3 90..99 LC2
+ ('MULE_INTERNAL', 'ASCII', 'a'),
+ ('MULE_INTERNAL', 'LC1, short', '\x81'),
+ ('MULE_INTERNAL', 'LC1', '\x8182'),
+ ('MULE_INTERNAL', 'LC2, short', '\x90'),
+ ('MULE_INTERNAL', 'LC2, short', '\x9082'),
+ ('MULE_INTERNAL', 'LC2', '\x908283');
+
+SELECT COUNT(test_encoding(encoding, description, input)) > 0
+FROM encoding_tests;
+
+DROP TABLE encoding_tests;
+DROP FUNCTION test_encoding;
+DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_mblen_func;
+DROP FUNCTION test_bytea_to_text;
+DROP FUNCTION test_text_to_bytea;
+
+
+-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
+SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
+-- Levenshtein distance metric: exercise character length cache.
+SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+-- JSON errcontext: truncate long data.
+SELECT repeat(U&'\00A7', 30)::json;
diff --git a/src/test/regress/sql/euc_kr.sql b/src/test/regress/sql/euc_kr.sql
new file mode 100644
index 0000000000..1851b2a8c1
--- /dev/null
+++ b/src/test/regress/sql/euc_kr.sql
@@ -0,0 +1,12 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
+SELECT POSITION(
+ convert_from('\xbcf6c7d0', 'EUC_KR') IN
+ convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));

From 38e0190ced714b33c43c9676d768cc6814fc662a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?=
Date: Mon, 9 Feb 2026 13:23:10 +0100
Subject: [PATCH 058/147] Allow log_min_messages to be set per process type

Change log_min_messages from being a single element to a comma-separated
list of type:level elements, with 'type' representing a process type, and
'level' being a log level to use for that type of process. The list must
also have a freestanding level specification which is used for process
types not listed, which conveniently makes the whole thing
backwards-compatible.

Some choices made here could be contested; for instance, we use the
process type `backend` to affect regular backends as well as dead-end
backends and the standalone backend, and `autovacuum` means both the
launcher and the workers. I think it's largely sensible though, and it
can easily be tweaked if desired.

Author: Euler Taveira
Reviewed-by: Chao Li
Reviewed-by: Japin Li
Reviewed-by: Tan Yang <332696245@qq.com>
Discussion: https://postgr.es/m/e85c6671-1600-4112-8887-f97a8a5d07b2@app.fastmail.com
---
 doc/src/sgml/config.sgml | 52 +++-
 src/backend/commands/extension.c | 2 +-
 src/backend/postmaster/launch_backend.c | 2 +-
 src/backend/utils/error/elog.c | 247 +++++++++++++++++-
 src/backend/utils/init/miscinit.c | 2 +-
 src/backend/utils/misc/guc_parameters.dat | 10 +-
 src/backend/utils/misc/guc_tables.c | 13 +-
 src/backend/utils/misc/postgresql.conf.sample | 16 +-
 src/include/postmaster/proctypelist.h | 42 +--
 src/include/utils/guc.h | 5 +-
 src/include/utils/guc_hooks.h | 2 +
 src/test/regress/expected/guc.out | 57 ++++
 src/test/regress/sql/guc.sql | 22 ++
 13 files changed, 428 insertions(+), 44 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index f1af1505cf..3734298696 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7120,27 +7120,57 @@ local0.* /var/log/postgresql
- log_min_messages (enum)
+ log_min_messages (string)
 log_min_messages configuration parameter
- Controls which message
- levels are written to the server log.
- Valid values are DEBUG5, DEBUG4,
- DEBUG3, DEBUG2, DEBUG1,
- INFO, NOTICE, WARNING,
- ERROR, LOG, FATAL, and
- PANIC. Each level includes all the levels that
- follow it. The later the level, the fewer messages are sent
- to the log. The default is WARNING. Note that
- LOG has a different rank here than in
+ Controls which
+ message levels
+ are written to the server log. The value is a comma-separated
+ list of zero or more
+ process type:level
+ entries and exactly one mandatory
+ level entry,
+ which becomes the default for process types not listed.
+ Valid process types are listed in the table below.
+ + archiver + autovacuum + backend + bgworker + bgwriter + checkpointer + ioworker + postmaster + syslogger + slotsyncworker + startup + walreceiver + walsender + walsummarizer + walwriter + + Valid level values are DEBUG5, + DEBUG4, DEBUG3, DEBUG2, + DEBUG1, INFO, NOTICE, + WARNING, ERROR, LOG, + FATAL, and PANIC. Each level includes + all the levels that follow it. The later the level, the fewer messages are sent + to the log. The default is WARNING, which + applies that level to all process types. + Note that LOG has a different rank here than in . Only superusers and users with the appropriate SET privilege can change this setting. + + Example: To log walsender and autovacuum + at level DEBUG1 and everything else at ERROR, + set log_min_messages to error, walsender:debug1, autovacuum:debug1. + diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 596105ee07..688f1874f3 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -1191,7 +1191,7 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, (void) set_config_option("client_min_messages", "warning", PGC_USERSET, PGC_S_SESSION, GUC_ACTION_SAVE, true, 0, false); - if (log_min_messages < WARNING) + if (log_min_messages[MyBackendType] < WARNING) (void) set_config_option_ext("log_min_messages", "warning", PGC_SUSET, PGC_S_SESSION, BOOTSTRAP_SUPERUSERID, diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 926fd6f270..05b1feef3c 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -178,7 +178,7 @@ typedef struct } child_process_kind; static child_process_kind child_process_kinds[] = { -#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \ +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ [bktype] = {description, main_func, shmem_attach}, #include "postmaster/proctypelist.h" #undef PG_PROCTYPE diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index e6a4ef9905..129906e2da 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -182,6 +182,7 @@ static bool matches_backtrace_functions(const char *funcname); static pg_noinline void set_backtrace(ErrorData *edata, int num_skip); static void set_errdata_field(MemoryContextData *cxt, char **ptr, const char *str); static void FreeErrorDataContents(ErrorData *edata); +static int log_min_messages_cmp(const ListCell *a, const ListCell *b); static void write_console(const char *line, int len); static const char *process_log_prefix_padding(const char *p, int *ppadding); static void log_line_prefix(StringInfo buf, ErrorData *edata); @@ -235,7 +236,7 @@ is_log_level_output(int elevel, int log_min_level) static inline bool should_output_to_server(int elevel) { - return is_log_level_output(elevel, log_min_messages); + return is_log_level_output(elevel, log_min_messages[MyBackendType]); } /* @@ -2170,6 +2171,250 @@ DebugFileOpen(void) } +/* + * GUC check_hook for log_min_messages + * + * This value is parsed as a comma-separated list of zero or more TYPE:LEVEL + * elements. For each element, TYPE corresponds to a bk_category value (see + * postmaster/proctypelist.h); LEVEL is one of server_message_level_options. + * + * In addition, there must be a single LEVEL element (with no TYPE part) + * which sets the default level for process types that aren't specified. 
+ */ +bool +check_log_min_messages(char **newval, void **extra, GucSource source) +{ + char *rawstring; + List *elemlist; + StringInfoData buf; + char *result; + int newlevel[BACKEND_NUM_TYPES]; + bool assigned[BACKEND_NUM_TYPES] = {0}; + int genericlevel = -1; /* -1 means not assigned */ + + const char *const process_types[] = { +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ + [bktype] = bkcategory, +#include "postmaster/proctypelist.h" +#undef PG_PROCTYPE + }; + + /* Need a modifiable copy of string. */ + rawstring = guc_strdup(LOG, *newval); + if (rawstring == NULL) + return false; + + /* Parse the string into a list. */ + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + GUC_check_errdetail("List syntax is invalid."); + list_free(elemlist); + guc_free(rawstring); + return false; + } + + /* Validate and assign log level and process type. */ + foreach_ptr(char, elem, elemlist) + { + char *sep = strchr(elem, ':'); + + /* + * If there's no ':' separator in the entry, this is the default log + * level. Otherwise it's a process type-specific entry. + */ + if (sep == NULL) + { + const struct config_enum_entry *entry; + bool found; + + /* Reject duplicates for generic log level. */ + if (genericlevel != -1) + { + GUC_check_errdetail("Redundant specification of default log level."); + goto lmm_fail; + } + + /* Validate the log level */ + found = false; + for (entry = server_message_level_options; entry && entry->name; entry++) + { + if (pg_strcasecmp(entry->name, elem) == 0) + { + genericlevel = entry->val; + found = true; + break; + } + } + + if (!found) + { + GUC_check_errdetail("Unrecognized log level: \"%s\".", elem); + goto lmm_fail; + } + } + else + { + char *loglevel = sep + 1; + char *ptype = elem; + bool found; + int level; + const struct config_enum_entry *entry; + + /* + * Temporarily clobber the ':' with a string terminator, so that + * we can validate it. We restore this at the bottom. + */ + *sep = '\0'; + + /* Validate the log level */ + found = false; + for (entry = server_message_level_options; entry && entry->name; entry++) + { + if (pg_strcasecmp(entry->name, loglevel) == 0) + { + level = entry->val; + found = true; + break; + } + } + + if (!found) + { + GUC_check_errdetail("Unrecognized log level for process type \"%s\": \"%s\".", + ptype, loglevel); + goto lmm_fail; + } + + /* Is the process type name valid and unique? */ + found = false; + for (int i = 0; i < BACKEND_NUM_TYPES; i++) + { + if (pg_strcasecmp(process_types[i], ptype) == 0) + { + /* Reject duplicates for a process type. */ + if (assigned[i]) + { + GUC_check_errdetail("Redundant log level specification for process type \"%s\".", + ptype); + goto lmm_fail; + } + + newlevel[i] = level; + assigned[i] = true; + found = true; + + /* + * note: we must keep looking! some process types appear + * multiple times in proctypelist.h. + */ + } + } + + if (!found) + { + GUC_check_errdetail("Unrecognized process type \"%s\".", ptype); + goto lmm_fail; + } + + /* Put the separator back in place */ + *sep = ':'; + } + + /* all good */ + continue; + +lmm_fail: + guc_free(rawstring); + list_free(elemlist); + return false; + } + + /* + * The generic log level must be specified. It is the fallback value. + */ + if (genericlevel == -1) + { + GUC_check_errdetail("Default log level was not defined."); + guc_free(rawstring); + list_free(elemlist); + return false; + } + + /* Apply the default log level to all processes not listed. 
*/ + for (int i = 0; i < BACKEND_NUM_TYPES; i++) + { + if (!assigned[i]) + newlevel[i] = genericlevel; + } + + /* + * Save an ordered representation of the user-specified string, for the + * show_hook. + */ + list_sort(elemlist, log_min_messages_cmp); + + initStringInfoExt(&buf, strlen(rawstring) + 1); + foreach_ptr(char, elem, elemlist) + { + if (foreach_current_index(elem) == 0) + appendStringInfoString(&buf, elem); + else + appendStringInfo(&buf, ", %s", elem); + } + + result = (char *) guc_malloc(LOG, buf.len + 1); + if (!result) + return false; + memcpy(result, buf.data, buf.len); + result[buf.len] = '\0'; + + guc_free(*newval); + *newval = result; + + guc_free(rawstring); + list_free(elemlist); + pfree(buf.data); + + /* + * Pass back data for assign_log_min_messages to use. + */ + *extra = guc_malloc(LOG, BACKEND_NUM_TYPES * sizeof(int)); + if (!*extra) + return false; + memcpy(*extra, newlevel, BACKEND_NUM_TYPES * sizeof(int)); + + return true; +} + +/* + * list_sort() callback for check_log_min_messages. The default element + * goes first; the rest are ordered by strcmp() of the process type. + */ +static int +log_min_messages_cmp(const ListCell *a, const ListCell *b) +{ + const char *s = lfirst(a); + const char *t = lfirst(b); + + if (strchr(s, ':') == NULL) + return -1; + else if (strchr(t, ':') == NULL) + return 1; + else + return strcmp(s, t); +} + +/* + * GUC assign_hook for log_min_messages + */ +void +assign_log_min_messages(const char *newval, void *extra) +{ + for (int i = 0; i < BACKEND_NUM_TYPES; i++) + log_min_messages[i] = ((int *) extra)[i]; +} + /* * GUC check_hook for backtrace_functions * diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 563f20374f..03f6c8479f 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -266,7 +266,7 @@ GetBackendTypeDesc(BackendType backendType) switch (backendType) { -#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \ +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ case bktype: backendDesc = description; break; #include "postmaster/proctypelist.h" #undef PG_PROCTYPE diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index c1f1603cd3..762b8efe6b 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1693,12 +1693,14 @@ options => 'server_message_level_options', }, -{ name => 'log_min_messages', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHEN', +{ name => 'log_min_messages', type => 'string', context => 'PGC_SUSET', group => 'LOGGING_WHEN', short_desc => 'Sets the message levels that are logged.', long_desc => 'Each level includes all the levels that follow it. 
The later the level, the fewer messages are sent.',
- variable => 'log_min_messages',
- boot_val => 'WARNING',
- options => 'server_message_level_options',
+ flags => 'GUC_LIST_INPUT',
+ variable => 'log_min_messages_string',
+ boot_val => '"WARNING"',
+ check_hook => 'check_log_min_messages',
+ assign_hook => 'assign_log_min_messages',
 },

 { name => 'log_parameter_max_length', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHAT',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 5df3a36bf6..741fce8ded 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -147,7 +147,7 @@ static const struct config_enum_entry client_message_level_options[] = {
 {NULL, 0, false}
 };

-static const struct config_enum_entry server_message_level_options[] = {
+const struct config_enum_entry server_message_level_options[] = {
 {"debug5", DEBUG5, false},
 {"debug4", DEBUG4, false},
 {"debug3", DEBUG3, false},
@@ -546,7 +546,6 @@ static bool standard_conforming_strings = true;
 bool current_role_is_superuser;

 int log_min_error_statement = ERROR;
-int log_min_messages = WARNING;
 int client_min_messages = NOTICE;
 int log_min_duration_sample = -1;
 int log_min_duration_statement = -1;
@@ -604,6 +603,7 @@ static char *server_version_string;
 static int server_version_num;
 static char *debug_io_direct_string;
 static char *restrict_nonsystem_relation_kind_string;
+static char *log_min_messages_string;

 #ifdef HAVE_SYSLOG
 #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@@ -656,6 +656,15 @@ char *role_string;
 /* should be static, but guc.c needs to get at this */
 bool in_hot_standby_guc;

+/*
+ * set default log_min_messages to WARNING for all process types
+ */
+int log_min_messages[] = {
+#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \
+ [bktype] = WARNING,
+#include "postmaster/proctypelist.h"
+#undef PG_PROCTYPE
+};

 /*
 * Displayable names for context types (enum GucContext)
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 1ae594af84..6e82c8e055 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -532,7 +532,21 @@

 # - When to Log -

-#log_min_messages = warning # values in order of decreasing detail:
+#log_min_messages = warning # comma-separated list of
+ # process_type:level entries, plus
+ # one freestanding level as default.
+ # Valid process types are:
+ # archiver autovacuum
+ # backend bgworker
+ # bgwriter checkpointer
+ # ioworker postmaster
+ # slotsyncworker startup
+ # syslogger walreceiver
+ # walsummarizer walwriter
+ # walsender
+ #
+ # Level values in order of decreasing
+ # detail:
 # debug5
 # debug4
 # debug3
diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h
index 0b99eaabfd..4e259e84c2 100644
--- a/src/include/postmaster/proctypelist.h
+++ b/src/include/postmaster/proctypelist.h
@@ -25,27 +25,27 @@
 */

 /*
- * List of process types (symbol, description, Main function, shmem_attach)
- * entries.
+ * List of process types (symbol, category, description, Main function,
+ * shmem_attach) entries.
*/ -/* bktype, description, main_func, shmem_attach */ -PG_PROCTYPE(B_ARCHIVER, gettext_noop("archiver"), PgArchiverMain, true) -PG_PROCTYPE(B_AUTOVAC_LAUNCHER, gettext_noop("autovacuum launcher"), AutoVacLauncherMain, true) -PG_PROCTYPE(B_AUTOVAC_WORKER, gettext_noop("autovacuum worker"), AutoVacWorkerMain, true) -PG_PROCTYPE(B_BACKEND, gettext_noop("client backend"), BackendMain, true) -PG_PROCTYPE(B_BG_WORKER, gettext_noop("background worker"), BackgroundWorkerMain, true) -PG_PROCTYPE(B_BG_WRITER, gettext_noop("background writer"), BackgroundWriterMain, true) -PG_PROCTYPE(B_CHECKPOINTER, gettext_noop("checkpointer"), CheckpointerMain, true) -PG_PROCTYPE(B_DEAD_END_BACKEND, gettext_noop("dead-end client backend"), BackendMain, true) -PG_PROCTYPE(B_INVALID, gettext_noop("unrecognized"), NULL, false) -PG_PROCTYPE(B_IO_WORKER, gettext_noop("io worker"), IoWorkerMain, true) -PG_PROCTYPE(B_LOGGER, gettext_noop("syslogger"), SysLoggerMain, false) -PG_PROCTYPE(B_SLOTSYNC_WORKER, gettext_noop("slotsync worker"), ReplSlotSyncWorkerMain, true) -PG_PROCTYPE(B_STANDALONE_BACKEND, gettext_noop("standalone backend"), NULL, false) -PG_PROCTYPE(B_STARTUP, gettext_noop("startup"), StartupProcessMain, true) -PG_PROCTYPE(B_WAL_RECEIVER, gettext_noop("walreceiver"), WalReceiverMain, true) -PG_PROCTYPE(B_WAL_SENDER, gettext_noop("walsender"), NULL, true) -PG_PROCTYPE(B_WAL_SUMMARIZER, gettext_noop("walsummarizer"), WalSummarizerMain, true) -PG_PROCTYPE(B_WAL_WRITER, gettext_noop("walwriter"), WalWriterMain, true) +/* bktype, bkcategory, description, main_func, shmem_attach */ +PG_PROCTYPE(B_ARCHIVER, "archiver", gettext_noop("archiver"), PgArchiverMain, true) +PG_PROCTYPE(B_AUTOVAC_LAUNCHER, "autovacuum", gettext_noop("autovacuum launcher"), AutoVacLauncherMain, true) +PG_PROCTYPE(B_AUTOVAC_WORKER, "autovacuum", gettext_noop("autovacuum worker"), AutoVacWorkerMain, true) +PG_PROCTYPE(B_BACKEND, "backend", gettext_noop("client backend"), BackendMain, true) +PG_PROCTYPE(B_BG_WORKER, "bgworker", gettext_noop("background worker"), BackgroundWorkerMain, true) +PG_PROCTYPE(B_BG_WRITER, "bgwriter", gettext_noop("background writer"), BackgroundWriterMain, true) +PG_PROCTYPE(B_CHECKPOINTER, "checkpointer", gettext_noop("checkpointer"), CheckpointerMain, true) +PG_PROCTYPE(B_DEAD_END_BACKEND, "backend", gettext_noop("dead-end client backend"), BackendMain, true) +PG_PROCTYPE(B_INVALID, "postmaster", gettext_noop("unrecognized"), NULL, false) +PG_PROCTYPE(B_IO_WORKER, "ioworker", gettext_noop("io worker"), IoWorkerMain, true) +PG_PROCTYPE(B_LOGGER, "syslogger", gettext_noop("syslogger"), SysLoggerMain, false) +PG_PROCTYPE(B_SLOTSYNC_WORKER, "slotsyncworker", gettext_noop("slotsync worker"), ReplSlotSyncWorkerMain, true) +PG_PROCTYPE(B_STANDALONE_BACKEND, "backend", gettext_noop("standalone backend"), NULL, false) +PG_PROCTYPE(B_STARTUP, "startup", gettext_noop("startup"), StartupProcessMain, true) +PG_PROCTYPE(B_WAL_RECEIVER, "walreceiver", gettext_noop("walreceiver"), WalReceiverMain, true) +PG_PROCTYPE(B_WAL_SENDER, "walsender", gettext_noop("walsender"), NULL, true) +PG_PROCTYPE(B_WAL_SUMMARIZER, "walsummarizer", gettext_noop("walsummarizer"), WalSummarizerMain, true) +PG_PROCTYPE(B_WAL_WRITER, "walwriter", gettext_noop("walwriter"), WalWriterMain, true) diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index bf39878c43..8acbdba7ff 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -295,7 +295,7 @@ extern PGDLLIMPORT bool log_duration; extern PGDLLIMPORT int log_parameter_max_length; 
extern PGDLLIMPORT int log_parameter_max_length_on_error; extern PGDLLIMPORT int log_min_error_statement; -extern PGDLLIMPORT int log_min_messages; +extern PGDLLIMPORT int log_min_messages[]; extern PGDLLIMPORT int client_min_messages; extern PGDLLIMPORT int log_min_duration_sample; extern PGDLLIMPORT int log_min_duration_statement; @@ -329,6 +329,8 @@ extern PGDLLIMPORT bool trace_sort; extern PGDLLIMPORT bool optimize_bounded_sort; #endif +extern PGDLLIMPORT const char *const log_min_messages_process_types[]; + /* * Declarations for options for enum values * @@ -344,6 +346,7 @@ extern PGDLLIMPORT const struct config_enum_entry archive_mode_options[]; extern PGDLLIMPORT const struct config_enum_entry dynamic_shared_memory_options[]; extern PGDLLIMPORT const struct config_enum_entry io_method_options[]; extern PGDLLIMPORT const struct config_enum_entry recovery_target_action_options[]; +extern PGDLLIMPORT const struct config_enum_entry server_message_level_options[]; extern PGDLLIMPORT const struct config_enum_entry wal_level_options[]; extern PGDLLIMPORT const struct config_enum_entry wal_sync_method_options[]; diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index b6ecb0e769..9c90670d9b 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -177,5 +177,7 @@ extern void assign_wal_sync_method(int new_wal_sync_method, void *extra); extern bool check_synchronized_standby_slots(char **newval, void **extra, GucSource source); extern void assign_synchronized_standby_slots(const char *newval, void *extra); +extern bool check_log_min_messages(char **newval, void **extra, GucSource source); +extern void assign_log_min_messages(const char *newval, void *extra); #endif /* GUC_HOOKS_H */ diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out index d6fb879f50..3fa2562f23 100644 --- a/src/test/regress/expected/guc.out +++ b/src/test/regress/expected/guc.out @@ -711,6 +711,63 @@ select current_schemas(false); reset search_path; -- +-- Test parsing of log_min_messages +-- +SET log_min_messages TO foo; -- fail +ERROR: invalid value for parameter "log_min_messages": "foo" +DETAIL: Unrecognized log level: "foo". +SET log_min_messages TO fatal; +SHOW log_min_messages; + log_min_messages +------------------ + fatal +(1 row) + +SET log_min_messages TO 'fatal'; +SHOW log_min_messages; + log_min_messages +------------------ + fatal +(1 row) + +SET log_min_messages TO 'checkpointer:debug2, autovacuum:debug1'; -- fail +ERROR: invalid value for parameter "log_min_messages": "checkpointer:debug2, autovacuum:debug1" +DETAIL: Default log level was not defined. +SET log_min_messages TO 'debug1, backend:error, fatal'; -- fail +ERROR: invalid value for parameter "log_min_messages": "debug1, backend:error, fatal" +DETAIL: Redundant specification of default log level. +SET log_min_messages TO 'backend:error, debug1, backend:warning'; -- fail +ERROR: invalid value for parameter "log_min_messages": "backend:error, debug1, backend:warning" +DETAIL: Redundant log level specification for process type "backend". +SET log_min_messages TO 'backend:error, foo:fatal, archiver:debug1'; -- fail +ERROR: invalid value for parameter "log_min_messages": "backend:error, foo:fatal, archiver:debug1" +DETAIL: Unrecognized process type "foo". 
+SET log_min_messages TO 'backend:error, checkpointer:bar, archiver:debug1'; -- fail +ERROR: invalid value for parameter "log_min_messages": "backend:error, checkpointer:bar, archiver:debug1" +DETAIL: Unrecognized log level for process type "checkpointer": "bar". +SET log_min_messages TO 'backend:error, checkpointer:debug3, fatal, archiver:debug2, autovacuum:debug1, walsender:debug3'; +SHOW log_min_messages; + log_min_messages +------------------------------------------------------------------------------------------------- + fatal, archiver:debug2, autovacuum:debug1, backend:error, checkpointer:debug3, walsender:debug3 +(1 row) + +SET log_min_messages TO 'warning, autovacuum:debug1'; +SHOW log_min_messages; + log_min_messages +---------------------------- + warning, autovacuum:debug1 +(1 row) + +SET log_min_messages TO 'autovacuum:debug1, warning'; +SHOW log_min_messages; + log_min_messages +---------------------------- + warning, autovacuum:debug1 +(1 row) + +RESET log_min_messages; +-- -- Tests for function-local GUC settings -- set work_mem = '3MB'; diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql index bafaf067e8..dfb843fd3a 100644 --- a/src/test/regress/sql/guc.sql +++ b/src/test/regress/sql/guc.sql @@ -232,6 +232,28 @@ drop schema not_there_initially; select current_schemas(false); reset search_path; +-- +-- Test parsing of log_min_messages +-- + +SET log_min_messages TO foo; -- fail +SET log_min_messages TO fatal; +SHOW log_min_messages; +SET log_min_messages TO 'fatal'; +SHOW log_min_messages; +SET log_min_messages TO 'checkpointer:debug2, autovacuum:debug1'; -- fail +SET log_min_messages TO 'debug1, backend:error, fatal'; -- fail +SET log_min_messages TO 'backend:error, debug1, backend:warning'; -- fail +SET log_min_messages TO 'backend:error, foo:fatal, archiver:debug1'; -- fail +SET log_min_messages TO 'backend:error, checkpointer:bar, archiver:debug1'; -- fail +SET log_min_messages TO 'backend:error, checkpointer:debug3, fatal, archiver:debug2, autovacuum:debug1, walsender:debug3'; +SHOW log_min_messages; +SET log_min_messages TO 'warning, autovacuum:debug1'; +SHOW log_min_messages; +SET log_min_messages TO 'autovacuum:debug1, warning'; +SHOW log_min_messages; +RESET log_min_messages; + -- -- Tests for function-local GUC settings -- From d536aee5566354e42a1012da9dd3960e45402af5 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 9 Feb 2026 06:14:47 -0800 Subject: [PATCH 059/147] Require PGP-decrypted text to pass encoding validation. pgp_sym_decrypt() and pgp_pub_decrypt() will raise such errors, while bytea variants will not. The existing "dat3" test decrypted to non-UTF8 text, so switch that query to bytea. The long-term intent is for type "text" to always be valid in the database encoding. pgcrypto has long been known as a source of exceptions to that intent, but a report about exploiting invalid values of type "text" brought this module to the forefront. This particular exception is straightforward to fix, with reasonable effect on user queries. Back-patch to v14 (all supported versions). 
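The code change itself is small: validate the decrypted buffer before it
is returned as a text datum, as in the pgp-pgsql.c hunks below:

    res = decrypt_internal(0, 1, data, key, NULL, arg);
    /* report "invalid byte sequence" rather than return corrupt text */
    pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false);
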
Reported-by: Paul Gerste (as part of zeroday.cloud) Reported-by: Moritz Sanft (as part of zeroday.cloud) Author: shihao zhong Reviewed-by: cary huang Discussion: https://postgr.es/m/CAGRkXqRZyo0gLxPJqUsDqtWYBbgM14betsHiLRPj9mo2=z9VvA@mail.gmail.com Backpatch-through: 14 Security: CVE-2026-2006 --- contrib/pgcrypto/expected/pgp-decrypt.out | 23 ++++++++++++++++++++- contrib/pgcrypto/expected/pgp-decrypt_1.out | 23 ++++++++++++++++++++- contrib/pgcrypto/pgp-pgsql.c | 2 ++ contrib/pgcrypto/sql/pgp-decrypt.sql | 22 +++++++++++++++++++- 4 files changed, 67 insertions(+), 3 deletions(-) diff --git a/contrib/pgcrypto/expected/pgp-decrypt.out b/contrib/pgcrypto/expected/pgp-decrypt.out index eb049ba9d4..1db89e8c00 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt.out +++ b/contrib/pgcrypto/expected/pgp-decrypt.out @@ -315,7 +315,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== \xda39a3ee5e6b4b0d3255bfef95601890afd80709 (1 row) -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -387,6 +387,27 @@ ERROR: Wrong key or corrupt data select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data +-- NUL byte in text decrypt. Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n +SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== +=c2cz +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; +ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 +CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/expected/pgp-decrypt_1.out b/contrib/pgcrypto/expected/pgp-decrypt_1.out index 80a4c48613..d214e0bc0e 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt_1.out +++ b/contrib/pgcrypto/expected/pgp-decrypt_1.out @@ -311,7 +311,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== \xda39a3ee5e6b4b0d3255bfef95601890afd80709 (1 row) -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -383,6 +383,27 @@ ERROR: Wrong key or corrupt data select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data +-- NUL byte in text decrypt. 
Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n +SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== +=c2cz +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; +ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 +CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/pgp-pgsql.c b/contrib/pgcrypto/pgp-pgsql.c index 3e47b9364a..d3e7895b0d 100644 --- a/contrib/pgcrypto/pgp-pgsql.c +++ b/contrib/pgcrypto/pgp-pgsql.c @@ -631,6 +631,7 @@ pgp_sym_decrypt_text(PG_FUNCTION_ARGS) arg = PG_GETARG_TEXT_PP(2); res = decrypt_internal(0, 1, data, key, NULL, arg); + pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); PG_FREE_IF_COPY(data, 0); PG_FREE_IF_COPY(key, 1); @@ -732,6 +733,7 @@ pgp_pub_decrypt_text(PG_FUNCTION_ARGS) arg = PG_GETARG_TEXT_PP(3); res = decrypt_internal(1, 1, data, key, psw, arg); + pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); PG_FREE_IF_COPY(data, 0); PG_FREE_IF_COPY(key, 1); diff --git a/contrib/pgcrypto/sql/pgp-decrypt.sql b/contrib/pgcrypto/sql/pgp-decrypt.sql index 49a0267bbc..2fe498f2f0 100644 --- a/contrib/pgcrypto/sql/pgp-decrypt.sql +++ b/contrib/pgcrypto/sql/pgp-decrypt.sql @@ -228,7 +228,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== -----END PGP MESSAGE----- '), '0123456789abcdefghij'), 'sha1'); -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -282,6 +282,26 @@ VsxxqLSPzNLAeIspJk5G -- Routine text/binary mismatch. select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); +-- NUL byte in text decrypt. Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n +SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== +=c2cz +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; + -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. From 60e7ae41a6987ed05dcfe87bddaccac8e1e93126 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 9 Feb 2026 09:57:43 -0500 Subject: [PATCH 060/147] Guard against unexpected dimensions of oidvector/int2vector. These data types are represented like full-fledged arrays, but functions that deal specifically with these types assume that the array is 1-dimensional and contains no nulls. However, there are cast pathways that allow general oid[] or int2[] arrays to be cast to these types, allowing these expectations to be violated. 
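To make the hazard concrete, here is a hypothetical sketch of the unsafe
pattern this commit guards against (use_oid() is a stand-in for whatever
the function does with each element):

    oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);

    /*
     * This indexing is only safe if the array really is 1-D, null-free,
     * and of the expected element type.  If a multi-dimensional or
     * null-bearing oid[] has been cast to oidvector, dim1 and values[]
     * no longer line up with the stored elements, so the loop can read
     * memory beyond the actual data.
     */
    for (int i = 0; i < key->dim1; i++)
        use_oid(key->values[i]);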
This can be exploited to cause server memory disclosure or SIGSEGV.
Fix by installing explicit checks in functions that accept these types.

Reported-by: Altan Birler
Author: Tom Lane
Reviewed-by: Noah Misch
Security: CVE-2026-2003
Backpatch-through: 14
---
 src/backend/access/hash/hashfunc.c     |  3 +++
 src/backend/access/nbtree/nbtcompare.c |  4 ++++
 src/backend/utils/adt/format_type.c    |  6 ++++-
 src/backend/utils/adt/int.c            | 31 +++++++++++++++++++++++++-
 src/backend/utils/adt/oid.c            | 31 +++++++++++++++++++++++++-
 src/include/utils/builtins.h           |  1 +
 src/test/regress/expected/arrays.out   |  5 +++++
 src/test/regress/sql/arrays.sql        |  4 ++++
 8 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index 83bda209c4..036421fc66 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -27,6 +27,7 @@
 #include "postgres.h"

 #include "common/hashfn.h"
+#include "utils/builtins.h"
 #include "utils/float.h"
 #include "utils/fmgrprotos.h"
 #include "utils/pg_locale.h"
@@ -233,6 +234,7 @@ hashoidvector(PG_FUNCTION_ARGS)
 {
 	oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);

+	check_valid_oidvector(key);
 	return hash_any((unsigned char *) key->values,
 					key->dim1 * sizeof(Oid));
 }
@@ -241,6 +243,7 @@ hashoidvectorextended(PG_FUNCTION_ARGS)
 {
 	oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);

+	check_valid_oidvector(key);
 	return hash_any_extended((unsigned char *) key->values,
 							 key->dim1 * sizeof(Oid),
 							 PG_GETARG_INT64(1));
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 8425805a29..1d343377e9 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -57,6 +57,7 @@
 #include <limits.h>

+#include "utils/builtins.h"
 #include "utils/fmgrprotos.h"
 #include "utils/skipsupport.h"
 #include "utils/sortsupport.h"
@@ -587,6 +588,9 @@ btoidvectorcmp(PG_FUNCTION_ARGS)
 	oidvector  *b = (oidvector *) PG_GETARG_POINTER(1);
 	int			i;

+	check_valid_oidvector(a);
+	check_valid_oidvector(b);
+
 	/* We arbitrarily choose to sort first by vector length */
 	if (a->dim1 != b->dim1)
 		PG_RETURN_INT32(a->dim1 - b->dim1);
diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c
index 544205ca06..3cd5053d11 100644
--- a/src/backend/utils/adt/format_type.c
+++ b/src/backend/utils/adt/format_type.c
@@ -448,11 +448,15 @@ oidvectortypes(PG_FUNCTION_ARGS)
 {
 	oidvector  *oidArray = (oidvector *) PG_GETARG_POINTER(0);
 	char	   *result;
-	int			numargs = oidArray->dim1;
+	int			numargs;
 	int			num;
 	size_t		total;
 	size_t		left;

+	/* validate input before fetching dim1 */
+	check_valid_oidvector(oidArray);
+	numargs = oidArray->dim1;
+
 	total = 20 * numargs + 1;
 	result = palloc(total);
 	result[0] = '\0';
diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c
index d230262658..ff54d50ea9 100644
--- a/src/backend/utils/adt/int.c
+++ b/src/backend/utils/adt/int.c
@@ -134,6 +134,30 @@ buildint2vector(const int16 *int2s, int n)
 	return result;
 }

+/*
+ * validate that an array object meets the restrictions of int2vector
+ *
+ * We need this because there are pathways by which a general int2[] array can
+ * be cast to int2vector, allowing the type's restrictions to be violated.
+ * All code that receives an int2vector as a SQL parameter should check this.
+ */ +static void +check_valid_int2vector(const int2vector *int2Array) +{ + /* + * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because + * otherwise the array's layout will not be what calling code expects. We + * needn't be picky about the index lower bound though. Checking elemtype + * is just paranoia. + */ + if (int2Array->ndim != 1 || + int2Array->dataoffset != 0 || + int2Array->elemtype != INT2OID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("array is not a valid int2vector"))); +} + /* * int2vectorin - converts "num num ..." to internal form */ @@ -208,10 +232,14 @@ int2vectorout(PG_FUNCTION_ARGS) { int2vector *int2Array = (int2vector *) PG_GETARG_POINTER(0); int num, - nnums = int2Array->dim1; + nnums; char *rp; char *result; + /* validate input before fetching dim1 */ + check_valid_int2vector(int2Array); + nnums = int2Array->dim1; + /* assumes sign, 5 digits, ' ' */ rp = result = (char *) palloc(nnums * 7 + 1); for (num = 0; num < nnums; num++) @@ -272,6 +300,7 @@ int2vectorrecv(PG_FUNCTION_ARGS) Datum int2vectorsend(PG_FUNCTION_ARGS) { + /* We don't do check_valid_int2vector, since array_send won't care */ return array_send(fcinfo); } diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index 6f4c299dee..a341972897 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -107,6 +107,30 @@ buildoidvector(const Oid *oids, int n) return result; } +/* + * validate that an array object meets the restrictions of oidvector + * + * We need this because there are pathways by which a general oid[] array can + * be cast to oidvector, allowing the type's restrictions to be violated. + * All code that receives an oidvector as a SQL parameter should check this. + */ +void +check_valid_oidvector(const oidvector *oidArray) +{ + /* + * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because + * otherwise the array's layout will not be what calling code expects. We + * needn't be picky about the index lower bound though. Checking elemtype + * is just paranoia. + */ + if (oidArray->ndim != 1 || + oidArray->dataoffset != 0 || + oidArray->elemtype != OIDOID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("array is not a valid oidvector"))); +} + /* * oidvectorin - converts "num num ..." 
to internal form */ @@ -159,10 +183,14 @@ oidvectorout(PG_FUNCTION_ARGS) { oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0); int num, - nnums = oidArray->dim1; + nnums; char *rp; char *result; + /* validate input before fetching dim1 */ + check_valid_oidvector(oidArray); + nnums = oidArray->dim1; + /* assumes sign, 10 digits, ' ' */ rp = result = (char *) palloc(nnums * 12 + 1); for (num = 0; num < nnums; num++) @@ -225,6 +253,7 @@ oidvectorrecv(PG_FUNCTION_ARGS) Datum oidvectorsend(PG_FUNCTION_ARGS) { + /* We don't do check_valid_oidvector, since array_send won't care */ return array_send(fcinfo); } diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index cf57819ebd..5dcd788ff8 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -68,6 +68,7 @@ extern char *pg_ultostr(char *str, uint32 value); /* oid.c */ extern oidvector *buildoidvector(const Oid *oids, int n); +extern void check_valid_oidvector(const oidvector *oidArray); extern Oid oidparse(Node *node); extern int oid_cmp(const void *p1, const void *p2); diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index e1ab6dc278..66439d427a 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -1737,6 +1737,11 @@ select '[-2147483648:-2147483647]={1,2}'::int[]; (1 row) -- all of the above should be accepted +-- some day we might allow these cases, but for now they're errors: +select array[]::oidvector; +ERROR: array is not a valid oidvector +select array[]::int2vector; +ERROR: array is not a valid int2vector -- tests for array aggregates CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]); INSERT INTO arraggtest (f1, f2, f3) VALUES diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 450389831a..82837af7c4 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -528,6 +528,10 @@ select '[2147483646:2147483646]={1}'::int[]; select '[-2147483648:-2147483647]={1,2}'::int[]; -- all of the above should be accepted +-- some day we might allow these cases, but for now they're errors: +select array[]::oidvector; +select array[]::int2vector; + -- tests for array aggregates CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]); From 841d42cc4e2f9ca1cf59758fc15619b00a11e148 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 9 Feb 2026 10:07:31 -0500 Subject: [PATCH 061/147] Require superuser to install a non-built-in selectivity estimator. Selectivity estimators come in two flavors: those that make specific assumptions about the data types they are working with, and those that don't. Most of the built-in estimators are of the latter kind and are meant to be safely attachable to any operator. If the operator does not behave as the estimator expects, you might get a poor estimate, but it won't crash. However, estimators that do make datatype assumptions can malfunction if they are attached to the wrong operator, since then the data they get from pg_statistic may not be of the type they expect. This can rise to the level of a security problem, even permitting arbitrary code execution by a user who has the ability to create SQL objects. To close this hole, establish a rule that built-in estimators are required to protect themselves against being called on the wrong type of data. It does not seem practical however to expect estimators in extensions to reach a similar level of security, at least not in the near term. 
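As a sketch of what "protecting themselves" means here, patterned on the
tsmatchsel() change in this commit: verify the types before trusting what
pg_statistic hands back, and fall back to the module's existing default
constant (DEFAULT_TS_MATCH_SEL) otherwise.

    if (((Const *) other)->consttype == TSQUERYOID &&
        vardata.vartype == TSVECTOROID)
        selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
    else
        selec = DEFAULT_TS_MATCH_SEL;	/* default guess */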
Therefore, also establish a rule that superuser privilege is required to attach a non-built-in estimator to an operator. We expect that this restriction will have little negative impact on extensions, since estimators generally have to be written in C and thus superuser privilege is required to create them in the first place. This commit changes the privilege checks in CREATE/ALTER OPERATOR to enforce the rule about superuser privilege, and fixes a couple of built-in estimators that were making datatype assumptions without sufficiently checking that they're valid. Reported-by: Daniel Firer as part of zeroday.cloud Author: Tom Lane Reviewed-by: Noah Misch Security: CVE-2026-2004 Backpatch-through: 14 --- src/backend/commands/operatorcmds.c | 57 +++++++++++++++++++----- src/backend/tsearch/ts_selfuncs.c | 8 ++-- src/backend/utils/adt/network_selfuncs.c | 48 +++++++++++++------- 3 files changed, 83 insertions(+), 30 deletions(-) diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c index 9f7e0ed17c..3e7b09b349 100644 --- a/src/backend/commands/operatorcmds.c +++ b/src/backend/commands/operatorcmds.c @@ -276,7 +276,6 @@ ValidateRestrictionEstimator(List *restrictionName) { Oid typeId[4]; Oid restrictionOid; - AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ typeId[1] = OIDOID; /* operator OID */ @@ -292,11 +291,33 @@ ValidateRestrictionEstimator(List *restrictionName) errmsg("restriction estimator function %s must return type %s", NameListToString(restrictionName), "float8"))); - /* Require EXECUTE rights for the estimator */ - aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, GetUserId(), ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_FUNCTION, - NameListToString(restrictionName)); + /* + * If the estimator is not a built-in function, require superuser + * privilege to install it. This protects against using something that is + * not a restriction estimator or has hard-wired assumptions about what + * data types it is working with. (Built-in estimators are required to + * defend themselves adequately against unexpected data type choices, but + * it seems impractical to expect that of extensions' estimators.) + * + * If it is built-in, only require EXECUTE rights. 
+ */ + if (restrictionOid >= FirstGenbkiObjectId) + { + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to specify a non-built-in restriction estimator function"))); + } + else + { + AclResult aclresult; + + aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, + GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + NameListToString(restrictionName)); + } return restrictionOid; } @@ -312,7 +333,6 @@ ValidateJoinEstimator(List *joinName) Oid typeId[5]; Oid joinOid; Oid joinOid2; - AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ typeId[1] = OIDOID; /* operator OID */ @@ -350,11 +370,24 @@ ValidateJoinEstimator(List *joinName) errmsg("join estimator function %s must return type %s", NameListToString(joinName), "float8"))); - /* Require EXECUTE rights for the estimator */ - aclresult = object_aclcheck(ProcedureRelationId, joinOid, GetUserId(), ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_FUNCTION, - NameListToString(joinName)); + /* privilege checks are the same as in ValidateRestrictionEstimator */ + if (joinOid >= FirstGenbkiObjectId) + { + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to specify a non-built-in join estimator function"))); + } + else + { + AclResult aclresult; + + aclresult = object_aclcheck(ProcedureRelationId, joinOid, + GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + NameListToString(joinName)); + } return joinOid; } diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 5afa6e4bad..64b60bb951 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -108,12 +108,14 @@ tsmatchsel(PG_FUNCTION_ARGS) * OK, there's a Var and a Const we're dealing with here. We need the * Const to be a TSQuery, else we can't do anything useful. We have to * check this because the Var might be the TSQuery not the TSVector. + * + * Also check that the Var really is a TSVector, in case this estimator is + * mistakenly attached to some other operator. 
*/ - if (((Const *) other)->consttype == TSQUERYOID) + if (((Const *) other)->consttype == TSQUERYOID && + vardata.vartype == TSVECTOROID) { /* tsvector @@ tsquery or the other way around */ - Assert(vardata.vartype == TSVECTOROID); - selec = tsquerysel(&vardata, ((Const *) other)->constvalue); } else diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index 902f9c25db..2a8d2ded90 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -43,9 +43,9 @@ /* Maximum number of items to consider in join selectivity calculations */ #define MAX_CONSIDERED_ELEMS 1024 -static Selectivity networkjoinsel_inner(Oid operator, +static Selectivity networkjoinsel_inner(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2); -static Selectivity networkjoinsel_semi(Oid operator, +static Selectivity networkjoinsel_semi(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2); static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues); static Selectivity inet_hist_value_sel(const Datum *values, int nvalues, @@ -82,6 +82,7 @@ networksel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); + int opr_codenum; VariableStatData vardata; Node *other; bool varonleft; @@ -95,6 +96,14 @@ networksel(PG_FUNCTION_ARGS) nullfrac; FmgrInfo proc; + /* + * Before all else, verify that the operator is one of the ones supported + * by this function, which in turn proves that the input datatypes are + * what we expect. Otherwise, attaching this selectivity function to some + * unexpected operator could cause trouble. + */ + opr_codenum = inet_opr_codenum(operator); + /* * If expression is not (variable op something) or (something op * variable), then punt and return a default estimate. @@ -150,13 +159,12 @@ networksel(PG_FUNCTION_ARGS) STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES)) { - int opr_codenum = inet_opr_codenum(operator); + int h_codenum; /* Commute if needed, so we can consider histogram to be on the left */ - if (!varonleft) - opr_codenum = -opr_codenum; + h_codenum = varonleft ? opr_codenum : -opr_codenum; non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues, - constvalue, opr_codenum); + constvalue, h_codenum); free_attstatsslot(&hslot); } @@ -203,10 +211,19 @@ networkjoinsel(PG_FUNCTION_ARGS) #endif SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); double selec; + int opr_codenum; VariableStatData vardata1; VariableStatData vardata2; bool join_is_reversed; + /* + * Before all else, verify that the operator is one of the ones supported + * by this function, which in turn proves that the input datatypes are + * what we expect. Otherwise, attaching this selectivity function to some + * unexpected operator could cause trouble. + */ + opr_codenum = inet_opr_codenum(operator); + get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &join_is_reversed); @@ -220,15 +237,18 @@ networkjoinsel(PG_FUNCTION_ARGS) * Selectivity for left/full join is not exactly the same as inner * join, but we neglect the difference, as eqjoinsel does. */ - selec = networkjoinsel_inner(operator, &vardata1, &vardata2); + selec = networkjoinsel_inner(operator, opr_codenum, + &vardata1, &vardata2); break; case JOIN_SEMI: case JOIN_ANTI: /* Here, it's important that we pass the outer var on the left. 
*/ if (!join_is_reversed) - selec = networkjoinsel_semi(operator, &vardata1, &vardata2); + selec = networkjoinsel_semi(operator, opr_codenum, + &vardata1, &vardata2); else selec = networkjoinsel_semi(get_commutator(operator), + -opr_codenum, &vardata2, &vardata1); break; default: @@ -260,7 +280,7 @@ networkjoinsel(PG_FUNCTION_ARGS) * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner(). */ static Selectivity -networkjoinsel_inner(Oid operator, +networkjoinsel_inner(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; @@ -273,7 +293,6 @@ networkjoinsel_inner(Oid operator, mcv2_exists = false, hist1_exists = false, hist2_exists = false; - int opr_codenum; int mcv1_length = 0, mcv2_length = 0; AttStatsSlot mcv1_slot; @@ -325,8 +344,6 @@ networkjoinsel_inner(Oid operator, memset(&hist2_slot, 0, sizeof(hist2_slot)); } - opr_codenum = inet_opr_codenum(operator); - /* * Calculate selectivity for MCV vs MCV matches. */ @@ -387,7 +404,7 @@ networkjoinsel_inner(Oid operator, * histogram selectivity for semi/anti join cases. */ static Selectivity -networkjoinsel_semi(Oid operator, +networkjoinsel_semi(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; @@ -401,7 +418,6 @@ networkjoinsel_semi(Oid operator, mcv2_exists = false, hist1_exists = false, hist2_exists = false; - int opr_codenum; FmgrInfo proc; int i, mcv1_length = 0, @@ -455,7 +471,6 @@ networkjoinsel_semi(Oid operator, memset(&hist2_slot, 0, sizeof(hist2_slot)); } - opr_codenum = inet_opr_codenum(operator); fmgr_info(get_opcode(operator), &proc); /* Estimate number of input rows represented by RHS histogram. */ @@ -827,6 +842,9 @@ inet_semi_join_sel(Datum lhs_value, /* * Assign useful code numbers for the subnet inclusion/overlap operators * + * This will throw an error if the operator is not one of the ones we + * support in networksel() and networkjoinsel(). + * * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend * on the exact codes assigned here; but many other places in this file * know that they can negate a code to obtain the code for the commutator From 8ebdf41c262ccd86407ca684aab3113bdbcf2c66 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 9 Feb 2026 10:14:22 -0500 Subject: [PATCH 062/147] Harden _int_matchsel() against being attached to the wrong operator. While the preceding commit prevented such attachments from occurring in future, this one aims to prevent further abuse of any already- created operator that exposes _int_matchsel to the wrong data types. (No other contrib module has a vulnerable selectivity estimator.) We need only check that the Const we've found in the query is indeed of the type we expect (query_int), but there's a difficulty: as an extension type, query_int doesn't have a fixed OID that we could hard-code into the estimator. Therefore, the bulk of this patch consists of infrastructure to let an extension function securely look up the OID of a datatype belonging to the same extension. (Extension authors have requested such functionality before, so we anticipate that this code will have additional non-security uses, and may soon be extended to allow looking up other kinds of SQL objects.) This is done by first finding the extension that owns the calling function (there can be only one), and then thumbing through the objects owned by that extension to find a type that has the desired name. 
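In caller terms, the lookup amounts to the following sketch, mirroring the
_int_matchsel() change in this commit (DEFAULT_EQ_SEL is the generic
fallback selectivity):

    /*
     * Resolve query_int's OID from the calling function's own OID, then
     * verify the Const really is of that type before dereferencing it.
     */
    if (((Const *) other)->consttype !=
        get_function_sibling_type(fcinfo->flinfo->fn_oid, "query_int"))
    {
        ReleaseVariableStats(vardata);
        PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
    }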
This is relatively expensive, especially for large extensions, so a simple cache is put in front of these lookups. Reported-by: Daniel Firer as part of zeroday.cloud Author: Tom Lane Reviewed-by: Noah Misch Security: CVE-2026-2004 Backpatch-through: 14 --- contrib/intarray/_int_selfuncs.c | 14 +++- src/backend/catalog/pg_depend.c | 73 +++++++++++++++++ src/backend/commands/extension.c | 129 +++++++++++++++++++++++++++++++ src/include/catalog/dependency.h | 2 + src/include/commands/extension.h | 2 + src/tools/pgindent/typedefs.list | 1 + 6 files changed, 220 insertions(+), 1 deletion(-) diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c index 4a7053028c..7fce743632 100644 --- a/contrib/intarray/_int_selfuncs.c +++ b/contrib/intarray/_int_selfuncs.c @@ -19,6 +19,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "commands/extension.h" #include "miscadmin.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" @@ -170,7 +171,18 @@ _int_matchsel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(0.0); } - /* The caller made sure the const is a query, so get it now */ + /* + * Verify that the Const is a query_int, else return a default estimate. + * (This could only fail if someone attached this estimator to the wrong + * operator.) + */ + if (((Const *) other)->consttype != + get_function_sibling_type(fcinfo->flinfo->fn_oid, "query_int")) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + } + query = DatumGetQueryTypeP(((Const *) other)->constvalue); /* Empty query matches nothing */ diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index 55309d16f1..07c2d41c18 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -23,12 +23,14 @@ #include "catalog/pg_constraint.h" #include "catalog/pg_depend.h" #include "catalog/pg_extension.h" +#include "catalog/pg_type.h" #include "catalog/partition.h" #include "commands/extension.h" #include "miscadmin.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/syscache.h" static bool isObjectPinned(const ObjectAddress *object); @@ -813,6 +815,77 @@ getAutoExtensionsOfObject(Oid classId, Oid objectId) return result; } +/* + * Look up a type belonging to an extension. + * + * Returns the type's OID, or InvalidOid if not found. + * + * Notice that the type is specified by name only, without a schema. + * That's because this will typically be used by relocatable extensions + * which can't make a-priori assumptions about which schema their objects + * are in. As long as the extension only defines one type of this name, + * the answer is unique anyway. + * + * We might later add the ability to look up functions, operators, etc. 
+ */ +Oid +getExtensionType(Oid extensionOid, const char *typname) +{ + Oid result = InvalidOid; + Relation depRel; + ScanKeyData key[3]; + SysScanDesc scan; + HeapTuple tup; + + depRel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(ExtensionRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(extensionOid)); + ScanKeyInit(&key[2], + Anum_pg_depend_refobjsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(0)); + + scan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 3, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup); + + if (depform->classid == TypeRelationId && + depform->deptype == DEPENDENCY_EXTENSION) + { + Oid typoid = depform->objid; + HeapTuple typtup; + + typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); + if (!HeapTupleIsValid(typtup)) + continue; /* should we throw an error? */ + if (strcmp(NameStr(((Form_pg_type) GETSTRUCT(typtup))->typname), + typname) == 0) + { + result = typoid; + ReleaseSysCache(typtup); + break; /* no need to keep searching */ + } + ReleaseSysCache(typtup); + } + } + + systable_endscan(scan); + + table_close(depRel, AccessShareLock); + + return result; +} + /* * Detect whether a sequence is marked as "owned" by a column * diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 688f1874f3..72fdd7511b 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -45,6 +45,7 @@ #include "catalog/pg_depend.h" #include "catalog/pg_extension.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "commands/alter.h" #include "commands/comment.h" @@ -62,6 +63,7 @@ #include "utils/builtins.h" #include "utils/conffiles.h" #include "utils/fmgroids.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -141,7 +143,26 @@ typedef struct char *loc; } ExtensionLocation; +/* + * Cache structure for get_function_sibling_type (and maybe later, + * allied lookup functions). + */ +typedef struct ExtensionSiblingCache +{ + struct ExtensionSiblingCache *next; /* list link */ + /* lookup key: requesting function's OID and type name */ + Oid reqfuncoid; + const char *typname; + bool valid; /* is entry currently valid? */ + uint32 exthash; /* cache hash of owning extension's OID */ + Oid typeoid; /* OID associated with typname */ +} ExtensionSiblingCache; + +/* Head of linked list of ExtensionSiblingCache structs */ +static ExtensionSiblingCache *ext_sibling_list = NULL; + /* Local functions */ +static void ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue); static List *find_update_path(List *evi_list, ExtensionVersionInfo *evi_start, ExtensionVersionInfo *evi_target, @@ -263,6 +284,114 @@ get_extension_schema(Oid ext_oid) return result; } +/* + * get_function_sibling_type - find a type belonging to same extension as func + * + * Returns the type's OID, or InvalidOid if not found. + * + * This is useful in extensions, which won't have fixed object OIDs. + * We work from the calling function's own OID, which it can get from its + * FunctionCallInfo parameter, and look up the owning extension and thence + * a type belonging to the same extension. + * + * Notice that the type is specified by name only, without a schema. 
+ * That's because this will typically be used by relocatable extensions + * which can't make a-priori assumptions about which schema their objects + * are in. As long as the extension only defines one type of this name, + * the answer is unique anyway. + * + * We might later add the ability to look up functions, operators, etc. + * + * This code is simply a frontend for some pg_depend lookups. Those lookups + * are fairly expensive, so we provide a simple cache facility. We assume + * that the passed typname is actually a C constant, or at least permanently + * allocated, so that we need not copy that string. + */ +Oid +get_function_sibling_type(Oid funcoid, const char *typname) +{ + ExtensionSiblingCache *cache_entry; + Oid extoid; + Oid typeoid; + + /* + * See if we have the answer cached. Someday there may be enough callers + * to justify a hash table, but for now, a simple linked list is fine. + */ + for (cache_entry = ext_sibling_list; cache_entry != NULL; + cache_entry = cache_entry->next) + { + if (funcoid == cache_entry->reqfuncoid && + strcmp(typname, cache_entry->typname) == 0) + break; + } + if (cache_entry && cache_entry->valid) + return cache_entry->typeoid; + + /* + * Nope, so do the expensive lookups. We do not expect failures, so we do + * not cache negative results. + */ + extoid = getExtensionOfObject(ProcedureRelationId, funcoid); + if (!OidIsValid(extoid)) + return InvalidOid; + typeoid = getExtensionType(extoid, typname); + if (!OidIsValid(typeoid)) + return InvalidOid; + + /* + * Build, or revalidate, cache entry. + */ + if (cache_entry == NULL) + { + /* Register invalidation hook if this is first entry */ + if (ext_sibling_list == NULL) + CacheRegisterSyscacheCallback(EXTENSIONOID, + ext_sibling_callback, + (Datum) 0); + + /* Momentarily zero the space to ensure valid flag is false */ + cache_entry = (ExtensionSiblingCache *) + MemoryContextAllocZero(CacheMemoryContext, + sizeof(ExtensionSiblingCache)); + cache_entry->next = ext_sibling_list; + ext_sibling_list = cache_entry; + } + + cache_entry->reqfuncoid = funcoid; + cache_entry->typname = typname; + cache_entry->exthash = GetSysCacheHashValue1(EXTENSIONOID, + ObjectIdGetDatum(extoid)); + cache_entry->typeoid = typeoid; + /* Mark it valid only once it's fully populated */ + cache_entry->valid = true; + + return typeoid; +} + +/* + * ext_sibling_callback + * Syscache inval callback function for EXTENSIONOID cache + * + * It seems sufficient to invalidate ExtensionSiblingCache entries when + * the owning extension's pg_extension entry is modified or deleted. + * Neither a requesting function's OID, nor the OID of the object it's + * looking for, could change without an extension update or drop/recreate. 
+ */ +static void +ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue) +{ + ExtensionSiblingCache *cache_entry; + + for (cache_entry = ext_sibling_list; cache_entry != NULL; + cache_entry = cache_entry->next) + { + if (hashvalue == 0 || + cache_entry->exthash == hashvalue) + cache_entry->valid = false; + } +} + /* * Utility functions to check validity of extension and version names */ diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 969fd8b23f..2f3c1eae3c 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -186,6 +186,8 @@ extern long changeDependenciesOn(Oid refClassId, Oid oldRefObjectId, extern Oid getExtensionOfObject(Oid classId, Oid objectId); extern List *getAutoExtensionsOfObject(Oid classId, Oid objectId); +extern Oid getExtensionType(Oid extensionOid, const char *typname); + extern bool sequenceIsOwned(Oid seqId, char deptype, Oid *tableId, int32 *colId); extern List *getOwnedSequences(Oid relid); extern Oid getIdentitySequence(Relation rel, AttrNumber attnum, bool missing_ok); diff --git a/src/include/commands/extension.h b/src/include/commands/extension.h index 4ebc2bac22..7a76bdebcf 100644 --- a/src/include/commands/extension.h +++ b/src/include/commands/extension.h @@ -52,6 +52,8 @@ extern char *get_extension_name(Oid ext_oid); extern Oid get_extension_schema(Oid ext_oid); extern bool extension_file_exists(const char *extensionName); +extern Oid get_function_sibling_type(Oid funcoid, const char *typname); + extern ObjectAddress AlterExtensionNamespace(const char *extensionName, const char *newschema, Oid *oldschema); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7751848941..7619845fba 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -803,6 +803,7 @@ ExtensibleNodeMethods ExtensionControlFile ExtensionInfo ExtensionLocation +ExtensionSiblingCache ExtensionVersionInfo FDWCollateState FD_SET From c5dc75479b1525a3aa1daaf79028fa5af159800e Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 9 Feb 2026 09:08:10 -0800 Subject: [PATCH 063/147] Fix test "NUL byte in text decrypt" for --without-zlib builds. Backpatch-through: 14 Security: CVE-2026-2006 --- contrib/pgcrypto/expected/pgp-decrypt.out | 9 +++++---- contrib/pgcrypto/expected/pgp-decrypt_1.out | 9 +++++---- contrib/pgcrypto/sql/pgp-decrypt.sql | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/contrib/pgcrypto/expected/pgp-decrypt.out b/contrib/pgcrypto/expected/pgp-decrypt.out index 1db89e8c00..8ce6466f2e 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt.out +++ b/contrib/pgcrypto/expected/pgp-decrypt.out @@ -388,7 +388,8 @@ select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data -- NUL byte in text decrypt. 
Ciphertext source: --- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ +-- --personal-compress-preferences uncompressed --textmode \ -- --personal-cipher-preferences aes --no-emit-version --batch \ -- --symmetric --passphrase key --armor do $$ @@ -396,9 +397,9 @@ begin perform pgp_sym_decrypt(dearmor(' -----BEGIN PGP MESSAGE----- -jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n -SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== -=c2cz +jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH +vu0YlJP5D5BX7yqZ+Pry7TlDmiFO +=rV7z -----END PGP MESSAGE----- '), 'key', 'debug=1'); exception when others then diff --git a/contrib/pgcrypto/expected/pgp-decrypt_1.out b/contrib/pgcrypto/expected/pgp-decrypt_1.out index d214e0bc0e..ee57ad43cb 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt_1.out +++ b/contrib/pgcrypto/expected/pgp-decrypt_1.out @@ -384,7 +384,8 @@ select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data -- NUL byte in text decrypt. Ciphertext source: --- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ +-- --personal-compress-preferences uncompressed --textmode \ -- --personal-cipher-preferences aes --no-emit-version --batch \ -- --symmetric --passphrase key --armor do $$ @@ -392,9 +393,9 @@ begin perform pgp_sym_decrypt(dearmor(' -----BEGIN PGP MESSAGE----- -jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n -SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== -=c2cz +jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH +vu0YlJP5D5BX7yqZ+Pry7TlDmiFO +=rV7z -----END PGP MESSAGE----- '), 'key', 'debug=1'); exception when others then diff --git a/contrib/pgcrypto/sql/pgp-decrypt.sql b/contrib/pgcrypto/sql/pgp-decrypt.sql index 2fe498f2f0..b499bf757b 100644 --- a/contrib/pgcrypto/sql/pgp-decrypt.sql +++ b/contrib/pgcrypto/sql/pgp-decrypt.sql @@ -283,7 +283,8 @@ VsxxqLSPzNLAeIspJk5G select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); -- NUL byte in text decrypt. Ciphertext source: --- printf 'a\x00\xc' | gpg --homedir /nonexistent --textmode \ +-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ +-- --personal-compress-preferences uncompressed --textmode \ -- --personal-cipher-preferences aes --no-emit-version --batch \ -- --symmetric --passphrase key --armor do $$ @@ -291,9 +292,9 @@ begin perform pgp_sym_decrypt(dearmor(' -----BEGIN PGP MESSAGE----- -jA0EBwMCLd9OvySmZNZg0jgBe7vGTmnje5HGXI+zsIQ99WPZu4Zs/P6pQcZ+HZ4n -SZQHOfE8tagjB6Rqow82QpSBiOfWn4qjhQ== -=c2cz +jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH +vu0YlJP5D5BX7yqZ+Pry7TlDmiFO +=rV7z -----END PGP MESSAGE----- '), 'key', 'debug=1'); exception when others then From 18f0afb2a635b433e778684acabffe1e52da8a86 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 9 Feb 2026 19:15:44 +0200 Subject: [PATCH 064/147] Fix incorrect iteration type in extension_file_exists() Commit f3c9e341cd changed the type of objects in the List that get_extension_control_directories() returns, from "char *" to "ExtensionLocation *", but missed adjusting this one caller. 
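In short, the mismatch was of this shape (sketch; ExtensionLocation and
its loc field are as defined in extension.c):

    /* wrong: treats each list cell as a char * path */
    foreach_ptr(char, location, locations)
        dir = AllocateDir(location);

    /* right: each cell is an ExtensionLocation; the path is its loc field */
    foreach_ptr(ExtensionLocation, location, locations)
        dir = AllocateDir(location->loc);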
Author: Chao Li
Discussion: https://www.postgresql.org/message-id/362EA9B3-589B-475A-A16E-F10C30426E28@gmail.com
---
 src/backend/commands/extension.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index 72fdd7511b..81f24615d5 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -2686,9 +2686,9 @@ extension_file_exists(const char *extensionName)

 	locations = get_extension_control_directories();

-	foreach_ptr(char, location, locations)
+	foreach_ptr(ExtensionLocation, location, locations)
 	{
-		dir = AllocateDir(location);
+		dir = AllocateDir(location->loc);

 		/*
 		 * If the control directory doesn't exist, we want to silently return
@@ -2700,7 +2700,7 @@ extension_file_exists(const char *extensionName)
 		}
 		else
 		{
-			while ((de = ReadDir(dir, location)) != NULL)
+			while ((de = ReadDir(dir, location->loc)) != NULL)
 			{
 				char	   *extname;

From cbef472558ca50d282414e68083717c44b92ad62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?=
Date: Mon, 9 Feb 2026 19:15:20 +0100
Subject: [PATCH 065/147] Remove HeapTupleHeaderSetXminCommitted/Invalid
 functions

They are not and never have been used by any known code -- apparently
we just cargo-culted them in commit 37484ad2aace (or their ancestor
macros anyway, which begat these functions in commit 34694ec888d6).

Allegedly they're also potentially dangerous; users are better off
going through HeapTupleSetHintBits instead.

Author: Andy Fan
Discussion: https://postgr.es/m/87sejogt4g.fsf@163.com
---
 src/include/access/htup_details.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index d406825ff2..75f8b159b8 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -357,20 +357,6 @@ HeapTupleHeaderXminFrozen(const HeapTupleHeaderData *tup)
 	return (tup->t_infomask & HEAP_XMIN_FROZEN) == HEAP_XMIN_FROZEN;
 }

-static inline void
-HeapTupleHeaderSetXminCommitted(HeapTupleHeaderData *tup)
-{
-	Assert(!HeapTupleHeaderXminInvalid(tup));
-	tup->t_infomask |= HEAP_XMIN_COMMITTED;
-}
-
-static inline void
-HeapTupleHeaderSetXminInvalid(HeapTupleHeaderData *tup)
-{
-	Assert(!HeapTupleHeaderXminCommitted(tup));
-	tup->t_infomask |= HEAP_XMIN_INVALID;
-}
-
 static inline void
 HeapTupleHeaderSetXminFrozen(HeapTupleHeaderData *tup)
 {

From 73d60ac385a93684f68297ae0ccb8f75bc6f23e1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 9 Feb 2026 20:26:23 +0200
Subject: [PATCH 066/147] cleanup: Deadlock checker is no longer called from a
 signal handler

Clean up a few leftovers from when the deadlock checker was called from
a signal handler. We stopped doing that in commit 6753333f55, in 2015.

- CheckDeadLock can return its result directly to the caller; there's
  no need to use a global variable for that.
- Remove outdated comments that claimed that CheckDeadLock "signals
  ProcSleep".
- It should be OK to ereport() from DeadLockCheck now.

I considered getting rid of InitDeadLockChecking() and moving the
workspace allocations into DeadLockCheck, but it's still good to avoid
doing the allocations while we're holding all the partition locks. So
just update the comment to give that as the reason we do the
allocations up front.
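The first item reduces to this before/after sketch (abridged from the
proc.c hunks below):

    /* before: CheckDeadLock() reported back through a file-scope variable */
    static DeadLockState deadlock_state = DS_NOT_YET_CHECKED;
    ...
    CheckDeadLock();

    /* after: the caller receives the state as an ordinary return value */
    DeadLockState	deadlock_state;
    ...
    deadlock_state = CheckDeadLock();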
--- src/backend/storage/lmgr/deadlock.c | 10 ++++------ src/backend/storage/lmgr/proc.c | 28 ++++++++++++++++------------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c index 8334a88761..0a8dd5eb7c 100644 --- a/src/backend/storage/lmgr/deadlock.c +++ b/src/backend/storage/lmgr/deadlock.c @@ -135,10 +135,9 @@ static PGPROC *blocking_autovacuum_proc = NULL; * This does per-backend initialization of the deadlock checker; primarily, * allocation of working memory for DeadLockCheck. We do this per-backend * since there's no percentage in making the kernel do copy-on-write - * inheritance of workspace from the postmaster. We want to allocate the - * space at startup because (a) the deadlock checker might be invoked when - * there's no free memory left, and (b) the checker is normally run inside a - * signal handler, which is a very dangerous place to invoke palloc from. + * inheritance of workspace from the postmaster. We allocate the space at + * startup because the deadlock checker is run with all the partitions of the + * lock table locked, and we want to keep that section as short as possible. */ void InitDeadLockChecking(void) @@ -213,8 +212,7 @@ InitDeadLockChecking(void) * * On failure, deadlock details are recorded in deadlockDetails[] for * subsequent printing by DeadLockReport(). That activity is separate - * because (a) we don't want to do it while holding all those LWLocks, - * and (b) we are typically invoked inside a signal handler. + * because we don't want to do it while holding all those LWLocks. */ DeadLockState DeadLockCheck(PGPROC *proc) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index c7a001b3b7..8560a903bc 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -80,15 +80,13 @@ PROC_HDR *ProcGlobal = NULL; NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL; PGPROC *PreparedXactProcs = NULL; -static DeadLockState deadlock_state = DS_NOT_YET_CHECKED; - /* Is a deadlock check pending? */ static volatile sig_atomic_t got_deadlock_timeout; static void RemoveProcFromArray(int code, Datum arg); static void ProcKill(int code, Datum arg); static void AuxiliaryProcKill(int code, Datum arg); -static void CheckDeadLock(void); +static DeadLockState CheckDeadLock(void); /* @@ -1321,6 +1319,7 @@ ProcSleep(LOCALLOCK *locallock) bool allow_autovacuum_cancel = true; bool logged_recovery_conflict = false; ProcWaitStatus myWaitStatus; + DeadLockState deadlock_state; /* The caller must've armed the on-error cleanup mechanism */ Assert(GetAwaitedLock() == locallock); @@ -1461,7 +1460,7 @@ ProcSleep(LOCALLOCK *locallock) /* check for deadlocks first, as that's probably log-worthy */ if (got_deadlock_timeout) { - CheckDeadLock(); + deadlock_state = CheckDeadLock(); got_deadlock_timeout = false; } CHECK_FOR_INTERRUPTS(); @@ -1784,14 +1783,14 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock) * * We only get to this routine, if DEADLOCK_TIMEOUT fired while waiting for a * lock to be released by some other process. Check if there's a deadlock; if - * not, just return. (But signal ProcSleep to log a message, if - * log_lock_waits is true.) If we have a real deadlock, remove ourselves from - * the lock's wait queue and signal an error to ProcSleep. + * not, just return. If we have a real deadlock, remove ourselves from the + * lock's wait queue. 
*/ -static void +static DeadLockState CheckDeadLock(void) { int i; + DeadLockState result; /* * Acquire exclusive lock on the entire shared lock data structures. Must @@ -1818,17 +1817,20 @@ CheckDeadLock(void) */ if (MyProc->links.prev == NULL || MyProc->links.next == NULL) + { + result = DS_NO_DEADLOCK; goto check_done; + } #ifdef LOCK_DEBUG if (Debug_deadlocks) DumpAllLocks(); #endif - /* Run the deadlock check, and set deadlock_state for use by ProcSleep */ - deadlock_state = DeadLockCheck(MyProc); + /* Run the deadlock check */ + result = DeadLockCheck(MyProc); - if (deadlock_state == DS_HARD_DEADLOCK) + if (result == DS_HARD_DEADLOCK) { /* * Oops. We have a deadlock. @@ -1840,7 +1842,7 @@ CheckDeadLock(void) * * RemoveFromWaitQueue sets MyProc->waitStatus to * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we - * return from the signal handler. + * return. */ Assert(MyProc->waitLock != NULL); RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag))); @@ -1867,6 +1869,8 @@ CheckDeadLock(void) check_done: for (i = NUM_LOCK_PARTITIONS; --i >= 0;) LWLockRelease(LockHashPartitionLockByIndex(i)); + + return result; } /* From 158408fef8b96907bb14f89654dd2beab27ff030 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 9 Feb 2026 14:58:02 -0600 Subject: [PATCH 067/147] pg_upgrade: Fix handling of pg_largeobject_metadata. For binary upgrades from v16 or newer, pg_upgrade transfers the files for pg_largeobject_metadata from the old cluster, as opposed to using COPY or ordinary SQL commands to reconstruct its contents. While this approach adds complexity, it can greatly reduce pg_upgrade's runtime when there are many large objects. Large objects with comments or security labels are one source of complexity for this approach. During pg_upgrade, schema restoration happens before files are transferred. Comments and security labels are transferred in the former step, but the COMMENT and SECURITY LABEL commands will fail if their corresponding large objects do not exist. To deal with this, pg_upgrade first copies only the rows of pg_largeobject_metadata that are needed to avoid failures. Later, pg_upgrade overwrites those rows by replacing pg_largeobject_metadata's files with its files in the old cluster. Unfortunately, there's a subtle problem here. Simply put, there's no guarantee that pg_upgrade will overwrite all of pg_largeobject_metadata's files on the new cluster. For example, the new cluster's version might more aggressively extend relations or create visibility maps, and pg_upgrade's file transfer code is not sophisticated enough to remove files that lack counterparts in the old cluster. These extra files could cause problems post-upgrade. More fortunately, we can simultaneously fix the aforementioned problem and further optimize binary upgrades for clusters with many large objects. If we teach the COMMENT and SECURITY LABEL commands to allow nonexistent large objects during binary upgrades, pg_upgrade no longer needs to transfer pg_largeobject_metadata's contents beforehand. This approach also allows us to remove the associated dependency tracking from pg_dump, even for upgrades from v12-v15 that use COPY to transfer pg_largeobject_metadata's contents. In addition to what is described in the previous paragraph, this commit modifies the query in getLOs() to only retrieve LOs with comments or security labels for upgrades from v12 or newer. We have long assumed that such usage is rare, so this should reduce pg_upgrade's memory usage and runtime in many cases. 
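Concretely, the change in CommentObject() and ExecSecLabelStmt() is just
this (sketch, matching the diff below):

    /*
     * During binary upgrade, tolerate nonexistent large objects; their
     * metadata arrives later via COPY or by transferring
     * pg_largeobject_metadata's files from the old cluster.
     */
    missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT;

    address = get_object_address(stmt->objtype, stmt->object,
                                 &relation, ShareUpdateExclusiveLock,
                                 missing_ok);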
We might also be able to remove the "upgrades from v12 or newer" restriction on the recent batch of optimizations by adding special handling for pg_largeobject_metadata's hidden OID column on older versions (since this catalog previously used the now-removed WITH OIDS feature), but that is left as a future exercise. Reported-by: Andres Freund Reviewed-by: Andres Freund Discussion: https://postgr.es/m/3yd2ss6n7xywo6pmhd7jjh3bqwgvx35bflzgv3ag4cnzfkik7m%40hiyadppqxx6w --- src/backend/commands/comment.c | 12 +++- src/backend/commands/seclabel.c | 12 +++- src/bin/pg_dump/pg_dump.c | 109 ++++++++++++++------------------ 3 files changed, 70 insertions(+), 63 deletions(-) diff --git a/src/backend/commands/comment.c b/src/backend/commands/comment.c index caacb17e5d..771aba2a69 100644 --- a/src/backend/commands/comment.c +++ b/src/backend/commands/comment.c @@ -41,6 +41,7 @@ CommentObject(CommentStmt *stmt) { Relation relation; ObjectAddress address = InvalidObjectAddress; + bool missing_ok; /* * When loading a dump, we may see a COMMENT ON DATABASE for the old name @@ -63,6 +64,14 @@ CommentObject(CommentStmt *stmt) } } + /* + * During binary upgrade, allow nonexistent large objects so that we don't + * have to create them during schema restoration. pg_upgrade will + * transfer the contents of pg_largeobject_metadata via COPY or by + * copying/linking its files from the old cluster later on. + */ + missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT; + /* * Translate the parser representation that identifies this object into an * ObjectAddress. get_object_address() will throw an error if the object @@ -70,7 +79,8 @@ CommentObject(CommentStmt *stmt) * against concurrent DROP operations. */ address = get_object_address(stmt->objtype, stmt->object, - &relation, ShareUpdateExclusiveLock, false); + &relation, ShareUpdateExclusiveLock, + missing_ok); /* Require ownership of the target object. */ check_object_ownership(GetUserId(), stmt->objtype, address, diff --git a/src/backend/commands/seclabel.c b/src/backend/commands/seclabel.c index 4160f5b685..5b80396723 100644 --- a/src/backend/commands/seclabel.c +++ b/src/backend/commands/seclabel.c @@ -118,6 +118,7 @@ ExecSecLabelStmt(SecLabelStmt *stmt) ObjectAddress address; Relation relation; ListCell *lc; + bool missing_ok; /* * Find the named label provider, or if none specified, check whether @@ -159,6 +160,14 @@ ExecSecLabelStmt(SecLabelStmt *stmt) (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("security labels are not supported for this type of object"))); + /* + * During binary upgrade, allow nonexistent large objects so that we don't + * have to create them during schema restoration. pg_upgrade will + * transfer the contents of pg_largeobject_metadata via COPY or by + * copying/linking its files from the old cluster later on. + */ + missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT; + /* * Translate the parser representation which identifies this object into * an ObjectAddress. get_object_address() will throw an error if the @@ -166,7 +175,8 @@ ExecSecLabelStmt(SecLabelStmt *stmt) * guard against concurrent modifications. */ address = get_object_address(stmt->objtype, stmt->object, - &relation, ShareUpdateExclusiveLock, false); + &relation, ShareUpdateExclusiveLock, + missing_ok); /* Require ownership of the target object. 
*/ check_object_ownership(GetUserId(), stmt->objtype, address, diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 2bebefd0ba..2c3754d020 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -214,12 +214,6 @@ static int nbinaryUpgradeClassOids = 0; static SequenceItem *sequences = NULL; static int nsequences = 0; -/* - * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use - * as a dependency for pg_shdepend and any large object comments/seclabels. - */ -static DumpId lo_metadata_dumpId; - /* Maximum number of relations to fetch in a fetchAttributeStats() call. */ #define MAX_ATTR_STATS_RELS 64 @@ -1121,27 +1115,20 @@ main(int argc, char **argv) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); /* - * For binary upgrade mode, dump pg_largeobject_metadata and the - * associated pg_shdepend rows. This is faster to restore than the - * equivalent set of large object commands. We can only do this for - * upgrades from v12 and newer; in older versions, pg_largeobject_metadata - * was created WITH OIDS, so the OID column is hidden and won't be dumped. + * For binary upgrade mode, dump the pg_shdepend rows for large objects + * and maybe even pg_largeobject_metadata (see comment below for details). + * This is faster to restore than the equivalent set of large object + * commands. We can only do this for upgrades from v12 and newer; in + * older versions, pg_largeobject_metadata was created WITH OIDS, so the + * OID column is hidden and won't be dumped. */ if (dopt.binary_upgrade && fout->remoteVersion >= 120000) { - TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId); - TableInfo *shdepend = findTableByOid(SharedDependRelationId); + TableInfo *shdepend; - makeTableDataInfo(&dopt, lo_metadata); + shdepend = findTableByOid(SharedDependRelationId); makeTableDataInfo(&dopt, shdepend); - /* - * Save pg_largeobject_metadata's dump ID for use as a dependency for - * pg_shdepend and any large object comments/seclabels. - */ - lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId; - addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId); - /* * Only dump large object shdepend rows for this database. */ @@ -1150,21 +1137,19 @@ main(int argc, char **argv) " WHERE datname = current_database())"; /* - * If upgrading from v16 or newer, only dump large objects with - * comments/seclabels. For these upgrades, pg_upgrade can copy/link - * pg_largeobject_metadata's files (which is usually faster) but we - * still need to dump LOs with comments/seclabels here so that the - * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade - * can't copy/link the files from older versions because aclitem - * (needed by pg_largeobject_metadata.lomacl) changed its storage - * format in v16. + * For binary upgrades from v16 and newer versions, we can copy + * pg_largeobject_metadata's files from the old cluster, so we don't + * need to dump its contents. pg_upgrade can't copy/link the files + * from older versions because aclitem (needed by + * pg_largeobject_metadata.lomacl) changed its storage format in v16. 
*/ - if (fout->remoteVersion >= 160000) - lo_metadata->dataObj->filtercond = "WHERE oid IN " - "(SELECT objoid FROM pg_description " - "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " - "UNION SELECT objoid FROM pg_seclabel " - "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")"; + if (fout->remoteVersion < 160000) + { + TableInfo *lo_metadata; + + lo_metadata = findTableByOid(LargeObjectMetadataRelationId); + makeTableDataInfo(&dopt, lo_metadata); + } } /* @@ -3979,7 +3964,25 @@ getLOs(Archive *fout) appendPQExpBufferStr(loQry, "SELECT oid, lomowner, lomacl, " "acldefault('L', lomowner) AS acldefault " - "FROM pg_largeobject_metadata " + "FROM pg_largeobject_metadata "); + + /* + * For binary upgrades from v12 or newer, we transfer + * pg_largeobject_metadata via COPY or by copying/linking its files from + * the old cluster. On such upgrades, we only need to consider large + * objects that have comments or security labels, since we still restore + * those objects via COMMENT/SECURITY LABEL commands. + */ + if (dopt->binary_upgrade && + fout->remoteVersion >= 120000) + appendPQExpBufferStr(loQry, + "WHERE oid IN " + "(SELECT objoid FROM pg_description " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " + "UNION SELECT objoid FROM pg_seclabel " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) ") "); + + appendPQExpBufferStr(loQry, "ORDER BY lomowner, lomacl::pg_catalog.text, oid"); res = ExecuteSqlQuery(fout, loQry->data, PGRES_TUPLES_OK); @@ -4062,36 +4065,20 @@ getLOs(Archive *fout) /* * In binary-upgrade mode for LOs, we do *not* dump out the LO data, * as it will be copied by pg_upgrade, which simply copies the - * pg_largeobject table. We *do* however dump out anything but the - * data, as pg_upgrade copies just pg_largeobject, but not - * pg_largeobject_metadata, after the dump is restored. In versions - * before v12, this is done via proper large object commands. In - * newer versions, we dump the content of pg_largeobject_metadata and - * any associated pg_shdepend rows, which is faster to restore. (On - *
+ + + + range_length_histogram anyarray + + + A histogram of the lengths of non-empty and non-null range values of an + expression. (Null for non-range types.) + + + This histogram is calculated using the subtype_diff + range function regardless of whether range bounds are inclusive. + + + + + + range_empty_frac float4 + + + Fraction of expression entries whose values are empty ranges. + (Null for non-range types.) + + + + + + range_bounds_histogram anyarray + + + A histogram of lower and upper bounds of non-empty and non-null range + values. (Null for non-range types.) + + + These two histograms are represented as a single array of ranges, whose + lower bounds represent the histogram of lower bounds, and upper bounds + represent the histogram of upper bounds. + + diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 7553f31fef..1ea8f1faa9 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -363,7 +363,28 @@ CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3 WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4 WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5 - END) AS elem_count_histogram + END) AS elem_count_histogram, + (CASE + WHEN (stat.a).stakind1 = 6 THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 6 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 6 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 6 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 6 THEN (stat.a).stavalues5 + END) AS range_length_histogram, + (CASE + WHEN (stat.a).stakind1 = 6 THEN (stat.a).stanumbers1[1] + WHEN (stat.a).stakind2 = 6 THEN (stat.a).stanumbers2[1] + WHEN (stat.a).stakind3 = 6 THEN (stat.a).stanumbers3[1] + WHEN (stat.a).stakind4 = 6 THEN (stat.a).stanumbers4[1] + WHEN (stat.a).stakind5 = 6 THEN (stat.a).stanumbers5[1] + END) AS range_empty_frac, + (CASE + WHEN (stat.a).stakind1 = 7 THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 7 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 7 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 7 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 7 THEN (stat.a).stavalues5 + END) AS range_bounds_histogram FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid) LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid) LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace) diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index a09d8a6c64..a910b3d04e 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202602051 +#define CATALOG_VERSION_NO 202602101 #endif diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f4ee2bd745..f9bc213e5a 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2696,7 +2696,31 @@ pg_stats_ext_exprs| SELECT cn.nspname AS schemaname, WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4 WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5 ELSE NULL::real[] - END AS elem_count_histogram + END AS elem_count_histogram, + CASE + WHEN ((stat.a).stakind1 = 6) THEN (stat.a).stavalues1 + WHEN ((stat.a).stakind2 = 6) THEN (stat.a).stavalues2 + WHEN ((stat.a).stakind3 = 6) THEN (stat.a).stavalues3 + WHEN ((stat.a).stakind4 = 6) THEN (stat.a).stavalues4 + WHEN ((stat.a).stakind5 = 6) THEN (stat.a).stavalues5 + ELSE NULL::anyarray + END AS range_length_histogram, + CASE + 
WHEN ((stat.a).stakind1 = 6) THEN (stat.a).stanumbers1[1] + WHEN ((stat.a).stakind2 = 6) THEN (stat.a).stanumbers2[1] + WHEN ((stat.a).stakind3 = 6) THEN (stat.a).stanumbers3[1] + WHEN ((stat.a).stakind4 = 6) THEN (stat.a).stanumbers4[1] + WHEN ((stat.a).stakind5 = 6) THEN (stat.a).stanumbers5[1] + ELSE NULL::real + END AS range_empty_frac, + CASE + WHEN ((stat.a).stakind1 = 7) THEN (stat.a).stavalues1 + WHEN ((stat.a).stakind2 = 7) THEN (stat.a).stavalues2 + WHEN ((stat.a).stakind3 = 7) THEN (stat.a).stavalues3 + WHEN ((stat.a).stakind4 = 7) THEN (stat.a).stavalues4 + WHEN ((stat.a).stakind5 = 7) THEN (stat.a).stavalues5 + ELSE NULL::anyarray + END AS range_bounds_histogram FROM (((((pg_statistic_ext s JOIN pg_class c ON ((c.oid = s.stxrelid))) LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid))) diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index b2a0657913..cb8856ac50 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3628,3 +3628,30 @@ SELECT * FROM check_estimated_rows('SELECT * FROM sb_2 WHERE numeric_lt(y, 1.0)' -- Tidy up DROP TABLE sb_1, sb_2 CASCADE; +-- Check statistics generated for range type and expressions. +CREATE TABLE stats_ext_tbl_range(name text, irange int4range); +INSERT INTO stats_ext_tbl_range VALUES + ('red', '[1,7)'::int4range), + ('blue', '[2,8]'::int4range), + ('green', '[3,9)'::int4range); +CREATE STATISTICS stats_ext_range (mcv) + ON irange, (irange + '[4,10)'::int4range) + FROM stats_ext_tbl_range; +ANALYZE stats_ext_tbl_range; +SELECT attnames, most_common_vals + FROM pg_stats_ext + WHERE statistics_name = 'stats_ext_range'; + attnames | most_common_vals +----------+------------------------------------------------------------ + {irange} | {{"[1,7)","[1,10)"},{"[2,9)","[2,10)"},{"[3,9)","[3,10)"}} +(1 row) + +SELECT range_length_histogram, range_empty_frac, range_bounds_histogram + FROM pg_stats_ext_exprs + WHERE statistics_name = 'stats_ext_range'; + range_length_histogram | range_empty_frac | range_bounds_histogram +------------------------+------------------+------------------------------ + {7,8,9} | 0 | {"[1,10)","[2,10)","[3,10)"} +(1 row) + +DROP TABLE stats_ext_tbl_range; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 76ee9d29c0..9dcce3440c 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1866,3 +1866,21 @@ SELECT * FROM check_estimated_rows('SELECT * FROM sb_2 WHERE numeric_lt(y, 1.0)' -- Tidy up DROP TABLE sb_1, sb_2 CASCADE; + +-- Check statistics generated for range type and expressions. 
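+-- (In pg_stats_ext_exprs, stakind 6 carries the range length histogram in
+-- stavalues and the empty-range fraction in stanumbers[1]; stakind 7
+-- carries the bounds histogram.)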
+CREATE TABLE stats_ext_tbl_range(name text, irange int4range);
+INSERT INTO stats_ext_tbl_range VALUES
+    ('red', '[1,7)'::int4range),
+    ('blue', '[2,8]'::int4range),
+    ('green', '[3,9)'::int4range);
+CREATE STATISTICS stats_ext_range (mcv)
+    ON irange, (irange + '[4,10)'::int4range)
+    FROM stats_ext_tbl_range;
+ANALYZE stats_ext_tbl_range;
+SELECT attnames, most_common_vals
+  FROM pg_stats_ext
+  WHERE statistics_name = 'stats_ext_range';
+SELECT range_length_histogram, range_empty_frac, range_bounds_histogram
+  FROM pg_stats_ext_exprs
+  WHERE statistics_name = 'stats_ext_range';
+DROP TABLE stats_ext_tbl_range;

From f33c585774223757b01c8eddd134d364492ed94c Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Tue, 10 Feb 2026 16:59:19 +0900
Subject: [PATCH 072/147] Simplify some log messages in extended_stats_funcs.c

The log messages used in this file applied too much quoting logic:
- No need for quote_identifier(), which it is fine not to use in the
  context of a log entry.
- The usual project style is to group the namespace and object together
  in a quoted string when mentioned in a log message.  This code quoted
  the namespace name and the extended statistics object name separately,
  which was confusing.

Reported-by: Kyotaro Horiguchi
Discussion: https://postgr.es/m/20260210.143752.1113524465620875233.horikyota.ntt@gmail.com
---
 src/backend/statistics/extended_stats_funcs.c | 32 ++++++++-----------
 src/test/regress/expected/stats_import.out    | 14 ++++----
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/src/backend/statistics/extended_stats_funcs.c b/src/backend/statistics/extended_stats_funcs.c
index b640941a9c..479f74652b 100644
--- a/src/backend/statistics/extended_stats_funcs.c
+++ b/src/backend/statistics/extended_stats_funcs.c
@@ -347,9 +347,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
 	{
 		ereport(WARNING,
 				errcode(ERRCODE_UNDEFINED_OBJECT),
-				errmsg("could not find extended statistics object \"%s\".\"%s\"",
-					   quote_identifier(nspname),
-					   quote_identifier(stxname)));
+				errmsg("could not find extended statistics object \"%s.%s\"",
+					   nspname, stxname));
 		success = false;
 		goto cleanup;
 	}
@@ -364,11 +363,9 @@ extended_statistics_update(FunctionCallInfo fcinfo)
 	{
 		ereport(WARNING,
 				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				errmsg("could not restore extended statistics object \"%s\".\"%s\": incorrect relation \"%s\".\"%s\" specified",
-					   quote_identifier(nspname),
-					   quote_identifier(stxname),
-					   quote_identifier(relnspname),
-					   quote_identifier(relname)));
+				errmsg("could not restore extended statistics object \"%s.%s\": incorrect relation \"%s.%s\" specified",
+					   nspname, stxname,
+					   relnspname, relname));
 
 		success = false;
 		goto cleanup;
@@ -420,9 +417,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
 					errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					errmsg("cannot specify parameter \"%s\"",
 						   extarginfo[NDISTINCT_ARG].argname),
-					errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.",
-							quote_identifier(nspname),
-							quote_identifier(stxname)));
+					errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.",
+							nspname, stxname));
 
 			has.ndistinct = false;
 			success = false;
@@ -438,9 +434,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
 					errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					errmsg("cannot specify parameter \"%s\"",
 						   extarginfo[DEPENDENCIES_ARG].argname),
-					errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.",
-							quote_identifier(nspname),
-							quote_identifier(stxname)));
+
errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.", + nspname, stxname)); has.dependencies = false; success = false; } @@ -463,9 +458,8 @@ extended_statistics_update(FunctionCallInfo fcinfo) extarginfo[MOST_COMMON_VALS_ARG].argname, extarginfo[MOST_COMMON_FREQS_ARG].argname, extarginfo[MOST_COMMON_BASE_FREQS_ARG].argname), - errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.", - quote_identifier(nspname), - quote_identifier(stxname))); + errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.", + nspname, stxname)); has.mcv = false; success = false; @@ -888,7 +882,7 @@ pg_clear_extended_stats(PG_FUNCTION_ARGS) table_close(pg_stext, RowExclusiveLock); ereport(WARNING, errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("could not find extended statistics object \"%s\".\"%s\"", + errmsg("could not find extended statistics object \"%s.%s\"", nspname, stxname)); PG_RETURN_VOID(); } @@ -904,7 +898,7 @@ pg_clear_extended_stats(PG_FUNCTION_ARGS) table_close(pg_stext, RowExclusiveLock); ereport(WARNING, errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not clear extended statistics object \"%s\".\"%s\": incorrect relation \"%s\".\"%s\" specified", + errmsg("could not clear extended statistics object \"%s.%s\": incorrect relation \"%s.%s\" specified", get_namespace_name(nspoid), stxname, relnspname, relname)); PG_RETURN_VOID(); diff --git a/src/test/regress/expected/stats_import.out b/src/test/regress/expected/stats_import.out index 37131f9cea..d6cc701500 100644 --- a/src/test/regress/expected/stats_import.out +++ b/src/test/regress/expected/stats_import.out @@ -1481,7 +1481,7 @@ SELECT pg_clear_extended_stats(schemaname => 'stats_import', statistics_schemaname => 'stats_import', statistics_name => 'ext_stats_not_exist', inherited => false); -WARNING: could not find extended statistics object "stats_import"."ext_stats_not_exist" +WARNING: could not find extended statistics object "stats_import.ext_stats_not_exist" pg_clear_extended_stats ------------------------- @@ -1493,7 +1493,7 @@ SELECT pg_clear_extended_stats(schemaname => 'stats_import', statistics_schemaname => 'stats_import', statistics_name => 'test_stat_clone', inherited => false); -WARNING: could not clear extended statistics object "stats_import"."test_stat_clone": incorrect relation "stats_import"."test" specified +WARNING: could not clear extended statistics object "stats_import.test_stat_clone": incorrect relation "stats_import.test" specified pg_clear_extended_stats ------------------------- @@ -1678,7 +1678,7 @@ SELECT pg_catalog.pg_restore_extended_stats( 'statistics_schemaname', 'stats_import', 'statistics_name', 'ext_stats_not_exist', 'inherited', false); -WARNING: could not find extended statistics object "stats_import"."ext_stats_not_exist" +WARNING: could not find extended statistics object "stats_import.ext_stats_not_exist" pg_restore_extended_stats --------------------------- f @@ -1691,7 +1691,7 @@ SELECT pg_catalog.pg_restore_extended_stats( 'statistics_schemaname', 'stats_import', 'statistics_name', 'test_stat_clone', 'inherited', false); -WARNING: could not restore extended statistics object "stats_import"."test_stat_clone": incorrect relation "stats_import"."test" specified +WARNING: could not restore extended statistics object "stats_import.test_stat_clone": incorrect relation "stats_import.test" specified pg_restore_extended_stats --------------------------- f @@ -1762,7 +1762,7 @@ SELECT 
pg_catalog.pg_restore_extended_stats( 'inherited', false, 'n_distinct', '[{"attributes" : [1,3], "ndistinct" : 4}]'::pg_ndistinct); WARNING: cannot specify parameter "n_distinct" -HINT: Extended statistics object "stats_import"."test_stat_dependencies" does not support statistics of this type. +HINT: Extended statistics object "stats_import.test_stat_dependencies" does not support statistics of this type. pg_restore_extended_stats --------------------------- f @@ -1778,7 +1778,7 @@ SELECT pg_catalog.pg_restore_extended_stats( 'dependencies', '[{"attributes": [2], "dependency": 3, "degree": 1.000000}, {"attributes": [3], "dependency": 2, "degree": 1.000000}]'::pg_dependencies); WARNING: cannot specify parameter "dependencies" -HINT: Extended statistics object "stats_import"."test_stat_ndistinct" does not support statistics of this type. +HINT: Extended statistics object "stats_import.test_stat_ndistinct" does not support statistics of this type. pg_restore_extended_stats --------------------------- f @@ -1966,7 +1966,7 @@ SELECT pg_catalog.pg_restore_extended_stats( 'most_common_freqs', '{0.25,0.25,0.25,0.25}'::double precision[], 'most_common_base_freqs', '{0.0625,0.0625,0.0625,0.0625}'::double precision[]); WARNING: cannot specify parameters "most_common_vals", "most_common_freqs" or "most_common_base_freqs" -HINT: Extended statistics object "stats_import"."test_stat_dependencies" does not support statistics of this type. +HINT: Extended statistics object "stats_import.test_stat_dependencies" does not support statistics of this type. pg_restore_extended_stats --------------------------- f From ddc3250208bd5980a25b0421d607bae202fef06c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 Feb 2026 16:23:05 +0200 Subject: [PATCH 073/147] Use ProcNumber rather than pid in ReplicationSlot This helps the next commit. Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/4cc13ba1-4248-4884-b6ba-4805349e7f39@iki.fi --- src/backend/replication/logical/slotsync.c | 2 +- src/backend/replication/slot.c | 63 ++++++++++++---------- src/backend/replication/slotfuncs.c | 13 ++--- src/include/replication/slot.h | 7 ++- 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index af5682ce50..d02d44d26a 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -1757,7 +1757,7 @@ update_synced_slots_inactive_since(void) Assert(SlotIsLogical(s)); /* The slot must not be acquired by any process */ - Assert(s->active_pid == 0); + Assert(s->active_proc == INVALID_PROC_NUMBER); /* Use the same inactive_since time for all the slots. */ if (now == 0) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 4c47261c7f..d5628d6211 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -226,6 +226,7 @@ ReplicationSlotsShmemInit(void) ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i]; /* everything else is zeroed by the memset above */ + slot->active_proc = INVALID_PROC_NUMBER; SpinLockInit(&slot->mutex); LWLockInitialize(&slot->io_in_progress_lock, LWTRANCHE_REPLICATION_SLOT_IO); @@ -461,7 +462,7 @@ ReplicationSlotCreate(const char *name, bool db_specific, * be doing that. So it's safe to initialize the slot. 
*/ Assert(!slot->in_use); - Assert(slot->active_pid == 0); + Assert(slot->active_proc == INVALID_PROC_NUMBER); /* first initialize persistent data */ memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData)); @@ -505,8 +506,8 @@ ReplicationSlotCreate(const char *name, bool db_specific, /* We can now mark the slot active, and that makes it our slot. */ SpinLockAcquire(&slot->mutex); - Assert(slot->active_pid == 0); - slot->active_pid = MyProcPid; + Assert(slot->active_proc == INVALID_PROC_NUMBER); + slot->active_proc = MyProcNumber; SpinLockRelease(&slot->mutex); MyReplicationSlot = slot; @@ -620,6 +621,7 @@ void ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) { ReplicationSlot *s; + ProcNumber active_proc; int active_pid; Assert(name != NULL); @@ -672,17 +674,18 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) * to inactive_since in InvalidatePossiblyObsoleteSlot. */ SpinLockAcquire(&s->mutex); - if (s->active_pid == 0) - s->active_pid = MyProcPid; - active_pid = s->active_pid; + if (s->active_proc == INVALID_PROC_NUMBER) + s->active_proc = MyProcNumber; + active_proc = s->active_proc; ReplicationSlotSetInactiveSince(s, 0, false); SpinLockRelease(&s->mutex); } else { - s->active_pid = active_pid = MyProcPid; + s->active_proc = active_proc = MyProcNumber; ReplicationSlotSetInactiveSince(s, 0, true); } + active_pid = GetPGProcByNumber(active_proc)->pid; LWLockRelease(ReplicationSlotControlLock); /* @@ -690,7 +693,7 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) * wait until the owning process signals us that it's been released, or * error out. */ - if (active_pid != MyProcPid) + if (active_proc != MyProcNumber) { if (!nowait) { @@ -762,7 +765,7 @@ ReplicationSlotRelease(void) bool is_logical; TimestampTz now = 0; - Assert(slot != NULL && slot->active_pid != 0); + Assert(slot != NULL && slot->active_proc != INVALID_PROC_NUMBER); is_logical = SlotIsLogical(slot); @@ -815,7 +818,7 @@ ReplicationSlotRelease(void) * disconnecting, but wake up others that may be waiting for it. */ SpinLockAcquire(&slot->mutex); - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; ReplicationSlotSetInactiveSince(slot, now, false); SpinLockRelease(&slot->mutex); ConditionVariableBroadcast(&slot->active_cv); @@ -877,7 +880,7 @@ ReplicationSlotCleanup(bool synced_only) found_valid_logicalslot |= (SlotIsLogical(s) && s->data.invalidated == RS_INVAL_NONE); - if ((s->active_pid == MyProcPid && + if ((s->active_proc == MyProcNumber && (!synced_only || s->data.synced))) { Assert(s->data.persistency == RS_TEMPORARY); @@ -1088,7 +1091,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) bool fail_softly = slot->data.persistency != RS_PERSISTENT; SpinLockAcquire(&slot->mutex); - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; SpinLockRelease(&slot->mutex); /* wake up anyone waiting on this slot */ @@ -1110,7 +1113,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) * Also wake up processes waiting for it. 
*/ LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; slot->in_use = false; LWLockRelease(ReplicationSlotControlLock); ConditionVariableBroadcast(&slot->active_cv); @@ -1476,7 +1479,7 @@ ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive) /* count slots with spinlock held */ SpinLockAcquire(&s->mutex); (*nslots)++; - if (s->active_pid != 0) + if (s->active_proc != INVALID_PROC_NUMBER) (*nactive)++; SpinLockRelease(&s->mutex); } @@ -1520,7 +1523,7 @@ ReplicationSlotsDropDBSlots(Oid dboid) { ReplicationSlot *s; char *slotname; - int active_pid; + ProcNumber active_proc; s = &ReplicationSlotCtl->replication_slots[i]; @@ -1550,11 +1553,11 @@ ReplicationSlotsDropDBSlots(Oid dboid) SpinLockAcquire(&s->mutex); /* can't change while ReplicationSlotControlLock is held */ slotname = NameStr(s->data.name); - active_pid = s->active_pid; - if (active_pid == 0) + active_proc = s->active_proc; + if (active_proc == INVALID_PROC_NUMBER) { MyReplicationSlot = s; - s->active_pid = MyProcPid; + s->active_proc = MyProcNumber; } SpinLockRelease(&s->mutex); @@ -1579,11 +1582,11 @@ ReplicationSlotsDropDBSlots(Oid dboid) * XXX: We can consider shutting down the slot sync worker before * trying to drop synced temporary slots here. */ - if (active_pid) + if (active_proc != INVALID_PROC_NUMBER) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("replication slot \"%s\" is active for PID %d", - slotname, active_pid))); + slotname, GetPGProcByNumber(active_proc)->pid))); /* * To avoid duplicating ReplicationSlotDropAcquired() and to avoid @@ -1974,6 +1977,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, { XLogRecPtr restart_lsn; NameData slotname; + ProcNumber active_proc; int active_pid = 0; ReplicationSlotInvalidationCause invalidation_cause = RS_INVAL_NONE; TimestampTz now = 0; @@ -2027,7 +2031,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, } slotname = s->data.name; - active_pid = s->active_pid; + active_proc = s->active_proc; /* * If the slot can be acquired, do so and mark it invalidated @@ -2039,10 +2043,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * is terminated. So, the inactive slot can only be invalidated * immediately without being terminated. 
*/ - if (active_pid == 0) + if (active_proc == INVALID_PROC_NUMBER) { MyReplicationSlot = s; - s->active_pid = MyProcPid; + s->active_proc = MyProcNumber; s->data.invalidated = invalidation_cause; /* @@ -2058,6 +2062,11 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, /* Let caller know */ invalidated = true; } + else + { + active_pid = GetPGProcByNumber(active_proc)->pid; + Assert(active_pid != 0); + } SpinLockRelease(&s->mutex); @@ -2073,7 +2082,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, &slot_idle_usecs); } - if (active_pid != 0) + if (active_proc != INVALID_PROC_NUMBER) { /* * Prepare the sleep on the slot's condition variable before @@ -2107,7 +2116,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, if (MyBackendType == B_STARTUP) (void) SendProcSignal(active_pid, PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT, - INVALID_PROC_NUMBER); + active_proc); else (void) kill(active_pid, SIGTERM); @@ -2875,7 +2884,7 @@ RestoreSlotFromDisk(const char *name) slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; /* * Set the time since the slot has become inactive after loading the @@ -3158,7 +3167,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) SpinLockAcquire(&slot->mutex); restart_lsn = slot->data.restart_lsn; invalidated = slot->data.invalidated != RS_INVAL_NONE; - inactive = slot->active_pid == 0; + inactive = slot->active_proc == INVALID_PROC_NUMBER; SpinLockRelease(&slot->mutex); if (invalidated) diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 1ed2d80c2d..9f5e4f998f 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -20,6 +20,7 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/slotsync.h" +#include "storage/proc.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/pg_lsn.h" @@ -309,10 +310,10 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) values[i++] = ObjectIdGetDatum(slot_contents.data.database); values[i++] = BoolGetDatum(slot_contents.data.persistency == RS_TEMPORARY); - values[i++] = BoolGetDatum(slot_contents.active_pid != 0); + values[i++] = BoolGetDatum(slot_contents.active_proc != INVALID_PROC_NUMBER); - if (slot_contents.active_pid != 0) - values[i++] = Int32GetDatum(slot_contents.active_pid); + if (slot_contents.active_proc != INVALID_PROC_NUMBER) + values[i++] = Int32GetDatum(GetPGProcByNumber(slot_contents.active_proc)->pid); else nulls[i++] = true; @@ -377,13 +378,13 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) */ if (XLogRecPtrIsValid(slot_contents.data.restart_lsn)) { - int pid; + ProcNumber procno; SpinLockAcquire(&slot->mutex); - pid = slot->active_pid; + procno = slot->active_proc; slot_contents.data.restart_lsn = slot->data.restart_lsn; SpinLockRelease(&slot->mutex); - if (pid != 0) + if (procno != INVALID_PROC_NUMBER) { values[i++] = CStringGetTextDatum("unreserved"); walstate = WALAVAIL_UNRESERVED; diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index f465e430cc..72f8be629f 100644 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -185,8 +185,11 @@ typedef struct ReplicationSlot /* is this slot defined */ bool in_use; - /* Who is streaming out changes for this slot? 0 in unused slots. */ - pid_t active_pid; + /* + * Who is streaming out changes for this slot? INVALID_PROC_NUMBER in + * unused slots. 
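+	 *
+	 * To report a PID for this slot, resolve the proc number through the
+	 * PGPROC array, e.g. GetPGProcByNumber(slot->active_proc)->pid, as
+	 * pg_get_replication_slots() now does.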
+ */ + ProcNumber active_proc; /* any outstanding modifications? */ bool just_dirtied; From 17f51ea818753093f929b4c235f3b89ebcc7c5fb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 Feb 2026 16:23:08 +0200 Subject: [PATCH 074/147] Separate RecoveryConflictReasons from procsignals Share the same PROCSIG_RECOVERY_CONFLICT flag for all recovery conflict reasons. To distinguish, have a bitmask in PGPROC to indicate the reason(s). Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/4cc13ba1-4248-4884-b6ba-4805349e7f39@iki.fi --- src/backend/commands/dbcommands.c | 1 + src/backend/commands/tablespace.c | 1 + src/backend/replication/logical/logicalctl.c | 1 + src/backend/replication/slot.c | 6 +- src/backend/storage/buffer/bufmgr.c | 5 +- src/backend/storage/ipc/procarray.c | 136 +++++++++++++------ src/backend/storage/ipc/procsignal.c | 22 +-- src/backend/storage/ipc/standby.c | 61 ++++----- src/backend/storage/lmgr/proc.c | 5 +- src/backend/tcop/postgres.c | 117 ++++++++-------- src/backend/utils/activity/pgstat_database.c | 18 +-- src/backend/utils/adt/mcxtfuncs.c | 1 + src/include/storage/proc.h | 10 ++ src/include/storage/procarray.h | 7 +- src/include/storage/procsignal.h | 16 +-- src/include/storage/standby.h | 34 ++++- src/include/tcop/tcopprot.h | 2 +- src/tools/pgindent/typedefs.list | 1 + 18 files changed, 258 insertions(+), 186 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 87949054f2..33311760df 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -60,6 +60,7 @@ #include "storage/lmgr.h" #include "storage/md.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 0b06489193..3511a4ec0f 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -70,6 +70,7 @@ #include "miscadmin.h" #include "postmaster/bgwriter.h" #include "storage/fd.h" +#include "storage/procsignal.h" #include "storage/standby.h" #include "utils/acl.h" #include "utils/builtins.h" diff --git a/src/backend/replication/logical/logicalctl.c b/src/backend/replication/logical/logicalctl.c index 9f787f3dc5..4e29295120 100644 --- a/src/backend/replication/logical/logicalctl.c +++ b/src/backend/replication/logical/logicalctl.c @@ -71,6 +71,7 @@ #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "utils/injection_point.h" /* diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index d5628d6211..28c7019402 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -2114,9 +2114,9 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, slot_idle_secs); if (MyBackendType == B_STARTUP) - (void) SendProcSignal(active_pid, - PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT, - active_proc); + (void) SignalRecoveryConflict(GetPGProcByNumber(active_proc), + active_pid, + RECOVERY_CONFLICT_LOGICALSLOT); else (void) kill(active_pid, SIGTERM); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 7241477cac..d1babaff02 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -59,6 +59,7 @@ #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/proclist.h" +#include "storage/procsignal.h" #include 
"storage/read_stream.h" #include "storage/smgr.h" #include "storage/standby.h" @@ -6570,7 +6571,7 @@ LockBufferForCleanup(Buffer buffer) * deadlock_timeout for it. */ if (logged_recovery_conflict) - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN, waitStart, GetCurrentTimestamp(), NULL, false); @@ -6621,7 +6622,7 @@ LockBufferForCleanup(Buffer buffer) if (TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout)) { - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN, waitStart, now, NULL, true); logged_recovery_conflict = true; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 301f54fb5a..40312df2ca 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -60,6 +60,7 @@ #include "port/pg_lfind.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/injection_point.h" @@ -708,6 +709,8 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; + pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); + /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -748,6 +751,8 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; + pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); + /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->statusFlags & PROC_VACUUM_STATE_MASK) @@ -929,6 +934,7 @@ ProcArrayClearTransaction(PGPROC *proc) proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; + pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); Assert(!proc->delayChkptFlags); @@ -3440,12 +3446,46 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) } /* - * SignalVirtualTransaction - used in recovery conflict processing + * SignalRecoveryConflict -- signal that a process is blocking recovery * - * Returns pid of the process signaled, or 0 if not found. + * The 'pid' is redundant with 'proc', but it acts as a cross-check to + * detect process had exited and the PGPROC entry was reused for a different + * process. + * + * Returns true if the process was signaled, or false if not found. */ -pid_t -SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) +bool +SignalRecoveryConflict(PGPROC *proc, pid_t pid, RecoveryConflictReason reason) +{ + bool found = false; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + /* + * Kill the pid if it's still here. If not, that's what we wanted so + * ignore any errors. 
+	 */
+	if (proc->pid == pid)
+	{
+		(void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason));
+
+		/* wake up the process */
+		(void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, GetNumberFromPGProc(proc));
+		found = true;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return found;
+}
+
+/*
+ * SignalRecoveryConflictWithVirtualXID -- signal that a VXID is blocking recovery
+ *
+ * Like SignalRecoveryConflict, but the target is identified by VXID
+ */
+bool
+SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflictReason reason)
 {
 	ProcArrayStruct *arrayP = procArray;
 	int			index;
@@ -3467,11 +3507,13 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
 			pid = proc->pid;
 			if (pid != 0)
 			{
+				(void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason));
+
 				/*
 				 * Kill the pid if it's still here. If not, that's what we
 				 * wanted so ignore any errors.
 				 */
-				(void) SendProcSignal(pid, sigmode, vxid.procNumber);
+				(void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, vxid.procNumber);
 			}
 			break;
 		}
@@ -3479,7 +3521,50 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
 
 	LWLockRelease(ProcArrayLock);
 
-	return pid;
+	return pid != 0;
+}
+
+/*
+ * SignalRecoveryConflictWithDatabase --- signal all backends using the
+ * specified database
+ *
+ * Like SignalRecoveryConflict, but signals all backends using the database.
+ */
+void
+SignalRecoveryConflictWithDatabase(Oid databaseid, RecoveryConflictReason reason)
+{
+	ProcArrayStruct *arrayP = procArray;
+	int			index;
+
+	/* tell all backends to die */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+	for (index = 0; index < arrayP->numProcs; index++)
+	{
+		int			pgprocno = arrayP->pgprocnos[index];
+		PGPROC	   *proc = &allProcs[pgprocno];
+
+		if (databaseid == InvalidOid || proc->databaseId == databaseid)
+		{
+			VirtualTransactionId procvxid;
+			pid_t		pid;
+
+			GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+			pid = proc->pid;
+			if (pid != 0)
+			{
+				(void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason));
+
+				/*
+				 * Kill the pid if it's still here. If not, that's what we
+				 * wanted so ignore any errors.
+				 */
+				(void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, procvxid.procNumber);
+			}
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+}
 
 /*
@@ -3601,45 +3686,6 @@ CountDBConnections(Oid databaseid)
 	return count;
 }
 
-/*
- * CancelDBBackends --- cancel backends that are using specified database
- */
-void
-CancelDBBackends(Oid databaseid, ProcSignalReason sigmode)
-{
-	ProcArrayStruct *arrayP = procArray;
-	int			index;
-
-	/* tell all backends to die */
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-	for (index = 0; index < arrayP->numProcs; index++)
-	{
-		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
-
-		if (databaseid == InvalidOid || proc->databaseId == databaseid)
-		{
-			VirtualTransactionId procvxid;
-			pid_t		pid;
-
-			GET_VXID_FROM_PGPROC(procvxid, *proc);
-
-			pid = proc->pid;
-			if (pid != 0)
-			{
-				/*
-				 * Kill the pid if it's still here. If not, that's what we
-				 * wanted so ignore any errors.
- */ - (void) SendProcSignal(pid, sigmode, procvxid.procNumber); - } - } - } - - LWLockRelease(ProcArrayLock); -} - /* * CountUserBackends --- count backends that are used by specified user * (only regular backends, not any type of background worker) diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 8e56922dce..5d33559926 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -697,26 +697,8 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) HandleParallelApplyMessageInterrupt(); - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT)) + HandleRecoveryConflictInterrupt(); SetLatch(MyLatch); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 6db803476c..0851789e8b 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -71,13 +71,13 @@ static volatile sig_atomic_t got_standby_delay_timeout = false; static volatile sig_atomic_t got_standby_lock_timeout = false; static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, - ProcSignalReason reason, + RecoveryConflictReason reason, uint32 wait_event_info, bool report_waiting); -static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason); +static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason); static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts); static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks); -static const char *get_recovery_conflict_desc(ProcSignalReason reason); +static const char *get_recovery_conflict_desc(RecoveryConflictReason reason); /* * InitRecoveryTransactionEnvironment @@ -271,7 +271,7 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info) * to be resolved or not. */ void -LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, +LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting) { @@ -358,7 +358,8 @@ LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, */ static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, - ProcSignalReason reason, uint32 wait_event_info, + RecoveryConflictReason reason, + uint32 wait_event_info, bool report_waiting) { TimestampTz waitStart = 0; @@ -384,19 +385,19 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, /* Is it time to kill it? 
*/ if (WaitExceedsMaxStandbyDelay(wait_event_info)) { - pid_t pid; + bool signaled; /* * Now find out who to throw out of the balloon. */ Assert(VirtualTransactionIdIsValid(*waitlist)); - pid = SignalVirtualTransaction(*waitlist, reason); + signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason); /* * Wait a little bit for it to die so that we avoid flooding * an unresponsive backend when system is heavily loaded. */ - if (pid != 0) + if (signaled) pg_usleep(5000L); } @@ -489,7 +490,7 @@ ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, backends = GetConflictingVirtualXIDs(snapshotConflictHorizon, locator.dbOid); ResolveRecoveryConflictWithVirtualXIDs(backends, - PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, + RECOVERY_CONFLICT_SNAPSHOT, WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT, true); @@ -560,7 +561,7 @@ ResolveRecoveryConflictWithTablespace(Oid tsid) temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, InvalidOid); ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, - PROCSIG_RECOVERY_CONFLICT_TABLESPACE, + RECOVERY_CONFLICT_TABLESPACE, WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE, true); } @@ -581,7 +582,7 @@ ResolveRecoveryConflictWithDatabase(Oid dbid) */ while (CountDBBackends(dbid) > 0) { - CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE); + SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE); /* * Wait awhile for them to die so that we avoid flooding an @@ -665,7 +666,7 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) * because the caller, WaitOnLock(), has already reported that. */ ResolveRecoveryConflictWithVirtualXIDs(backends, - PROCSIG_RECOVERY_CONFLICT_LOCK, + RECOVERY_CONFLICT_LOCK, PG_WAIT_LOCK | locktag.locktag_type, false); } @@ -723,8 +724,8 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) */ while (VirtualTransactionIdIsValid(*backends)) { - SignalVirtualTransaction(*backends, - PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + (void) SignalRecoveryConflictWithVirtualXID(*backends, + RECOVERY_CONFLICT_STARTUP_DEADLOCK); backends++; } @@ -802,7 +803,7 @@ ResolveRecoveryConflictWithBufferPin(void) /* * We're already behind, so clear a path as quickly as possible. */ - SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN); } else { @@ -842,7 +843,7 @@ ResolveRecoveryConflictWithBufferPin(void) ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP); if (got_standby_delay_timeout) - SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN); else if (got_standby_deadlock_timeout) { /* @@ -858,7 +859,7 @@ ResolveRecoveryConflictWithBufferPin(void) * not be so harmful because the period that the buffer is kept pinned * is basically no so long. But we should fix this? 
*/ - SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_STARTUP_DEADLOCK); } /* @@ -873,10 +874,10 @@ ResolveRecoveryConflictWithBufferPin(void) } static void -SendRecoveryConflictWithBufferPin(ProcSignalReason reason) +SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason) { - Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || - reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + Assert(reason == RECOVERY_CONFLICT_BUFFERPIN || + reason == RECOVERY_CONFLICT_STARTUP_DEADLOCK); /* * We send signal to all backends to ask them if they are holding the @@ -884,7 +885,7 @@ SendRecoveryConflictWithBufferPin(ProcSignalReason reason) * innocent, but we let the SIGUSR1 handling in each backend decide their * own fate. */ - CancelDBBackends(InvalidOid, reason); + SignalRecoveryConflictWithDatabase(InvalidOid, reason); } /* @@ -1489,35 +1490,33 @@ LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, /* Return the description of recovery conflict */ static const char * -get_recovery_conflict_desc(ProcSignalReason reason) +get_recovery_conflict_desc(RecoveryConflictReason reason) { const char *reasonDesc = _("unknown reason"); switch (reason) { - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN: reasonDesc = _("recovery conflict on buffer pin"); break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_LOCK: reasonDesc = _("recovery conflict on lock"); break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_TABLESPACE: reasonDesc = _("recovery conflict on tablespace"); break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + case RECOVERY_CONFLICT_SNAPSHOT: reasonDesc = _("recovery conflict on snapshot"); break; - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: reasonDesc = _("recovery conflict on replication slot"); break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: reasonDesc = _("recovery conflict on buffer deadlock"); break; - case PROCSIG_RECOVERY_CONFLICT_DATABASE: + case RECOVERY_CONFLICT_DATABASE: reasonDesc = _("recovery conflict on database"); break; - default: - break; } return reasonDesc; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 8560a903bc..31ccdb1ef8 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -504,6 +504,7 @@ InitProcess(void) Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); } #endif + pg_atomic_write_u32(&MyProc->pendingRecoveryConflicts, 0); /* Initialize fields for sync rep */ MyProc->waitLSN = InvalidXLogRecPtr; @@ -1445,7 +1446,7 @@ ProcSleep(LOCALLOCK *locallock) * because the startup process here has already waited * longer than deadlock_timeout. */ - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + LogRecoveryConflict(RECOVERY_CONFLICT_LOCK, standbyWaitStart, now, cnt > 0 ? vxids : NULL, true); logged_recovery_conflict = true; @@ -1686,7 +1687,7 @@ ProcSleep(LOCALLOCK *locallock) * startup process waited longer than deadlock_timeout for it. 
*/ if (InHotStandby && logged_recovery_conflict) - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + LogRecoveryConflict(RECOVERY_CONFLICT_LOCK, standbyWaitStart, GetCurrentTimestamp(), NULL, false); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 02e9aaa6bc..664161886c 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -67,6 +67,7 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/sinval.h" +#include "storage/standby.h" #include "tcop/backend_startup.h" #include "tcop/fastpath.h" #include "tcop/pquery.h" @@ -155,10 +156,6 @@ static const char *userDoption = NULL; /* -D switch */ static bool EchoQuery = false; /* -E switch */ static bool UseSemiNewlineNewline = false; /* -j switch */ -/* whether or not, and why, we were canceled by conflict with recovery */ -static volatile sig_atomic_t RecoveryConflictPending = false; -static volatile sig_atomic_t RecoveryConflictPendingReasons[NUM_PROCSIGNALS]; - /* reused buffer to pass to SendRowDescriptionMessage() */ static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; @@ -2537,34 +2534,31 @@ errdetail_params(ParamListInfo params) * Add an errdetail() line showing conflict source. */ static int -errdetail_recovery_conflict(ProcSignalReason reason) +errdetail_recovery_conflict(RecoveryConflictReason reason) { switch (reason) { - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN: errdetail("User was holding shared buffer pin for too long."); break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_LOCK: errdetail("User was holding a relation lock for too long."); break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_TABLESPACE: errdetail("User was or might have been using tablespace that must be dropped."); break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + case RECOVERY_CONFLICT_SNAPSHOT: errdetail("User query might have needed to see row versions that must be removed."); break; - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: errdetail("User was using a logical replication slot that must be invalidated."); break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: errdetail("User transaction caused buffer deadlock with recovery."); break; - case PROCSIG_RECOVERY_CONFLICT_DATABASE: + case RECOVERY_CONFLICT_DATABASE: errdetail("User was connected to a database that must be dropped."); break; - default: - break; - /* no errdetail */ } return 0; @@ -3067,15 +3061,14 @@ FloatExceptionHandler(SIGNAL_ARGS) } /* - * Tell the next CHECK_FOR_INTERRUPTS() to check for a particular type of - * recovery conflict. Runs in a SIGUSR1 handler. + * Tell the next CHECK_FOR_INTERRUPTS() to process recovery conflicts. Runs + * in a SIGUSR1 handler. */ void -HandleRecoveryConflictInterrupt(ProcSignalReason reason) +HandleRecoveryConflictInterrupt(void) { - RecoveryConflictPendingReasons[reason] = true; - RecoveryConflictPending = true; - InterruptPending = true; + if (pg_atomic_read_u32(&MyProc->pendingRecoveryConflicts) != 0) + InterruptPending = true; /* latch will be set by procsignal_sigusr1_handler */ } @@ -3083,11 +3076,11 @@ HandleRecoveryConflictInterrupt(ProcSignalReason reason) * Check one individual conflict reason. 
*/ static void -ProcessRecoveryConflictInterrupt(ProcSignalReason reason) +ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason) { switch (reason) { - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: /* * If we aren't waiting for a lock we can never deadlock. @@ -3098,21 +3091,20 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) /* Intentional fall through to check wait for pin */ /* FALLTHROUGH */ - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN: /* - * If PROCSIG_RECOVERY_CONFLICT_BUFFERPIN is requested but we - * aren't blocking the Startup process there is nothing more to - * do. + * If RECOVERY_CONFLICT_BUFFERPIN is requested but we aren't + * blocking the Startup process there is nothing more to do. * - * When PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK is requested, - * if we're waiting for locks and the startup process is not - * waiting for buffer pin (i.e., also waiting for locks), we set - * the flag so that ProcSleep() will check for deadlocks. + * When RECOVERY_CONFLICT_STARTUP_DEADLOCK is requested, if we're + * waiting for locks and the startup process is not waiting for + * buffer pin (i.e., also waiting for locks), we set the flag so + * that ProcSleep() will check for deadlocks. */ if (!HoldingBufferPinThatDelaysRecovery()) { - if (reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK && + if (reason == RECOVERY_CONFLICT_STARTUP_DEADLOCK && GetStartupBufferPinWaitBufId() < 0) CheckDeadLockAlert(); return; @@ -3121,9 +3113,9 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) /* Intentional fall through to error handling */ /* FALLTHROUGH */ - case PROCSIG_RECOVERY_CONFLICT_LOCK: - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + case RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_SNAPSHOT: /* * If we aren't in a transaction any longer then ignore. @@ -3133,34 +3125,34 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) /* FALLTHROUGH */ - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: /* * If we're not in a subtransaction then we are OK to throw an * ERROR to resolve the conflict. Otherwise drop through to the * FATAL case. * - * PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT is a special case that - * always throws an ERROR (ie never promotes to FATAL), though it - * still has to respect QueryCancelHoldoffCount, so it shares this - * code path. Logical decoding slots are only acquired while + * RECOVERY_CONFLICT_LOGICALSLOT is a special case that always + * throws an ERROR (ie never promotes to FATAL), though it still + * has to respect QueryCancelHoldoffCount, so it shares this code + * path. Logical decoding slots are only acquired while * performing logical decoding. During logical decoding no user * controlled code is run. During [sub]transaction abort, the * slot is released. Therefore user controlled code cannot * intercept an error before the replication slot is released. 
 *
 			 * XXX other times that we can throw just an ERROR *may* be
-			 * PROCSIG_RECOVERY_CONFLICT_LOCK if no locks are held in parent
+			 * RECOVERY_CONFLICT_LOCK if no locks are held in parent
 			 * transactions
 			 *
-			 * PROCSIG_RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by
-			 * parent transactions and the transaction is not
-			 * transaction-snapshot mode
+			 * RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by parent
+			 * transactions and the transaction is not transaction-snapshot
+			 * mode
 			 *
-			 * PROCSIG_RECOVERY_CONFLICT_TABLESPACE if no temp files or
-			 * cursors open in parent transactions
+			 * RECOVERY_CONFLICT_TABLESPACE if no temp files or cursors open
+			 * in parent transactions
 			 */
-			if (reason == PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT ||
+			if (reason == RECOVERY_CONFLICT_LOGICALSLOT ||
 				!IsSubTransaction())
 			{
 				/*
@@ -3187,8 +3179,7 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason)
 					 * Re-arm and defer this interrupt until later. See
 					 * similar code in ProcessInterrupts().
 					 */
-					RecoveryConflictPendingReasons[reason] = true;
-					RecoveryConflictPending = true;
+					(void) pg_atomic_fetch_or_u32(&MyProc->pendingRecoveryConflicts, (1 << reason));
 					InterruptPending = true;
 					return;
 				}
@@ -3222,7 +3213,7 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason)
 							 " database and repeat your command.")));
 			break;
 
-		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+		case RECOVERY_CONFLICT_DATABASE:
 
 			/* The database is being dropped; terminate the session */
 			pgstat_report_recovery_conflict(reason);
@@ -3243,6 +3234,8 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason)
 static void
 ProcessRecoveryConflictInterrupts(void)
 {
+	uint32		pending;
+
 	/*
 	 * We don't need to worry about joggling the elbow of proc_exit, because
 	 * proc_exit_prepare() holds interrupts, so ProcessInterrupts() won't call
@@ -3250,17 +3243,27 @@ ProcessRecoveryConflictInterrupts(void)
 	 */
 	Assert(!proc_exit_inprogress);
 	Assert(InterruptHoldoffCount == 0);
-	Assert(RecoveryConflictPending);
 
-	RecoveryConflictPending = false;
+	/* Are any recovery conflicts pending? */
+	pending = pg_atomic_read_membarrier_u32(&MyProc->pendingRecoveryConflicts);
+	if (pending == 0)
+		return;
 
-	for (ProcSignalReason reason = PROCSIG_RECOVERY_CONFLICT_FIRST;
-		 reason <= PROCSIG_RECOVERY_CONFLICT_LAST;
+	/*
+	 * Check the conflicts one by one, clearing each flag just before
+	 * processing the particular conflict.  This ensures that if an error is
+	 * thrown while processing one conflict, we come back here afterwards to
+	 * process the remaining ones.
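+	 *
+	 * For example, if both the RECOVERY_CONFLICT_LOCK and
+	 * RECOVERY_CONFLICT_SNAPSHOT bits are set and processing the lock
+	 * conflict throws an ERROR, the snapshot bit remains set in
+	 * pendingRecoveryConflicts, so the next CHECK_FOR_INTERRUPTS() will
+	 * bring us back here to handle it.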
+	 */
+	for (RecoveryConflictReason reason = 0;
+		 reason < NUM_RECOVERY_CONFLICT_REASONS;
 		 reason++)
 	{
-		if (RecoveryConflictPendingReasons[reason])
+		if ((pending & (1 << reason)) != 0)
 		{
-			RecoveryConflictPendingReasons[reason] = false;
+			/* clear the flag */
+			(void) pg_atomic_fetch_and_u32(&MyProc->pendingRecoveryConflicts, ~(1 << reason));
+
 			ProcessRecoveryConflictInterrupt(reason);
 		}
 	}
@@ -3451,7 +3454,7 @@ ProcessInterrupts(void)
 		}
 	}
 
-	if (RecoveryConflictPending)
+	if (pg_atomic_read_u32(&MyProc->pendingRecoveryConflicts) != 0)
 		ProcessRecoveryConflictInterrupts();
 
 	if (IdleInTransactionSessionTimeoutPending)
diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c
index d7f6d4c5ee..e6759ccaa3 100644
--- a/src/backend/utils/activity/pgstat_database.c
+++ b/src/backend/utils/activity/pgstat_database.c
@@ -17,7 +17,7 @@
 
 #include "postgres.h"
 
-#include "storage/procsignal.h"
+#include "storage/standby.h"
 #include "utils/pgstat_internal.h"
 #include "utils/timestamp.h"
 
@@ -88,31 +88,31 @@ pgstat_report_recovery_conflict(int reason)
 
 	dbentry = pgstat_prep_database_pending(MyDatabaseId);
 
-	switch (reason)
+	switch ((RecoveryConflictReason) reason)
 	{
-		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+		case RECOVERY_CONFLICT_DATABASE:
 
 			/*
 			 * Since we drop the information about the database as soon as it
 			 * replicates, there is no point in counting these conflicts.
 			 */
 			break;
-		case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+		case RECOVERY_CONFLICT_TABLESPACE:
 			dbentry->conflict_tablespace++;
 			break;
-		case PROCSIG_RECOVERY_CONFLICT_LOCK:
+		case RECOVERY_CONFLICT_LOCK:
 			dbentry->conflict_lock++;
 			break;
-		case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+		case RECOVERY_CONFLICT_SNAPSHOT:
 			dbentry->conflict_snapshot++;
 			break;
-		case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+		case RECOVERY_CONFLICT_BUFFERPIN:
 			dbentry->conflict_bufferpin++;
 			break;
-		case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
+		case RECOVERY_CONFLICT_LOGICALSLOT:
 			dbentry->conflict_logicalslot++;
 			break;
-		case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+		case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
 			dbentry->conflict_startup_deadlock++;
 			break;
 	}
diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c
index 12b8d4cefa..c7f7b8bc2d 100644
--- a/src/backend/utils/adt/mcxtfuncs.c
+++ b/src/backend/utils/adt/mcxtfuncs.c
@@ -19,6 +19,7 @@
 #include "mb/pg_wchar.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
+#include "storage/procsignal.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/hsearch.h"
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 679f0624f9..ac0df4aeaa 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -236,6 +236,16 @@ struct PGPROC
 
 	BackendType backendType;	/* what kind of process is this? */
 
+	/*
+	 * While in hot standby mode, shows that a conflict signal has been sent
+	 * for the current transaction.  Normally set and cleared while holding
+	 * ProcArrayLock, though that is not required; the field may be read
+	 * without the lock.
+	 *
+	 * This is a bitmask; each bit corresponds to a RecoveryConflictReason
+	 * enum value.
+	 */
+	pg_atomic_uint32 pendingRecoveryConflicts;
+
 	/*
 	 * Info about LWLock the process is currently waiting for, if any.
* diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 3a8593f87b..c5ab1574fe 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -77,12 +77,15 @@ extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, bool allDbs, int excludeVacuum, int *nvxids); extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid); -extern pid_t SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); + +extern bool SignalRecoveryConflict(PGPROC *proc, pid_t pid, RecoveryConflictReason reason); +extern bool SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflictReason reason); +extern void SignalRecoveryConflictWithDatabase(Oid databaseid, RecoveryConflictReason reason); + extern bool MinimumActiveBackends(int min); extern int CountDBBackends(Oid databaseid); extern int CountDBConnections(Oid databaseid); -extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode); extern int CountUserBackends(Oid roleid); extern bool CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index e52b8eb769..348fba53a9 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -36,20 +36,12 @@ typedef enum PROCSIG_BARRIER, /* global barrier interrupt */ PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */ PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */ - - /* Recovery conflict reasons */ - PROCSIG_RECOVERY_CONFLICT_FIRST, - PROCSIG_RECOVERY_CONFLICT_DATABASE = PROCSIG_RECOVERY_CONFLICT_FIRST, - PROCSIG_RECOVERY_CONFLICT_TABLESPACE, - PROCSIG_RECOVERY_CONFLICT_LOCK, - PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, - PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT, - PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, - PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, - PROCSIG_RECOVERY_CONFLICT_LAST = PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, + PROCSIG_RECOVERY_CONFLICT, /* backend is blocking recovery, check + * PGPROC->pendingRecoveryConflicts for the + * reason */ } ProcSignalReason; -#define NUM_PROCSIGNALS (PROCSIG_RECOVERY_CONFLICT_LAST + 1) +#define NUM_PROCSIGNALS (PROCSIG_RECOVERY_CONFLICT + 1) typedef enum { diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 7b10932635..65a8176785 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -16,7 +16,6 @@ #include "datatype/timestamp.h" #include "storage/lock.h" -#include "storage/procsignal.h" #include "storage/relfilelocator.h" #include "storage/standbydefs.h" @@ -25,6 +24,37 @@ extern PGDLLIMPORT int max_standby_archive_delay; extern PGDLLIMPORT int max_standby_streaming_delay; extern PGDLLIMPORT bool log_recovery_conflict_waits; +/* Recovery conflict reasons */ +typedef enum +{ + /* Backend is connected to a database that is being dropped */ + RECOVERY_CONFLICT_DATABASE, + + /* Backend is using a tablespace that is being dropped */ + RECOVERY_CONFLICT_TABLESPACE, + + /* Backend is holding a lock that is blocking recovery */ + RECOVERY_CONFLICT_LOCK, + + /* Backend is holding a snapshot that is blocking recovery */ + RECOVERY_CONFLICT_SNAPSHOT, + + /* Backend is using a logical replication slot that must be invalidated */ + RECOVERY_CONFLICT_LOGICALSLOT, + + /* Backend is holding a pin on a buffer that is blocking recovery */ + RECOVERY_CONFLICT_BUFFERPIN, + + /* + * The backend is requested to 
check for deadlocks. The startup process + * doesn't check for deadlock directly, because we want to kill one of the + * other backends instead of the startup process. + */ + RECOVERY_CONFLICT_STARTUP_DEADLOCK, +} RecoveryConflictReason; + +#define NUM_RECOVERY_CONFLICT_REASONS (RECOVERY_CONFLICT_STARTUP_DEADLOCK + 1) + extern void InitRecoveryTransactionEnvironment(void); extern void ShutdownRecoveryTransactionEnvironment(void); @@ -43,7 +73,7 @@ extern void CheckRecoveryConflictDeadlock(void); extern void StandbyDeadLockHandler(void); extern void StandbyTimeoutHandler(void); extern void StandbyLockTimeoutHandler(void); -extern void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, +extern void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting); diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 54ddee875e..5bc5bcfb20 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -74,7 +74,7 @@ extern void die(SIGNAL_ARGS); pg_noreturn extern void quickdie(SIGNAL_ARGS); extern void StatementCancelHandler(SIGNAL_ARGS); pg_noreturn extern void FloatExceptionHandler(SIGNAL_ARGS); -extern void HandleRecoveryConflictInterrupt(ProcSignalReason reason); +extern void HandleRecoveryConflictInterrupt(void); extern void ProcessClientReadInterrupt(bool blocked); extern void ProcessClientWriteInterrupt(bool blocked); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7619845fba..df42b78bc9 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2489,6 +2489,7 @@ RecordCacheArrayEntry RecordCacheEntry RecordCompareData RecordIOData +RecoveryConflictReason RecoveryLockEntry RecoveryLockXidEntry RecoveryPauseState From be5257725d7f65708f5955a3a4beaedaa370e45b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 Feb 2026 16:23:10 +0200 Subject: [PATCH 075/147] Refactor ProcessRecoveryConflictInterrupt for readability Two changes here: 1. Introduce a separate RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK flag to indicate a suspected deadlock that involves a buffer pin. Previously the startup process used the same flag for a deadlock involving just regular locks, and to check for deadlocks involving the buffer pin. The cases are handled separately in the startup process, but the receiving backend had to deduce which one it was based on HoldingBufferPinThatDelaysRecovery(). With a separate flag, the receiver doesn't need to guess. 2. Rewrite the ProcessRecoveryConflictInterrupt() function to not rely on fallthrough through the switch-statement. That was difficult to read. Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/4cc13ba1-4248-4884-b6ba-4805349e7f39@iki.fi --- src/backend/storage/ipc/standby.c | 7 +- src/backend/tcop/postgres.c | 262 +++++++++++-------- src/backend/utils/activity/pgstat_database.c | 10 + src/include/storage/standby.h | 10 +- 4 files changed, 181 insertions(+), 108 deletions(-) diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 0851789e8b..d83afbfb9d 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -859,7 +859,7 @@ ResolveRecoveryConflictWithBufferPin(void) * not be so harmful because the period that the buffer is kept pinned * is basically no so long. But we should fix this? 
*/ - SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_STARTUP_DEADLOCK); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK); } /* @@ -877,7 +877,7 @@ static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason) { Assert(reason == RECOVERY_CONFLICT_BUFFERPIN || - reason == RECOVERY_CONFLICT_STARTUP_DEADLOCK); + reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK); /* * We send signal to all backends to ask them if they are holding the @@ -1512,6 +1512,9 @@ get_recovery_conflict_desc(RecoveryConflictReason reason) reasonDesc = _("recovery conflict on replication slot"); break; case RECOVERY_CONFLICT_STARTUP_DEADLOCK: + reasonDesc = _("recovery conflict on deadlock"); + break; + case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: reasonDesc = _("recovery conflict on buffer deadlock"); break; case RECOVERY_CONFLICT_DATABASE: diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 664161886c..21de158adb 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -179,6 +179,9 @@ static bool IsTransactionExitStmt(Node *parsetree); static bool IsTransactionExitStmtList(List *pstmts); static bool IsTransactionStmtList(List *pstmts); static void drop_unnamed_stmt(void); +static void ProcessRecoveryConflictInterrupts(void); +static void ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason); +static void report_recovery_conflict(RecoveryConflictReason reason); static void log_disconnections(int code, Datum arg); static void enable_statement_timeout(void); static void disable_statement_timeout(void); @@ -2554,6 +2557,9 @@ errdetail_recovery_conflict(RecoveryConflictReason reason) errdetail("User was using a logical replication slot that must be invalidated."); break; case RECOVERY_CONFLICT_STARTUP_DEADLOCK: + errdetail("User transaction caused deadlock with recovery."); + break; + case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: errdetail("User transaction caused buffer deadlock with recovery."); break; case RECOVERY_CONFLICT_DATABASE: @@ -3083,35 +3089,62 @@ ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason) case RECOVERY_CONFLICT_STARTUP_DEADLOCK: /* + * The startup process is waiting on a lock held by us, and has + * requested us to check if it is a deadlock (i.e. the deadlock + * timeout expired). + * * If we aren't waiting for a lock we can never deadlock. */ if (GetAwaitedLock() == NULL) return; - /* Intentional fall through to check wait for pin */ - /* FALLTHROUGH */ + /* Set the flag so that ProcSleep() will check for deadlocks. */ + CheckDeadLockAlert(); + return; - case RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: /* - * If RECOVERY_CONFLICT_BUFFERPIN is requested but we aren't - * blocking the Startup process there is nothing more to do. + * The startup process is waiting on a buffer pin, and has + * requested us to check if there is a deadlock involving the pin. * - * When RECOVERY_CONFLICT_STARTUP_DEADLOCK is requested, if we're - * waiting for locks and the startup process is not waiting for - * buffer pin (i.e., also waiting for locks), we set the flag so - * that ProcSleep() will check for deadlocks. + * If we're not waiting on a lock, there can be no deadlock. + */ + if (GetAwaitedLock() == NULL) + return; + + /* + * If we're not holding the buffer pin, also no deadlock. (The + * startup process doesn't know who's holding the pin, and sends + * this signal to *all* backends, so this is the common case.) 
			 */
 			if (!HoldingBufferPinThatDelaysRecovery())
-			{
-				if (reason == RECOVERY_CONFLICT_STARTUP_DEADLOCK &&
-					GetStartupBufferPinWaitBufId() < 0)
-					CheckDeadLockAlert();
 				return;
-			}

-			/* Intentional fall through to error handling */
-			/* FALLTHROUGH */
+			/*
+			 * Otherwise, we probably have a deadlock.  Unfortunately the
+			 * normal deadlock detector doesn't know about buffer pins, so we
+			 * cannot perform a comprehensive deadlock check.  Instead, we
+			 * just assume that it is a deadlock if the above two conditions
+			 * are met.  In principle this can lead to false positives, but
+			 * it's rare in practice because sessions in a hot standby server
+			 * rarely hold locks that can block other backends.
+			 */
+			report_recovery_conflict(reason);
+			return;
+
+		case RECOVERY_CONFLICT_BUFFERPIN:
+
+			/*
+			 * Someone is holding a buffer pin that the startup process is
+			 * waiting for, and it got tired of waiting.  If that's us, error
+			 * out to release the pin.
+			 */
+			if (!HoldingBufferPinThatDelaysRecovery())
+				return;
+
+			report_recovery_conflict(reason);
+			return;

 		case RECOVERY_CONFLICT_LOCK:
 		case RECOVERY_CONFLICT_TABLESPACE:
@@ -3123,109 +3156,128 @@ ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason)
 			if (!IsTransactionOrTransactionBlock())
 				return;

-			/* FALLTHROUGH */
+			report_recovery_conflict(reason);
+			return;

 		case RECOVERY_CONFLICT_LOGICALSLOT:
+			report_recovery_conflict(reason);
+			return;

-			/*
-			 * If we're not in a subtransaction then we are OK to throw an
-			 * ERROR to resolve the conflict.  Otherwise drop through to the
-			 * FATAL case.
-			 *
-			 * RECOVERY_CONFLICT_LOGICALSLOT is a special case that always
-			 * throws an ERROR (ie never promotes to FATAL), though it still
-			 * has to respect QueryCancelHoldoffCount, so it shares this code
-			 * path.  Logical decoding slots are only acquired while
-			 * performing logical decoding.  During logical decoding no user
-			 * controlled code is run.  During [sub]transaction abort, the
-			 * slot is released.  Therefore user controlled code cannot
-			 * intercept an error before the replication slot is released.
-			 *
-			 * XXX other times that we can throw just an ERROR *may* be
-			 * RECOVERY_CONFLICT_LOCK if no locks are held in parent
-			 * transactions
-			 *
-			 * RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by parent
-			 * transactions and the transaction is not transaction-snapshot
-			 * mode
-			 *
-			 * RECOVERY_CONFLICT_TABLESPACE if no temp files or cursors open
-			 * in parent transactions
-			 */
-			if (reason == RECOVERY_CONFLICT_LOGICALSLOT ||
-				!IsSubTransaction())
-			{
-				/*
-				 * If we already aborted then we no longer need to cancel.  We
-				 * do this here since we do not wish to ignore aborted
-				 * subtransactions, which must cause FATAL, currently.
-				 */
-				if (IsAbortedTransactionBlockState())
-					return;

+		case RECOVERY_CONFLICT_DATABASE:
+
+			/* The database is being dropped; terminate the session */
+			report_recovery_conflict(reason);
+			return;
+	}
+	elog(FATAL, "unrecognized conflict mode: %d", (int) reason);
+}

+/*
+ * This transaction or session is conflicting with recovery and needs to be
+ * killed.  Roll back the transaction, if that's sufficient, or terminate the
+ * connection, or do nothing if we're already in an aborted state.
+ */ +static void +report_recovery_conflict(RecoveryConflictReason reason) +{ + bool fatal; + + if (reason == RECOVERY_CONFLICT_DATABASE) + { + /* note: no hint about reconnecting, and different errcode */ + pgstat_report_recovery_conflict(reason); + ereport(FATAL, + (errcode(ERRCODE_DATABASE_DROPPED), + errmsg("terminating connection due to conflict with recovery"), + errdetail_recovery_conflict(reason))); + } + if (reason == RECOVERY_CONFLICT_LOGICALSLOT) + { + /* + * RECOVERY_CONFLICT_LOGICALSLOT is a special case that always throws + * an ERROR (ie never promotes to FATAL), though it still has to + * respect QueryCancelHoldoffCount, so it shares this code path. + * Logical decoding slots are only acquired while performing logical + * decoding. During logical decoding no user controlled code is run. + * During [sub]transaction abort, the slot is released. Therefore + * user controlled code cannot intercept an error before the + * replication slot is released. + */ + fatal = false; + } + else + { + fatal = IsSubTransaction(); + } + + /* + * If we're not in a subtransaction then we are OK to throw an ERROR to + * resolve the conflict. + * + * XXX other times that we can throw just an ERROR *may* be + * RECOVERY_CONFLICT_LOCK if no locks are held in parent transactions + * + * RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by parent + * transactions and the transaction is not transaction-snapshot mode + * + * RECOVERY_CONFLICT_TABLESPACE if no temp files or cursors open in parent + * transactions + */ + if (!fatal) + { + /* + * If we already aborted then we no longer need to cancel. We do this + * here since we do not wish to ignore aborted subtransactions, which + * must cause FATAL, currently. + */ + if (IsAbortedTransactionBlockState()) + return; + + /* + * If a recovery conflict happens while we are waiting for input from + * the client, the client is presumably just sitting idle in a + * transaction, preventing recovery from making progress. We'll drop + * through to the FATAL case below to dislodge it, in that case. + */ + if (!DoingCommandRead) + { + /* Avoid losing sync in the FE/BE protocol. */ + if (QueryCancelHoldoffCount != 0) + { /* - * If a recovery conflict happens while we are waiting for - * input from the client, the client is presumably just - * sitting idle in a transaction, preventing recovery from - * making progress. We'll drop through to the FATAL case - * below to dislodge it, in that case. + * Re-arm and defer this interrupt until later. See similar + * code in ProcessInterrupts(). */ - if (!DoingCommandRead) - { - /* Avoid losing sync in the FE/BE protocol. */ - if (QueryCancelHoldoffCount != 0) - { - /* - * Re-arm and defer this interrupt until later. See - * similar code in ProcessInterrupts(). - */ - (void) pg_atomic_fetch_or_u32(&MyProc->pendingRecoveryConflicts, (1 << reason)); - InterruptPending = true; - return; - } - - /* - * We are cleared to throw an ERROR. Either it's the - * logical slot case, or we have a top-level transaction - * that we can abort and a conflict that isn't inherently - * non-retryable. 
-					 */
-					LockErrorCleanup();
-					pgstat_report_recovery_conflict(reason);
-					ereport(ERROR,
-							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
-							 errmsg("canceling statement due to conflict with recovery"),
-							 errdetail_recovery_conflict(reason)));
-					break;
-				}
 			}

 			/*
-			 * We couldn't resolve the conflict with ERROR, so terminate the
-			 * whole session.
+			 * We are cleared to throw an ERROR.  Either it's the logical slot
+			 * case, or we have a top-level transaction that we can abort and
+			 * a conflict that isn't inherently non-retryable.
 			 */
+			LockErrorCleanup();
 			pgstat_report_recovery_conflict(reason);
-			ereport(FATAL,
+			ereport(ERROR,
 					(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
-					 errmsg("terminating connection due to conflict with recovery"),
-					 errdetail_recovery_conflict(reason),
-					 errhint("In a moment you should be able to reconnect to the"
-							 " database and repeat your command.")));
-			break;
-
-		case RECOVERY_CONFLICT_DATABASE:
-
-			/* The database is being dropped; terminate the session */
-			pgstat_report_recovery_conflict(reason);
-			ereport(FATAL,
-					(errcode(ERRCODE_DATABASE_DROPPED),
-					 errmsg("terminating connection due to conflict with recovery"),
+					 errmsg("canceling statement due to conflict with recovery"),
 					 errdetail_recovery_conflict(reason)));
-			break;
-
-		default:
-			elog(FATAL, "unrecognized conflict mode: %d", (int) reason);
+		}
 	}
+
+	/*
+	 * We couldn't resolve the conflict with ERROR, so terminate the whole
+	 * session.
+	 */
+	pgstat_report_recovery_conflict(reason);
+	ereport(FATAL,
+			(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+			 errmsg("terminating connection due to conflict with recovery"),
+			 errdetail_recovery_conflict(reason),
+			 errhint("In a moment you should be able to reconnect to the"
+					 " database and repeat your command.")));
 }

 /*
diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c
index e6759ccaa3..6309909bcd 100644
--- a/src/backend/utils/activity/pgstat_database.c
+++ b/src/backend/utils/activity/pgstat_database.c
@@ -115,6 +115,16 @@ pgstat_report_recovery_conflict(int reason)
 		case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
 			dbentry->conflict_startup_deadlock++;
 			break;
+		case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
+
+			/*
+			 * The difference between RECOVERY_CONFLICT_STARTUP_DEADLOCK and
+			 * RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK is merely whether a buffer
+			 * pin was part of the deadlock.  We use the same counter for both
+			 * reasons.
+			 */
+			dbentry->conflict_startup_deadlock++;
+			break;
 	}
 }
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 65a8176785..c63a4f2cc6 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -51,9 +51,17 @@ typedef enum
 	 * other backends instead of the startup process.
 	 */
 	RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+
+	/*
+	 * Like RECOVERY_CONFLICT_STARTUP_DEADLOCK, but the suspected deadlock
+	 * involves a buffer pin that some other backend is holding.  That needs
+	 * special checking because the normal deadlock detector doesn't track
+	 * buffer pins.
+	 */
+	RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK,
 } RecoveryConflictReason;

-#define NUM_RECOVERY_CONFLICT_REASONS (RECOVERY_CONFLICT_STARTUP_DEADLOCK + 1)
+#define NUM_RECOVERY_CONFLICT_REASONS (RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK + 1)

 extern void InitRecoveryTransactionEnvironment(void);
 extern void ShutdownRecoveryTransactionEnvironment(void);
From cbdf93d4712229fd82d40d823882a5bc84e407e5 Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Tue, 10 Feb 2026 11:49:07 -0500
Subject: [PATCH 076/147] Fix PGS_CONSIDER_NONPARTIAL interaction with
 Materialize nodes.

Commit 4020b370f214315b8c10430301898ac21658143f took the approach of
testing PGS_CONSIDER_NONPARTIAL within cost_material to save callers
the trouble, but that turns out not to be a very good idea. One
concern is that it makes cost_material() dependent on the caller
having initialized certain fields in the MaterialPath, which is a bit
awkward for materialize_finished_plan, which wants to use a dummy
path.

Another problem is that it can result in generating materialized
nested loops where the Materialize node is disabled, contrary to the
intention of joinpath.c's logic in match_unsorted_outer() and
consider_parallel_nestloop(), which aims to consider such paths only
when they would not need to be disabled. In the previous coding, it
was possible for the pgs_mask on the joinrel to have
PGS_CONSIDER_NONPARTIAL set, while the inner rel had the same bit
clear. In that case, we'd generate and then disable a Materialize
path. That seems wrong, so instead, pull up the logic to test the
PGS_CONSIDER_NONPARTIAL bit into joinpath.c, restoring the historical
behavior that either we don't generate a given materialized nested
loop in the first place, or we don't disable it.

Discussion: http://postgr.es/m/CA+TgmoawzvCoZAwFS85tE5+c8vBkqgcS8ZstQ_ohjXQ9wGT9sw@mail.gmail.com
Discussion: http://postgr.es/m/CA+TgmoYS4ZCVAF2jTce=bMP0Oq_db_srocR4cZyO0OBp9oUoGg@mail.gmail.com
---
 src/backend/optimizer/path/costsize.c   |  5 -----
 src/backend/optimizer/path/joinpath.c   | 11 ++++++++++-
 src/backend/optimizer/plan/createplan.c |  4 ----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index c30d6e8467..89ca4e08bf 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2589,11 +2589,6 @@ cost_material(Path *path,
 	double		nbytes = relation_byte_size(tuples, width);
 	double		work_mem_bytes = work_mem * (Size) 1024;

-	if (path->parallel_workers == 0 &&
-		path->parent != NULL &&
-		(path->parent->pgs_mask & PGS_CONSIDER_NONPARTIAL) == 0)
-		enabled = false;
-
 	path->rows = tuples;

 	/*
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index 1e4246b49d..e0c00e26dd 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -1895,8 +1895,17 @@ match_unsorted_outer(PlannerInfo *root,
 		/*
 		 * Consider materializing the cheapest inner path, unless that is
 		 * disabled or the path in question materializes its output anyway.
+		 *
+		 * At present, we only consider materialization for non-partial outer
+		 * paths, so it's correct to test PGS_CONSIDER_NONPARTIAL here.  If we
+		 * ever want to consider materialization for partial paths, we'll need
+		 * to create matpath whenever PGS_NESTLOOP_MATERIALIZE is set, use it
+		 * for partial paths either way, and use it for non-partial paths only
+		 * when PGS_CONSIDER_NONPARTIAL is also set.
		 */
-		if ((extra->pgs_mask & PGS_NESTLOOP_MATERIALIZE) != 0 &&
+		if ((extra->pgs_mask &
+			 (PGS_NESTLOOP_MATERIALIZE | PGS_CONSIDER_NONPARTIAL)) ==
+			(PGS_NESTLOOP_MATERIALIZE | PGS_CONSIDER_NONPARTIAL) &&
 			inner_cheapest_total != NULL &&
 			!ExecMaterializesOutput(inner_cheapest_total->pathtype))
 			matpath = (Path *)
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index e5200f4b3c..a50260290f 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -6526,10 +6526,6 @@ materialize_finished_plan(Plan *subplan)
 	subplan->startup_cost -= initplan_cost;
 	subplan->total_cost -= initplan_cost;

-	/* Clear fields that cost_material() will consult */
-	matpath.parallel_workers = 0;
-	matpath.parent = NULL;
-
 	/* Set cost data */
 	cost_material(&matpath,
 				  enable_material,
From 0f4c8d33d49da012a04076159a008c9fa80bcc47 Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Tue, 10 Feb 2026 11:50:28 -0500
Subject: [PATCH 077/147] Pass cursorOptions to planner_setup_hook.

Commit 94f3ad3961a2cb32d30c79f01a70db4caff13318 failed to do this
because I couldn't think of a use for the information, but this has
proven to be short-sighted. Best to fix it before this code is
officially released.

Now, the only argument to standard_planner that isn't passed to
planner_setup_hook is boundParams, but that is accessible via
glob->boundParams, and so doesn't need to be passed separately.

Discussion: https://www.postgresql.org/message-id/CA+TgmoYS4ZCVAF2jTce=bMP0Oq_db_srocR4cZyO0OBp9oUoGg@mail.gmail.com
---
 src/backend/optimizer/plan/planner.c | 3 ++-
 src/include/optimizer/planner.h      | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 757bdc7b1d..0c93f41fff 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -511,7 +511,8 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,

 	/* Allow plugins to take control after we've initialized "glob" */
 	if (planner_setup_hook)
-		(*planner_setup_hook) (glob, parse, query_string, &tuple_fraction, es);
+		(*planner_setup_hook) (glob, parse, query_string, cursorOptions,
+							   &tuple_fraction, es);

 	/* primary planning entry point (may recurse for subqueries) */
 	root = subquery_planner(glob, parse, NULL, NULL, false, tuple_fraction,
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h
index ae3f7f2edb..80509773c0 100644
--- a/src/include/optimizer/planner.h
+++ b/src/include/optimizer/planner.h
@@ -35,6 +35,7 @@ extern PGDLLIMPORT planner_hook_type planner_hook;

 /* Hook for plugins to get control after PlannerGlobal is initialized */
 typedef void (*planner_setup_hook_type) (PlannerGlobal *glob, Query *parse,
 										 const char *query_string,
+										 int cursorOptions,
 										 double *tuple_fraction,
 										 ExplainState *es);
 extern PGDLLIMPORT planner_setup_hook_type planner_setup_hook;
From adbad833f3d9e9176e8d7005f15ea6056900227d Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Tue, 10 Feb 2026 15:33:39 -0500
Subject: [PATCH 078/147] Store information about range-table flattening in
 the final plan.

Suppose that we're currently planning a query and, when that same
query was previously planned and executed, we learned something about
how a certain table within that query should be planned.
We want to take note when that same table is being planned during the current planning cycle, but this is difficult to do, because the RTI of the table from the previous plan won't necessarily be equal to the RTI that we see during the current planning cycle. This is because each subquery has a separate range table during planning, but these are flattened into one range table when constructing the final plan, changing RTIs. Commit 8c49a484e8ebb0199fba4bd68eaaedaf49b48ed0 allows us to match up subqueries seen in the previous planning cycles with the subqueries currently being planned just by comparing textual names, but that's not quite enough to let us deduce anything about individual tables, because we don't know where each subquery's range table appears in the final, flattened range table. To fix that, store a list of SubPlanRTInfo objects in the final planned statement, each including the name of the subplan, the offset at which it begins in the flattened range table, and whether or not it was a dummy subplan -- if it was, some RTIs may have been dropped from the final range table, but also there's no need to control how a dummy subquery gets planned. The toplevel subquery has no name and always begins at rtoffset 0, so we make no entry for it. This commit teaches pg_overexplain's RANGE_TABLE option to make use of this new data to display the subquery name for each range table entry. Reviewed-by: Lukas Fittl Reviewed-by: Jakub Wartak Reviewed-by: Greg Burd Reviewed-by: Jacob Champion Reviewed-by: Amit Langote Reviewed-by: Haibo Yan Reviewed-by: Alexandra Wang Discussion: http://postgr.es/m/CA+TgmoZ-Jh1T6QyWoCODMVQdhTUPYkaZjWztzP1En4=ZHoKPzw@mail.gmail.com --- .../expected/pg_overexplain.out | 109 ++++++++++++++++++ contrib/pg_overexplain/pg_overexplain.c | 36 ++++++ contrib/pg_overexplain/sql/pg_overexplain.sql | 10 ++ src/backend/optimizer/plan/planner.c | 1 + src/backend/optimizer/plan/setrefs.c | 20 ++++ src/include/nodes/pathnodes.h | 3 + src/include/nodes/plannodes.h | 17 +++ src/tools/pgindent/typedefs.list | 1 + 8 files changed, 197 insertions(+) diff --git a/contrib/pg_overexplain/expected/pg_overexplain.out b/contrib/pg_overexplain/expected/pg_overexplain.out index 55d34666d8..f4ce828bc6 100644 --- a/contrib/pg_overexplain/expected/pg_overexplain.out +++ b/contrib/pg_overexplain/expected/pg_overexplain.out @@ -489,3 +489,112 @@ INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica'); Result RTIs: 1 (15 rows) +-- should show "Subplan: sub" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0) sub; + QUERY PLAN +---------------------------------------------- + Nested Loop + -> Seq Scan on daucus vegetables + Filter: (genus = 'daucus'::text) + Scan RTI: 6 + -> Append + Append RTIs: 1 + -> Seq Scan on brassica v_1 + Scan RTI: 3 + -> Seq Scan on daucus v_2 + Scan RTI: 4 + RTI 1 (relation, inherited, in-from-clause): + Alias: v () + Eref: v (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 1 + RTI 2 (subquery, in-from-clause): + Alias: sub () + Eref: sub (id, name, genus) + RTI 3 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: brassica + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 4 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: 
AccessShareLock + RTI 5 (relation, inherited, in-from-clause): + Subplan: sub + Eref: vegetables (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 2 + RTI 6 (relation, in-from-clause): + Subplan: sub + Alias: vegetables (id, name, genus) + Eref: vegetables (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + Unprunable RTIs: 1 3 4 5 6 +(47 rows) + +-- should show "Subplan: unnamed_subquery" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0); + QUERY PLAN +---------------------------------------------- + Nested Loop + -> Seq Scan on daucus vegetables + Filter: (genus = 'daucus'::text) + Scan RTI: 6 + -> Append + Append RTIs: 1 + -> Seq Scan on brassica v_1 + Scan RTI: 3 + -> Seq Scan on daucus v_2 + Scan RTI: 4 + RTI 1 (relation, inherited, in-from-clause): + Alias: v () + Eref: v (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 1 + RTI 2 (subquery, in-from-clause): + Eref: unnamed_subquery (id, name, genus) + RTI 3 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: brassica + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 4 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 5 (relation, inherited, in-from-clause): + Subplan: unnamed_subquery + Eref: vegetables (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 2 + RTI 6 (relation, in-from-clause): + Subplan: unnamed_subquery + Alias: vegetables (id, name, genus) + Eref: vegetables (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + Unprunable RTIs: 1 3 4 5 6 +(46 rows) + diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c index 316ffd1c87..bf8c768ed4 100644 --- a/contrib/pg_overexplain/pg_overexplain.c +++ b/contrib/pg_overexplain/pg_overexplain.c @@ -395,6 +395,8 @@ static void overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) { Index rti; + ListCell *lc_subrtinfo = list_head(plannedstmt->subrtinfos); + SubPlanRTInfo *rtinfo = NULL; /* Open group, one entry per RangeTblEntry */ ExplainOpenGroup("Range Table", "Range Table", false, es); @@ -405,6 +407,18 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) RangeTblEntry *rte = rt_fetch(rti, plannedstmt->rtable); char *kind = NULL; char *relkind; + SubPlanRTInfo *next_rtinfo; + + /* Advance to next SubRTInfo, if it's time. */ + if (lc_subrtinfo != NULL) + { + next_rtinfo = lfirst(lc_subrtinfo); + if (rti > next_rtinfo->rtoffset) + { + rtinfo = next_rtinfo; + lc_subrtinfo = lnext(plannedstmt->subrtinfos, lc_subrtinfo); + } + } /* NULL entries are possible; skip them */ if (rte == NULL) @@ -469,6 +483,28 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) ExplainPropertyBool("In From Clause", rte->inFromCl, es); } + /* + * Indicate which subplan is the origin of which RTE. Note dummy + * subplans. Here again, we crunch more onto one line in text format. 
+ */ + if (rtinfo != NULL) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (!rtinfo->dummy) + ExplainPropertyText("Subplan", rtinfo->plan_name, es); + else + ExplainPropertyText("Subplan", + psprintf("%s (dummy)", + rtinfo->plan_name), es); + } + else + { + ExplainPropertyText("Subplan", rtinfo->plan_name, es); + ExplainPropertyBool("Subplan Is Dummy", rtinfo->dummy, es); + } + } + /* rte->alias is optional; rte->eref is requested */ if (rte->alias != NULL) overexplain_alias("Alias", rte->alias, es); diff --git a/contrib/pg_overexplain/sql/pg_overexplain.sql b/contrib/pg_overexplain/sql/pg_overexplain.sql index 42e275ac2f..34a957cbed 100644 --- a/contrib/pg_overexplain/sql/pg_overexplain.sql +++ b/contrib/pg_overexplain/sql/pg_overexplain.sql @@ -110,3 +110,13 @@ SELECT * FROM vegetables WHERE genus = 'daucus'; -- Also test a case that involves a write. EXPLAIN (RANGE_TABLE, COSTS OFF) INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica'); + +-- should show "Subplan: sub" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0) sub; + +-- should show "Subplan: unnamed_subquery" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 0c93f41fff..2c9fb50b61 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -655,6 +655,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->unprunableRelids = bms_difference(glob->allRelids, glob->prunableRelids); result->permInfos = glob->finalrteperminfos; + result->subrtinfos = glob->subrtinfos; result->resultRelations = glob->resultRelations; result->appendRelations = glob->appendRelations; result->subplans = glob->subplans; diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 16d200cfb4..a5b2314ef2 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -399,6 +399,26 @@ add_rtes_to_flat_rtable(PlannerInfo *root, bool recursing) Index rti; ListCell *lc; + /* + * Record enough information to make it possible for code that looks at + * the final range table to understand how it was constructed. (If + * finalrtable is still NIL, then this is the very topmost PlannerInfo, + * which will always have plan_name == NULL and rtoffset == 0; we omit the + * degenerate list entry.) + */ + if (root->glob->finalrtable != NIL) + { + SubPlanRTInfo *rtinfo = makeNode(SubPlanRTInfo); + + rtinfo->plan_name = root->plan_name; + rtinfo->rtoffset = list_length(root->glob->finalrtable); + + /* When recursing = true, it's an unplanned or dummy subquery. */ + rtinfo->dummy = recursing; + + root->glob->subrtinfos = lappend(root->glob->subrtinfos, rtinfo); + } + /* * Add the query's own RTEs to the flattened rangetable. 
* diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index fb808823ac..c1e9397623 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -208,6 +208,9 @@ typedef struct PlannerGlobal /* "flat" list of RTEPermissionInfos */ List *finalrteperminfos; + /* list of SubPlanRTInfo nodes */ + List *subrtinfos; + /* "flat" list of PlanRowMarks */ List *finalrowmarks; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4bc6fb5670..9ae72a607e 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -131,6 +131,9 @@ typedef struct PlannedStmt */ List *subplans; + /* a list of SubPlanRTInfo objects */ + List *subrtinfos; + /* indices of subplans that require REWIND */ Bitmapset *rewindPlanIDs; @@ -1821,4 +1824,18 @@ typedef enum MonotonicFunction MONOTONICFUNC_BOTH = MONOTONICFUNC_INCREASING | MONOTONICFUNC_DECREASING, } MonotonicFunction; +/* + * SubPlanRTInfo + * + * Information about which range table entries came from which subquery + * planning cycles. + */ +typedef struct SubPlanRTInfo +{ + NodeTag type; + char *plan_name; + Index rtoffset; + bool dummy; +} SubPlanRTInfo; + #endif /* PLANNODES_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index df42b78bc9..e83ced4d74 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2931,6 +2931,7 @@ SubLink SubLinkType SubOpts SubPlan +SubPlanRTInfo SubPlanState SubRelInfo SubRemoveRels From 0d4391b265f83023d0b7eed71817517410f76e60 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 10 Feb 2026 16:46:05 -0500 Subject: [PATCH 079/147] Store information about elided nodes in the final plan. An extension (or core code) might want to reconstruct the planner's choice of join order from the final plan. To do so, it must be possible to find all of the RTIs that were part of the join problem in that plan. Commit adbad833f3d9e9176e8d7005f15ea6056900227d, together with the earlier work in 8c49a484e8ebb0199fba4bd68eaaedaf49b48ed0, is enough to let us match up RTIs we see in the final plan with RTIs that we see during the planning cycle, but we still have a problem if the planner decides to drop some RTIs out of the final plan altogether. To fix that, when setrefs.c removes a SubqueryScan, single-child Append, or single-child MergeAppend from the final Plan tree, record the type of the removed node and the RTIs that the removed node would have scanned in the final plan tree. It would be natural to record this information on the child of the removed plan node, but that would require adding an additional pointer field to type Plan, which seems undesirable. So, instead, store the information in a separate list that the executor need never consult, and use the plan_node_id to identify the plan node with which the removed node is logically associated. Also, update pg_overexplain to display these details. 
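
To make the intended use concrete, here is a minimal sketch (an editor's
illustration, not part of the patch) of how code examining a final plan might
gather the RTIs hidden by elided nodes above one surviving plan node. It
assumes only the ElidedNode fields added below; the helper name
collect_elided_relids is hypothetical.

	/*
	 * Collect the RTIs of all nodes that setrefs.c elided immediately
	 * above the plan node with the given plan_node_id.
	 */
	static Bitmapset *
	collect_elided_relids(PlannedStmt *stmt, int plan_node_id)
	{
		Bitmapset  *relids = NULL;

		foreach_node(ElidedNode, n, stmt->elidedNodes)
		{
			/* Several nodes can be elided above the same survivor. */
			if (n->plan_node_id == plan_node_id)
				relids = bms_add_members(relids, n->relids);
		}

		return relids;
	}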
Reviewed-by: Lukas Fittl Reviewed-by: Jakub Wartak Reviewed-by: Greg Burd Reviewed-by: Jacob Champion Reviewed-by: Amit Langote Reviewed-by: Haibo Yan Reviewed-by: Alexandra Wang Discussion: http://postgr.es/m/CA+TgmoZ-Jh1T6QyWoCODMVQdhTUPYkaZjWztzP1En4=ZHoKPzw@mail.gmail.com --- .../expected/pg_overexplain.out | 16 ++++-- contrib/pg_overexplain/pg_overexplain.c | 39 ++++++++++++++ src/backend/optimizer/plan/planner.c | 1 + src/backend/optimizer/plan/setrefs.c | 52 ++++++++++++++++++- src/include/nodes/pathnodes.h | 3 ++ src/include/nodes/plannodes.h | 20 +++++++ src/tools/pgindent/typedefs.list | 1 + 7 files changed, 127 insertions(+), 5 deletions(-) diff --git a/contrib/pg_overexplain/expected/pg_overexplain.out b/contrib/pg_overexplain/expected/pg_overexplain.out index f4ce828bc6..198bbe10d7 100644 --- a/contrib/pg_overexplain/expected/pg_overexplain.out +++ b/contrib/pg_overexplain/expected/pg_overexplain.out @@ -452,6 +452,8 @@ SELECT * FROM vegetables WHERE genus = 'daucus'; Seq Scan on daucus vegetables Filter: (genus = 'daucus'::text) Scan RTI: 2 + Elided Node Type: Append + Elided Node RTIs: 1 RTI 1 (relation, inherited, in-from-clause): Eref: vegetables (id, name, genus) Relation: vegetables @@ -465,7 +467,7 @@ SELECT * FROM vegetables WHERE genus = 'daucus'; Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 2 -(16 rows) +(18 rows) -- Also test a case that involves a write. EXPLAIN (RANGE_TABLE, COSTS OFF) @@ -499,6 +501,10 @@ SELECT * FROM vegetables v, -> Seq Scan on daucus vegetables Filter: (genus = 'daucus'::text) Scan RTI: 6 + Elided Node Type: Append + Elided Node RTIs: 5 + Elided Node Type: SubqueryScan + Elided Node RTIs: 2 -> Append Append RTIs: 1 -> Seq Scan on brassica v_1 @@ -542,7 +548,7 @@ SELECT * FROM vegetables v, Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 3 4 5 6 -(47 rows) +(51 rows) -- should show "Subplan: unnamed_subquery" EXPLAIN (RANGE_TABLE, COSTS OFF) @@ -554,6 +560,10 @@ SELECT * FROM vegetables v, -> Seq Scan on daucus vegetables Filter: (genus = 'daucus'::text) Scan RTI: 6 + Elided Node Type: Append + Elided Node RTIs: 5 + Elided Node Type: SubqueryScan + Elided Node RTIs: 2 -> Append Append RTIs: 1 -> Seq Scan on brassica v_1 @@ -596,5 +606,5 @@ SELECT * FROM vegetables v, Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 3 4 5 6 -(46 rows) +(50 rows) diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c index bf8c768ed4..e0184ba314 100644 --- a/contrib/pg_overexplain/pg_overexplain.c +++ b/contrib/pg_overexplain/pg_overexplain.c @@ -191,6 +191,8 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, */ if (options->range_table) { + bool opened_elided_nodes = false; + switch (nodeTag(plan)) { case T_SeqScan: @@ -251,6 +253,43 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, default: break; } + + foreach_node(ElidedNode, n, es->pstmt->elidedNodes) + { + char *elidednodetag; + + if (n->plan_node_id != plan->plan_node_id) + continue; + + if (!opened_elided_nodes) + { + ExplainOpenGroup("Elided Nodes", "Elided Nodes", false, es); + opened_elided_nodes = true; + } + + switch (n->elided_type) + { + case T_Append: + elidednodetag = "Append"; + break; + case T_MergeAppend: + elidednodetag = "MergeAppend"; + break; + case T_SubqueryScan: + elidednodetag = "SubqueryScan"; + break; + default: + elidednodetag = psprintf("%d", n->elided_type); + break; + } + + ExplainOpenGroup("Elided Node", NULL, 
true, es); + ExplainPropertyText("Elided Node Type", elidednodetag, es); + overexplain_bitmapset("Elided Node RTIs", n->relids, es); + ExplainCloseGroup("Elided Node", NULL, true, es); + } + if (opened_elided_nodes) + ExplainCloseGroup("Elided Nodes", "Elided Nodes", false, es); } } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 2c9fb50b61..f68142cfcb 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -666,6 +666,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->paramExecTypes = glob->paramExecTypes; /* utilityStmt should be null, but we might as well copy it */ result->utilityStmt = parse->utilityStmt; + result->elidedNodes = glob->elidedNodes; result->stmt_location = parse->stmt_location; result->stmt_len = parse->stmt_len; diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index a5b2314ef2..5ad6c13830 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -211,6 +211,9 @@ static List *set_windowagg_runcondition_references(PlannerInfo *root, List *runcondition, Plan *plan); +static void record_elided_node(PlannerGlobal *glob, int plan_node_id, + NodeTag elided_type, Bitmapset *relids); + /***************************************************************************** * @@ -1460,10 +1463,17 @@ set_subqueryscan_references(PlannerInfo *root, if (trivial_subqueryscan(plan)) { + Index scanrelid; + /* * We can omit the SubqueryScan node and just pull up the subplan. */ result = clean_up_removed_plan_level((Plan *) plan, plan->subplan); + + /* Remember that we removed a SubqueryScan */ + scanrelid = plan->scan.scanrelid + rtoffset; + record_elided_node(root->glob, plan->subplan->plan_node_id, + T_SubqueryScan, bms_make_singleton(scanrelid)); } else { @@ -1891,7 +1901,17 @@ set_append_references(PlannerInfo *root, Plan *p = (Plan *) linitial(aplan->appendplans); if (p->parallel_aware == aplan->plan.parallel_aware) - return clean_up_removed_plan_level((Plan *) aplan, p); + { + Plan *result; + + result = clean_up_removed_plan_level((Plan *) aplan, p); + + /* Remember that we removed an Append */ + record_elided_node(root->glob, p->plan_node_id, T_Append, + offset_relid_set(aplan->apprelids, rtoffset)); + + return result; + } } /* @@ -1959,7 +1979,17 @@ set_mergeappend_references(PlannerInfo *root, Plan *p = (Plan *) linitial(mplan->mergeplans); if (p->parallel_aware == mplan->plan.parallel_aware) - return clean_up_removed_plan_level((Plan *) mplan, p); + { + Plan *result; + + result = clean_up_removed_plan_level((Plan *) mplan, p); + + /* Remember that we removed a MergeAppend */ + record_elided_node(root->glob, p->plan_node_id, T_MergeAppend, + offset_relid_set(mplan->apprelids, rtoffset)); + + return result; + } } /* @@ -3774,3 +3804,21 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context) return expression_tree_walker(node, extract_query_dependencies_walker, context); } + +/* + * Record some details about a node removed from the plan during setrefs + * processing, for the benefit of code trying to reconstruct planner decisions + * from examination of the final plan tree. 
+ */
+static void
+record_elided_node(PlannerGlobal *glob, int plan_node_id,
+				   NodeTag elided_type, Bitmapset *relids)
+{
+	ElidedNode *n = makeNode(ElidedNode);
+
+	n->plan_node_id = plan_node_id;
+	n->elided_type = elided_type;
+	n->relids = relids;
+
+	glob->elidedNodes = lappend(glob->elidedNodes, n);
+}
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index c1e9397623..9cc5d2e741 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -232,6 +232,9 @@ typedef struct PlannerGlobal
 	/* type OIDs for PARAM_EXEC Params */
 	List	   *paramExecTypes;

+	/* info about nodes elided from the plan during setrefs processing */
+	List	   *elidedNodes;
+
 	/* highest PlaceHolderVar ID assigned */
 	Index		lastPHId;

diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 9ae72a607e..0ad0ff404c 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -152,6 +152,9 @@ typedef struct PlannedStmt
 	/* non-null if this is utility stmt */
 	Node	   *utilityStmt;

+	/* info about nodes elided from the plan during setrefs processing */
+	List	   *elidedNodes;
+
 	/*
	 * DefElem objects added by extensions, e.g. using planner_shutdown_hook
	 *
@@ -1838,4 +1841,21 @@ typedef struct SubPlanRTInfo
 	bool		dummy;
 } SubPlanRTInfo;

+/*
+ * ElidedNode
+ *
+ * Information about nodes elided from the final plan tree: trivial subquery
+ * scans, and single-child Append and MergeAppend nodes.
+ *
+ * plan_node_id is that of the surviving plan node, the sole child of the
+ * one which was elided.
+ */
+typedef struct ElidedNode
+{
+	NodeTag		type;
+	int			plan_node_id;
+	NodeTag		elided_type;
+	Bitmapset  *relids;
+} ElidedNode;
+
 #endif							/* PLANNODES_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e83ced4d74..523977721e 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -710,6 +710,7 @@ EachState
 Edge
 EditableObjectType
 ElementsState
+ElidedNode
 EnableTimeoutParams
 EndDataPtrType
 EndDirectModify_function
From 9181c870bada196711206f3a795bde6b8c43dcd3 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Wed, 11 Feb 2026 07:33:24 +0900
Subject: [PATCH 080/147] Improve type handling of varlena structures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit changes the definition of varlena to a typedef, so that it
becomes possible to remove "struct" markers from various declarations
in the code base. Historically, "struct" markers are not the project
style for variable declarations, so this update simplifies the code
and makes it more consistent across the board.

This change has an impact on the following structures, simplifying
declarations using them:
- varlena
- varatt_indirect
- varatt_external

This cleanup came up in a different patch set that played with TOAST
and varatt.h, and is worth doing on its own.
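
For illustration (an editor's sketch, not part of the patch): with these
typedefs in place, declarations that previously required the struct tag can
drop it, matching the style used for other variable declarations in the tree:

	/* Before: only the struct tags existed, so the marker was mandatory. */
	struct varlena *attr;
	struct varatt_external toast_pointer;

	/* After: the typedef names can be used directly. */
	varlena    *attr;
	varatt_external toast_pointer;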
Reviewed-by: Álvaro Herrera Reviewed-by: Andreas Karlsson Reviewed-by: Shinya Kato Reviewed-by: Tom Lane Reviewed-by: Chao Li Discussion: https://postgr.es/m/aW8xvVbovdhyI4yo@paquier.xyz --- contrib/amcheck/verify_heapam.c | 8 +- contrib/btree_gist/btree_utils_var.c | 4 +- contrib/pageinspect/heapfuncs.c | 2 +- doc/src/sgml/storage.sgml | 2 +- src/backend/access/brin/brin_tuple.c | 2 +- src/backend/access/common/detoast.c | 102 +++++++++--------- src/backend/access/common/indextuple.c | 2 +- src/backend/access/common/toast_compression.c | 54 +++++----- src/backend/access/common/toast_internals.c | 22 ++-- src/backend/access/hash/hashfunc.c | 4 +- src/backend/access/heap/heapam.c | 2 +- src/backend/access/heap/heaptoast.c | 14 +-- src/backend/access/table/toast_helper.c | 10 +- src/backend/executor/tstoreReceiver.c | 2 +- .../replication/logical/reorderbuffer.c | 22 ++-- src/backend/storage/large_object/inv_api.c | 2 +- src/backend/utils/adt/datum.c | 12 +-- src/backend/utils/adt/expandedrecord.c | 4 +- src/backend/utils/adt/rowtypes.c | 4 +- src/backend/utils/adt/varlena.c | 10 +- src/backend/utils/fmgr/fmgr.c | 18 ++-- src/include/access/detoast.h | 12 +-- src/include/access/heaptoast.h | 2 +- src/include/access/tableam.h | 4 +- src/include/access/toast_compression.h | 18 ++-- src/include/access/toast_helper.h | 2 +- src/include/access/toast_internals.h | 2 +- src/include/c.h | 14 +-- src/include/fmgr.h | 20 ++-- src/include/utils/varbit.h | 2 +- src/include/utils/xml.h | 2 +- src/include/varatt.h | 24 ++--- src/pl/plpgsql/src/pl_exec.c | 2 +- src/test/regress/regress.c | 12 +-- src/tools/pgindent/typedefs.list | 3 + 35 files changed, 212 insertions(+), 209 deletions(-) diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index 30c2f58317..31e19fbc69 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -73,7 +73,7 @@ typedef enum SkipPages */ typedef struct ToastedAttribute { - struct varatt_external toast_pointer; + varatt_external toast_pointer; BlockNumber blkno; /* block in main table */ OffsetNumber offnum; /* offset in main table */ AttrNumber attnum; /* attribute in main table */ @@ -1660,11 +1660,11 @@ static bool check_tuple_attribute(HeapCheckContext *ctx) { Datum attdatum; - struct varlena *attr; + varlena *attr; char *tp; /* pointer to the tuple data */ uint16 infomask; CompactAttribute *thisatt; - struct varatt_external toast_pointer; + varatt_external toast_pointer; infomask = ctx->tuphdr->t_infomask; thisatt = TupleDescCompactAttr(RelationGetDescr(ctx->rel), ctx->attnum); @@ -1754,7 +1754,7 @@ check_tuple_attribute(HeapCheckContext *ctx) * We go further, because we need to check if the toast datum is corrupt. */ - attr = (struct varlena *) DatumGetPointer(attdatum); + attr = (varlena *) DatumGetPointer(attdatum); /* * Now we follow the logic of detoast_external_attr(), with the same diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c index f6ba1c0c82..e1945cf808 100644 --- a/contrib/btree_gist/btree_utils_var.c +++ b/contrib/btree_gist/btree_utils_var.c @@ -70,7 +70,7 @@ gbt_var_key_readable(const GBT_VARKEY *k) * Create a leaf-entry to store in the index, from a single Datum. 
*/ static GBT_VARKEY * -gbt_var_key_from_datum(const struct varlena *u) +gbt_var_key_from_datum(const varlena *u) { int32 lowersize = VARSIZE(u); GBT_VARKEY *r; @@ -294,7 +294,7 @@ gbt_var_compress(GISTENTRY *entry, const gbtree_vinfo *tinfo) if (entry->leafkey) { - struct varlena *leaf = PG_DETOAST_DATUM(entry->key); + varlena *leaf = PG_DETOAST_DATUM(entry->key); GBT_VARKEY *r; r = gbt_var_key_from_datum(leaf); diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 2f0dfff175..8e31632ce0 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -396,7 +396,7 @@ tuple_data_split_internal(Oid relid, char *tupdata, errmsg("unexpected end of tuple data"))); if (attr->attlen == -1 && do_detoast) - attr_data = pg_detoast_datum_copy((struct varlena *) (tupdata + off)); + attr_data = pg_detoast_datum_copy((varlena *) (tupdata + off)); else { attr_data = (bytea *) palloc(len + VARHDRSZ); diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index 02ddfda834..6b6377503b 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -1068,7 +1068,7 @@ data. Empty in ordinary tables. fixed width field, then all the bytes are simply placed. If it's a variable length field (attlen = -1) then it's a bit more complicated. All variable-length data types share the common header structure - struct varlena, which includes the total length of the stored + varlena, which includes the total length of the stored value and some flag bits. Depending on the flags, the data can be either inline or in a TOAST table; it might be compressed, too (see ). diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c index 706387e36d..69c233c62e 100644 --- a/src/backend/access/brin/brin_tuple.c +++ b/src/backend/access/brin/brin_tuple.c @@ -206,7 +206,7 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, */ if (VARATT_IS_EXTERNAL(DatumGetPointer(value))) { - value = PointerGetDatum(detoast_external_attr((struct varlena *) + value = PointerGetDatum(detoast_external_attr((varlena *) DatumGetPointer(value))); free_value = true; } diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c index 7bef01bb5f..a6c1f3a734 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -22,12 +22,12 @@ #include "utils/expandeddatum.h" #include "utils/rel.h" -static struct varlena *toast_fetch_datum(struct varlena *attr); -static struct varlena *toast_fetch_datum_slice(struct varlena *attr, - int32 sliceoffset, - int32 slicelength); -static struct varlena *toast_decompress_datum(struct varlena *attr); -static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); +static varlena *toast_fetch_datum(varlena *attr); +static varlena *toast_fetch_datum_slice(varlena *attr, + int32 sliceoffset, + int32 slicelength); +static varlena *toast_decompress_datum(varlena *attr); +static varlena *toast_decompress_datum_slice(varlena *attr, int32 slicelength); /* ---------- * detoast_external_attr - @@ -41,10 +41,10 @@ static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 * EXTERNAL datum, the result will be a pfree'able chunk. 
* ---------- */ -struct varlena * -detoast_external_attr(struct varlena *attr) +varlena * +detoast_external_attr(varlena *attr) { - struct varlena *result; + varlena *result; if (VARATT_IS_EXTERNAL_ONDISK(attr)) { @@ -58,10 +58,10 @@ detoast_external_attr(struct varlena *attr) /* * This is an indirect pointer --- dereference it */ - struct varatt_indirect redirect; + varatt_indirect redirect; VARATT_EXTERNAL_GET_POINTER(redirect, attr); - attr = (struct varlena *) redirect.pointer; + attr = (varlena *) redirect.pointer; /* nested indirect Datums aren't allowed */ Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); @@ -74,7 +74,7 @@ detoast_external_attr(struct varlena *attr) * Copy into the caller's memory context, in case caller tries to * pfree the result. */ - result = (struct varlena *) palloc(VARSIZE_ANY(attr)); + result = (varlena *) palloc(VARSIZE_ANY(attr)); memcpy(result, attr, VARSIZE_ANY(attr)); } else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) @@ -87,7 +87,7 @@ detoast_external_attr(struct varlena *attr) eoh = DatumGetEOHP(PointerGetDatum(attr)); resultsize = EOH_get_flat_size(eoh); - result = (struct varlena *) palloc(resultsize); + result = (varlena *) palloc(resultsize); EOH_flatten_into(eoh, result, resultsize); } else @@ -112,8 +112,8 @@ detoast_external_attr(struct varlena *attr) * datum, the result will be a pfree'able chunk. * ---------- */ -struct varlena * -detoast_attr(struct varlena *attr) +varlena * +detoast_attr(varlena *attr) { if (VARATT_IS_EXTERNAL_ONDISK(attr)) { @@ -124,7 +124,7 @@ detoast_attr(struct varlena *attr) /* If it's compressed, decompress it */ if (VARATT_IS_COMPRESSED(attr)) { - struct varlena *tmp = attr; + varlena *tmp = attr; attr = toast_decompress_datum(tmp); pfree(tmp); @@ -135,10 +135,10 @@ detoast_attr(struct varlena *attr) /* * This is an indirect pointer --- dereference it */ - struct varatt_indirect redirect; + varatt_indirect redirect; VARATT_EXTERNAL_GET_POINTER(redirect, attr); - attr = (struct varlena *) redirect.pointer; + attr = (varlena *) redirect.pointer; /* nested indirect Datums aren't allowed */ Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); @@ -147,11 +147,11 @@ detoast_attr(struct varlena *attr) attr = detoast_attr(attr); /* if it isn't, we'd better copy it */ - if (attr == (struct varlena *) redirect.pointer) + if (attr == (varlena *) redirect.pointer) { - struct varlena *result; + varlena *result; - result = (struct varlena *) palloc(VARSIZE_ANY(attr)); + result = (varlena *) palloc(VARSIZE_ANY(attr)); memcpy(result, attr, VARSIZE_ANY(attr)); attr = result; } @@ -179,9 +179,9 @@ detoast_attr(struct varlena *attr) */ Size data_size = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT; Size new_size = data_size + VARHDRSZ; - struct varlena *new_attr; + varlena *new_attr; - new_attr = (struct varlena *) palloc(new_size); + new_attr = (varlena *) palloc(new_size); SET_VARSIZE(new_attr, new_size); memcpy(VARDATA(new_attr), VARDATA_SHORT(attr), data_size); attr = new_attr; @@ -201,12 +201,12 @@ detoast_attr(struct varlena *attr) * If slicelength < 0, return everything beyond sliceoffset * ---------- */ -struct varlena * -detoast_attr_slice(struct varlena *attr, +varlena * +detoast_attr_slice(varlena *attr, int32 sliceoffset, int32 slicelength) { - struct varlena *preslice; - struct varlena *result; + varlena *preslice; + varlena *result; char *attrdata; int32 slicelimit; int32 attrsize; @@ -225,7 +225,7 @@ detoast_attr_slice(struct varlena *attr, if (VARATT_IS_EXTERNAL_ONDISK(attr)) { - struct varatt_external toast_pointer; + varatt_external 
toast_pointer; VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); @@ -266,7 +266,7 @@ detoast_attr_slice(struct varlena *attr, } else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) { - struct varatt_indirect redirect; + varatt_indirect redirect; VARATT_EXTERNAL_GET_POINTER(redirect, attr); @@ -288,7 +288,7 @@ detoast_attr_slice(struct varlena *attr, if (VARATT_IS_COMPRESSED(preslice)) { - struct varlena *tmp = preslice; + varlena *tmp = preslice; /* Decompress enough to encompass the slice and the offset */ if (slicelimit >= 0) @@ -321,7 +321,7 @@ detoast_attr_slice(struct varlena *attr, else if (slicelength < 0 || slicelimit > attrsize) slicelength = attrsize - sliceoffset; - result = (struct varlena *) palloc(slicelength + VARHDRSZ); + result = (varlena *) palloc(slicelength + VARHDRSZ); SET_VARSIZE(result, slicelength + VARHDRSZ); memcpy(VARDATA(result), attrdata + sliceoffset, slicelength); @@ -339,12 +339,12 @@ detoast_attr_slice(struct varlena *attr, * in the toast relation * ---------- */ -static struct varlena * -toast_fetch_datum(struct varlena *attr) +static varlena * +toast_fetch_datum(varlena *attr) { Relation toastrel; - struct varlena *result; - struct varatt_external toast_pointer; + varlena *result; + varatt_external toast_pointer; int32 attrsize; if (!VARATT_IS_EXTERNAL_ONDISK(attr)) @@ -355,7 +355,7 @@ toast_fetch_datum(struct varlena *attr) attrsize = VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer); - result = (struct varlena *) palloc(attrsize + VARHDRSZ); + result = (varlena *) palloc(attrsize + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) SET_VARSIZE_COMPRESSED(result, attrsize + VARHDRSZ); @@ -392,13 +392,13 @@ toast_fetch_datum(struct varlena *attr) * has to be a prefix, i.e. sliceoffset has to be 0). * ---------- */ -static struct varlena * -toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, +static varlena * +toast_fetch_datum_slice(varlena *attr, int32 sliceoffset, int32 slicelength) { Relation toastrel; - struct varlena *result; - struct varatt_external toast_pointer; + varlena *result; + varatt_external toast_pointer; int32 attrsize; if (!VARATT_IS_EXTERNAL_ONDISK(attr)) @@ -438,7 +438,7 @@ toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, if (((sliceoffset + slicelength) > attrsize) || slicelength < 0) slicelength = attrsize - sliceoffset; - result = (struct varlena *) palloc(slicelength + VARHDRSZ); + result = (varlena *) palloc(slicelength + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) SET_VARSIZE_COMPRESSED(result, slicelength + VARHDRSZ); @@ -467,8 +467,8 @@ toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, * * Decompress a compressed version of a varlena datum */ -static struct varlena * -toast_decompress_datum(struct varlena *attr) +static varlena * +toast_decompress_datum(varlena *attr) { ToastCompressionId cmid; @@ -499,8 +499,8 @@ toast_decompress_datum(struct varlena *attr) * offset handling happens in detoast_attr_slice. * Here we just decompress a slice from the front. 
*/ -static struct varlena * -toast_decompress_datum_slice(struct varlena *attr, int32 slicelength) +static varlena * +toast_decompress_datum_slice(varlena *attr, int32 slicelength) { ToastCompressionId cmid; @@ -544,20 +544,20 @@ toast_decompress_datum_slice(struct varlena *attr, int32 slicelength) Size toast_raw_datum_size(Datum value) { - struct varlena *attr = (struct varlena *) DatumGetPointer(value); + varlena *attr = (varlena *) DatumGetPointer(value); Size result; if (VARATT_IS_EXTERNAL_ONDISK(attr)) { /* va_rawsize is the size of the original datum -- including header */ - struct varatt_external toast_pointer; + varatt_external toast_pointer; VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); result = toast_pointer.va_rawsize; } else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) { - struct varatt_indirect toast_pointer; + varatt_indirect toast_pointer; VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); @@ -600,7 +600,7 @@ toast_raw_datum_size(Datum value) Size toast_datum_size(Datum value) { - struct varlena *attr = (struct varlena *) DatumGetPointer(value); + varlena *attr = (varlena *) DatumGetPointer(value); Size result; if (VARATT_IS_EXTERNAL_ONDISK(attr)) @@ -610,14 +610,14 @@ toast_datum_size(Datum value) * compressed or not. We do not count the size of the toast pointer * ... should we? */ - struct varatt_external toast_pointer; + varatt_external toast_pointer; VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); result = VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer); } else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) { - struct varatt_indirect toast_pointer; + varatt_indirect toast_pointer; VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c index d7c8c53fd8..d6350201e0 100644 --- a/src/backend/access/common/indextuple.c +++ b/src/backend/access/common/indextuple.c @@ -108,7 +108,7 @@ index_form_tuple_context(TupleDesc tupleDescriptor, if (VARATT_IS_EXTERNAL(DatumGetPointer(values[i]))) { untoasted_values[i] = - PointerGetDatum(detoast_external_attr((struct varlena *) + PointerGetDatum(detoast_external_attr((varlena *) DatumGetPointer(values[i]))); untoasted_free[i] = true; } diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index d449613b21..4d00537049 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -36,12 +36,12 @@ int default_toast_compression = TOAST_PGLZ_COMPRESSION; * * Returns the compressed varlena, or NULL if compression fails. */ -struct varlena * -pglz_compress_datum(const struct varlena *value) +varlena * +pglz_compress_datum(const varlena *value) { int32 valsize, len; - struct varlena *tmp = NULL; + varlena *tmp = NULL; valsize = VARSIZE_ANY_EXHDR(value); @@ -57,8 +57,8 @@ pglz_compress_datum(const struct varlena *value) * Figure out the maximum possible size of the pglz output, add the bytes * that will be needed for varlena overhead, and allocate that amount. */ - tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize) + - VARHDRSZ_COMPRESSED); + tmp = (varlena *) palloc(PGLZ_MAX_OUTPUT(valsize) + + VARHDRSZ_COMPRESSED); len = pglz_compress(VARDATA_ANY(value), valsize, @@ -78,14 +78,14 @@ pglz_compress_datum(const struct varlena *value) /* * Decompress a varlena that was compressed using PGLZ. 
*/ -struct varlena * -pglz_decompress_datum(const struct varlena *value) +varlena * +pglz_decompress_datum(const varlena *value) { - struct varlena *result; + varlena *result; int32 rawsize; /* allocate memory for the uncompressed data */ - result = (struct varlena *) palloc(VARDATA_COMPRESSED_GET_EXTSIZE(value) + VARHDRSZ); + result = (varlena *) palloc(VARDATA_COMPRESSED_GET_EXTSIZE(value) + VARHDRSZ); /* decompress the data */ rawsize = pglz_decompress((const char *) value + VARHDRSZ_COMPRESSED, @@ -105,15 +105,15 @@ pglz_decompress_datum(const struct varlena *value) /* * Decompress part of a varlena that was compressed using PGLZ. */ -struct varlena * -pglz_decompress_datum_slice(const struct varlena *value, +varlena * +pglz_decompress_datum_slice(const varlena *value, int32 slicelength) { - struct varlena *result; + varlena *result; int32 rawsize; /* allocate memory for the uncompressed data */ - result = (struct varlena *) palloc(slicelength + VARHDRSZ); + result = (varlena *) palloc(slicelength + VARHDRSZ); /* decompress the data */ rawsize = pglz_decompress((const char *) value + VARHDRSZ_COMPRESSED, @@ -135,8 +135,8 @@ pglz_decompress_datum_slice(const struct varlena *value, * * Returns the compressed varlena, or NULL if compression fails. */ -struct varlena * -lz4_compress_datum(const struct varlena *value) +varlena * +lz4_compress_datum(const varlena *value) { #ifndef USE_LZ4 NO_COMPRESSION_SUPPORT("lz4"); @@ -145,7 +145,7 @@ lz4_compress_datum(const struct varlena *value) int32 valsize; int32 len; int32 max_size; - struct varlena *tmp = NULL; + varlena *tmp = NULL; valsize = VARSIZE_ANY_EXHDR(value); @@ -154,7 +154,7 @@ lz4_compress_datum(const struct varlena *value) * that will be needed for varlena overhead, and allocate that amount. */ max_size = LZ4_compressBound(valsize); - tmp = (struct varlena *) palloc(max_size + VARHDRSZ_COMPRESSED); + tmp = (varlena *) palloc(max_size + VARHDRSZ_COMPRESSED); len = LZ4_compress_default(VARDATA_ANY(value), (char *) tmp + VARHDRSZ_COMPRESSED, @@ -178,18 +178,18 @@ lz4_compress_datum(const struct varlena *value) /* * Decompress a varlena that was compressed using LZ4. */ -struct varlena * -lz4_decompress_datum(const struct varlena *value) +varlena * +lz4_decompress_datum(const varlena *value) { #ifndef USE_LZ4 NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; - struct varlena *result; + varlena *result; /* allocate memory for the uncompressed data */ - result = (struct varlena *) palloc(VARDATA_COMPRESSED_GET_EXTSIZE(value) + VARHDRSZ); + result = (varlena *) palloc(VARDATA_COMPRESSED_GET_EXTSIZE(value) + VARHDRSZ); /* decompress the data */ rawsize = LZ4_decompress_safe((const char *) value + VARHDRSZ_COMPRESSED, @@ -211,22 +211,22 @@ lz4_decompress_datum(const struct varlena *value) /* * Decompress part of a varlena that was compressed using LZ4. 
*/ -struct varlena * -lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) +varlena * +lz4_decompress_datum_slice(const varlena *value, int32 slicelength) { #ifndef USE_LZ4 NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; - struct varlena *result; + varlena *result; /* slice decompression not supported prior to 1.8.3 */ if (LZ4_versionNumber() < 10803) return lz4_decompress_datum(value); /* allocate memory for the uncompressed data */ - result = (struct varlena *) palloc(slicelength + VARHDRSZ); + result = (varlena *) palloc(slicelength + VARHDRSZ); /* decompress the data */ rawsize = LZ4_decompress_safe_partial((const char *) value + VARHDRSZ_COMPRESSED, @@ -251,7 +251,7 @@ lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) * Returns TOAST_INVALID_COMPRESSION_ID if the varlena is not compressed. */ ToastCompressionId -toast_get_compression_id(struct varlena *attr) +toast_get_compression_id(varlena *attr) { ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; @@ -262,7 +262,7 @@ toast_get_compression_id(struct varlena *attr) */ if (VARATT_IS_EXTERNAL_ONDISK(attr)) { - struct varatt_external toast_pointer; + varatt_external toast_pointer; VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 6836786fd0..4d0da07135 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -45,7 +45,7 @@ static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); Datum toast_compress_datum(Datum value, char cmethod) { - struct varlena *tmp = NULL; + varlena *tmp = NULL; int32 valsize; ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; @@ -64,11 +64,11 @@ toast_compress_datum(Datum value, char cmethod) switch (cmethod) { case TOAST_PGLZ_COMPRESSION: - tmp = pglz_compress_datum((const struct varlena *) DatumGetPointer(value)); + tmp = pglz_compress_datum((const varlena *) DatumGetPointer(value)); cmid = TOAST_PGLZ_COMPRESSION_ID; break; case TOAST_LZ4_COMPRESSION: - tmp = lz4_compress_datum((const struct varlena *) DatumGetPointer(value)); + tmp = lz4_compress_datum((const varlena *) DatumGetPointer(value)); cmid = TOAST_LZ4_COMPRESSION_ID; break; default: @@ -117,14 +117,14 @@ toast_compress_datum(Datum value, char cmethod) */ Datum toast_save_datum(Relation rel, Datum value, - struct varlena *oldexternal, int options) + varlena *oldexternal, int options) { Relation toastrel; Relation *toastidxs; TupleDesc toasttupDesc; CommandId mycid = GetCurrentCommandId(true); - struct varlena *result; - struct varatt_external toast_pointer; + varlena *result; + varatt_external toast_pointer; int32 chunk_seq = 0; char *data_p; int32 data_todo; @@ -225,7 +225,7 @@ toast_save_datum(Relation rel, Datum value, toast_pointer.va_valueid = InvalidOid; if (oldexternal != NULL) { - struct varatt_external old_toast_pointer; + varatt_external old_toast_pointer; Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); /* Must copy to access aligned fields */ @@ -287,7 +287,7 @@ toast_save_datum(Relation rel, Datum value, bool t_isnull[3] = {0}; union { - alignas(int32) struct varlena hdr; + alignas(int32) varlena hdr; /* this is to make the union big enough for a chunk: */ char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; } chunk_data; @@ -359,7 +359,7 @@ toast_save_datum(Relation rel, Datum value, /* * Create the TOAST pointer value that we'll return */ - result = (struct varlena *) 
palloc(TOAST_POINTER_SIZE); + result = (varlena *) palloc(TOAST_POINTER_SIZE); SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); @@ -375,8 +375,8 @@ toast_save_datum(Relation rel, Datum value, void toast_delete_datum(Relation rel, Datum value, bool is_speculative) { - struct varlena *attr = (struct varlena *) DatumGetPointer(value); - struct varatt_external toast_pointer; + varlena *attr = (varlena *) DatumGetPointer(value); + varatt_external toast_pointer; Relation toastrel; Relation *toastidxs; ScanKeyData toastkey; diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index 036421fc66..575342a21b 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -388,7 +388,7 @@ hashtextextended(PG_FUNCTION_ARGS) Datum hashvarlena(PG_FUNCTION_ARGS) { - struct varlena *key = PG_GETARG_VARLENA_PP(0); + varlena *key = PG_GETARG_VARLENA_PP(0); Datum result; result = hash_any((unsigned char *) VARDATA_ANY(key), @@ -403,7 +403,7 @@ hashvarlena(PG_FUNCTION_ARGS) Datum hashvarlenaextended(PG_FUNCTION_ARGS) { - struct varlena *key = PG_GETARG_VARLENA_PP(0); + varlena *key = PG_GETARG_VARLENA_PP(0); Datum result; result = hash_any_extended((unsigned char *) VARDATA_ANY(key), diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 3004964ab7..ff85009930 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -4535,7 +4535,7 @@ HeapDetermineColumnsInfo(Relation relation, * Check if the old tuple's attribute is stored externally and is a * member of external_cols. */ - if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) && + if (VARATT_IS_EXTERNAL((varlena *) DatumGetPointer(value1)) && bms_is_member(attidx, external_cols)) *has_external = true; } diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c index 6ddf6c6cf9..ba541bd60c 100644 --- a/src/backend/access/heap/heaptoast.c +++ b/src/backend/access/heap/heaptoast.c @@ -371,9 +371,9 @@ toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc) */ if (!toast_isnull[i] && TupleDescCompactAttr(tupleDesc, i)->attlen == -1) { - struct varlena *new_value; + varlena *new_value; - new_value = (struct varlena *) DatumGetPointer(toast_values[i]); + new_value = (varlena *) DatumGetPointer(toast_values[i]); if (VARATT_IS_EXTERNAL(new_value)) { new_value = detoast_external_attr(new_value); @@ -485,9 +485,9 @@ toast_flatten_tuple_to_datum(HeapTupleHeader tup, has_nulls = true; else if (TupleDescCompactAttr(tupleDesc, i)->attlen == -1) { - struct varlena *new_value; + varlena *new_value; - new_value = (struct varlena *) DatumGetPointer(toast_values[i]); + new_value = (varlena *) DatumGetPointer(toast_values[i]); if (VARATT_IS_EXTERNAL(new_value) || VARATT_IS_COMPRESSED(new_value)) { @@ -586,9 +586,9 @@ toast_build_flattened_tuple(TupleDesc tupleDesc, */ if (!isnull[i] && TupleDescCompactAttr(tupleDesc, i)->attlen == -1) { - struct varlena *new_value; + varlena *new_value; - new_value = (struct varlena *) DatumGetPointer(new_values[i]); + new_value = (varlena *) DatumGetPointer(new_values[i]); if (VARATT_IS_EXTERNAL(new_value)) { new_value = detoast_external_attr(new_value); @@ -625,7 +625,7 @@ toast_build_flattened_tuple(TupleDesc tupleDesc, void heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int32 sliceoffset, int32 slicelength, - struct varlena *result) + varlena *result) { Relation *toastidxs; ScanKeyData 
toastkey[3]; diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index d8a604a0b3..0d792a60ca 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -49,8 +49,8 @@ toast_tuple_init(ToastTupleContext *ttc) for (i = 0; i < numAttrs; i++) { Form_pg_attribute att = TupleDescAttr(tupleDesc, i); - struct varlena *old_value; - struct varlena *new_value; + varlena *old_value; + varlena *new_value; ttc->ttc_attr[i].tai_colflags = 0; ttc->ttc_attr[i].tai_oldexternal = NULL; @@ -62,9 +62,9 @@ toast_tuple_init(ToastTupleContext *ttc) * For UPDATE get the old and new values of this attribute */ old_value = - (struct varlena *) DatumGetPointer(ttc->ttc_oldvalues[i]); + (varlena *) DatumGetPointer(ttc->ttc_oldvalues[i]); new_value = - (struct varlena *) DatumGetPointer(ttc->ttc_values[i]); + (varlena *) DatumGetPointer(ttc->ttc_values[i]); /* * If the old value is stored on disk, check if it has changed so @@ -102,7 +102,7 @@ toast_tuple_init(ToastTupleContext *ttc) /* * For INSERT simply get the new value */ - new_value = (struct varlena *) DatumGetPointer(ttc->ttc_values[i]); + new_value = (varlena *) DatumGetPointer(ttc->ttc_values[i]); } /* diff --git a/src/backend/executor/tstoreReceiver.c b/src/backend/executor/tstoreReceiver.c index 2ce96b6940..8531d4ca43 100644 --- a/src/backend/executor/tstoreReceiver.c +++ b/src/backend/executor/tstoreReceiver.c @@ -161,7 +161,7 @@ tstoreReceiveSlot_detoast(TupleTableSlot *slot, DestReceiver *self) { if (VARATT_IS_EXTERNAL(DatumGetPointer(val))) { - val = PointerGetDatum(detoast_external_attr((struct varlena *) + val = PointerGetDatum(detoast_external_attr((varlena *) DatumGetPointer(val))); myState->tofree[nfree++] = val; } diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 2d2a6d5e9e..94b2b29945 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -182,8 +182,8 @@ typedef struct ReorderBufferToastEnt Size num_chunks; /* number of chunks we've already seen */ Size size; /* combined size of chunks seen */ dlist_head chunks; /* linked list of chunks */ - struct varlena *reconstructed; /* reconstructed varlena now pointed to in - * main tup */ + varlena *reconstructed; /* reconstructed varlena now pointed to in + * main tup */ } ReorderBufferToastEnt; /* Disk serialization support datastructures */ @@ -5133,13 +5133,13 @@ ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, { CompactAttribute *attr = TupleDescCompactAttr(desc, natt); ReorderBufferToastEnt *ent; - struct varlena *varlena; + varlena *varlena_pointer; /* va_rawsize is the size of the original datum -- including header */ - struct varatt_external toast_pointer; - struct varatt_indirect redirect_pointer; - struct varlena *new_datum = NULL; - struct varlena *reconstructed; + varatt_external toast_pointer; + varatt_indirect redirect_pointer; + varlena *new_datum = NULL; + varlena *reconstructed; dlist_iter it; Size data_done = 0; @@ -5155,13 +5155,13 @@ ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, continue; /* ok, we know we have a toast datum */ - varlena = (struct varlena *) DatumGetPointer(attrs[natt]); + varlena_pointer = (varlena *) DatumGetPointer(attrs[natt]); /* no need to do anything if the tuple isn't external */ - if (!VARATT_IS_EXTERNAL(varlena)) + if (!VARATT_IS_EXTERNAL(varlena_pointer)) continue; - 
VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena); + VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena_pointer); /* * Check whether the toast tuple changed, replace if so. @@ -5175,7 +5175,7 @@ ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, continue; new_datum = - (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + (varlena *) palloc0(INDIRECT_POINTER_SIZE); free[natt] = true; diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c index 466c1a856c..a3cce496c2 100644 --- a/src/backend/storage/large_object/inv_api.c +++ b/src/backend/storage/large_object/inv_api.c @@ -142,7 +142,7 @@ getdatafield(Form_pg_largeobject tuple, if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) - detoast_attr((struct varlena *) datafield); + detoast_attr((varlena *) datafield); freeit = true; } len = VARSIZE(datafield) - VARHDRSZ; diff --git a/src/backend/utils/adt/datum.c b/src/backend/utils/adt/datum.c index cc26bd67a5..8832785540 100644 --- a/src/backend/utils/adt/datum.c +++ b/src/backend/utils/adt/datum.c @@ -26,7 +26,7 @@ * The number of significant bytes are always equal to the typlen. * * C) if a type is not "byVal" and has typlen == -1, - * then the "Datum" always points to a "struct varlena". + * then the "Datum" always points to a "varlena". * This varlena structure has information about the actual length of this * particular instance of the type and about its value. * @@ -82,7 +82,7 @@ datumGetSize(Datum value, bool typByVal, int typLen) else if (typLen == -1) { /* It is a varlena datatype */ - struct varlena *s = (struct varlena *) DatumGetPointer(value); + varlena *s = (varlena *) DatumGetPointer(value); if (!s) ereport(ERROR, @@ -138,7 +138,7 @@ datumCopy(Datum value, bool typByVal, int typLen) else if (typLen == -1) { /* It is a varlena datatype */ - struct varlena *vl = (struct varlena *) DatumGetPointer(value); + varlena *vl = (varlena *) DatumGetPointer(value); if (VARATT_IS_EXTERNAL_EXPANDED(vl)) { @@ -288,8 +288,8 @@ datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen) result = false; else { - struct varlena *arg1val; - struct varlena *arg2val; + varlena *arg1val; + varlena *arg2val; arg1val = PG_DETOAST_DATUM_PACKED(value1); arg2val = PG_DETOAST_DATUM_PACKED(value2); @@ -346,7 +346,7 @@ datum_image_hash(Datum value, bool typByVal, int typLen) result = hash_bytes((unsigned char *) DatumGetPointer(value), typLen); else if (typLen == -1) { - struct varlena *val; + varlena *val; len = toast_raw_datum_size(value); diff --git a/src/backend/utils/adt/expandedrecord.c b/src/backend/utils/adt/expandedrecord.c index d21ef9d8c0..123792aa72 100644 --- a/src/backend/utils/adt/expandedrecord.c +++ b/src/backend/utils/adt/expandedrecord.c @@ -1159,7 +1159,7 @@ expanded_record_set_field_internal(ExpandedRecordHeader *erh, int fnumber, { /* Detoasting should be done in short-lived context. 
*/ oldcxt = MemoryContextSwitchTo(get_short_term_cxt(erh)); - newValue = PointerGetDatum(detoast_external_attr((struct varlena *) DatumGetPointer(newValue))); + newValue = PointerGetDatum(detoast_external_attr((varlena *) DatumGetPointer(newValue))); MemoryContextSwitchTo(oldcxt); } else @@ -1305,7 +1305,7 @@ expanded_record_set_fields(ExpandedRecordHeader *erh, if (expand_external) { /* Detoast as requested while copying the value */ - newValue = PointerGetDatum(detoast_external_attr((struct varlena *) DatumGetPointer(newValue))); + newValue = PointerGetDatum(detoast_external_attr((varlena *) DatumGetPointer(newValue))); } else { diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index db67e86e76..e4eb7111ee 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -1515,8 +1515,8 @@ record_image_cmp(FunctionCallInfo fcinfo) { Size len1, len2; - struct varlena *arg1val; - struct varlena *arg2val; + varlena *arg1val; + varlena *arg2val; len1 = toast_raw_datum_size(values1[i1]); len2 = toast_raw_datum_size(values2[i2]); diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 6bb14620a6..dbecd7160d 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -42,7 +42,7 @@ #include "utils/sortsupport.h" #include "utils/varlena.h" -typedef struct varlena VarString; +typedef varlena VarString; /* * State for text_position_* functions. @@ -4179,7 +4179,7 @@ pg_column_compression(PG_FUNCTION_ARGS) PG_RETURN_NULL(); /* get the compression method id stored in the compressed varlena */ - cmid = toast_get_compression_id((struct varlena *) + cmid = toast_get_compression_id((varlena *) DatumGetPointer(PG_GETARG_DATUM(0))); if (cmid == TOAST_INVALID_COMPRESSION_ID) PG_RETURN_NULL(); @@ -4208,8 +4208,8 @@ Datum pg_column_toast_chunk_id(PG_FUNCTION_ARGS) { int typlen; - struct varlena *attr; - struct varatt_external toast_pointer; + varlena *attr; + varatt_external toast_pointer; /* On first call, get the input type's typlen, and save at *fn_extra */ if (fcinfo->flinfo->fn_extra == NULL) @@ -4231,7 +4231,7 @@ pg_column_toast_chunk_id(PG_FUNCTION_ARGS) if (typlen != -1) PG_RETURN_NULL(); - attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0)); + attr = (varlena *) DatumGetPointer(PG_GETARG_DATUM(0)); if (!VARATT_IS_EXTERNAL_ONDISK(attr)) PG_RETURN_NULL(); diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index 05984e7ef2..4e26df7c63 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -1793,8 +1793,8 @@ OidSendFunctionCall(Oid functionId, Datum val) *------------------------------------------------------------------------- */ -struct varlena * -pg_detoast_datum(struct varlena *datum) +varlena * +pg_detoast_datum(varlena *datum) { if (VARATT_IS_EXTENDED(datum)) return detoast_attr(datum); @@ -1802,8 +1802,8 @@ pg_detoast_datum(struct varlena *datum) return datum; } -struct varlena * -pg_detoast_datum_copy(struct varlena *datum) +varlena * +pg_detoast_datum_copy(varlena *datum) { if (VARATT_IS_EXTENDED(datum)) return detoast_attr(datum); @@ -1811,22 +1811,22 @@ pg_detoast_datum_copy(struct varlena *datum) { /* Make a modifiable copy of the varlena object */ Size len = VARSIZE(datum); - struct varlena *result = (struct varlena *) palloc(len); + varlena *result = (varlena *) palloc(len); memcpy(result, datum, len); return result; } } -struct varlena * -pg_detoast_datum_slice(struct varlena *datum, int32 first, int32 count) +varlena * 
+pg_detoast_datum_slice(varlena *datum, int32 first, int32 count) { /* Only get the specified portion from the toast rel */ return detoast_attr_slice(datum, first, count); } -struct varlena * -pg_detoast_datum_packed(struct varlena *datum) +varlena * +pg_detoast_datum_packed(varlena *datum) { if (VARATT_IS_COMPRESSED(datum) || VARATT_IS_EXTERNAL(datum)) return detoast_attr(datum); diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h index 6db3a29191..fbd98181a3 100644 --- a/src/include/access/detoast.h +++ b/src/include/access/detoast.h @@ -14,7 +14,7 @@ /* * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum - * into a local "struct varatt_external" toast pointer. This should be + * into a local "varatt_external" toast pointer. This should be * just a memcpy, but some versions of gcc seem to produce broken code * that assumes the datum contents are aligned. Introducing an explicit * intermediate "varattrib_1b_e *" variable seems to fix it. @@ -41,7 +41,7 @@ do { \ * in compressed format. * ---------- */ -extern struct varlena *detoast_external_attr(struct varlena *attr); +extern varlena *detoast_external_attr(varlena *attr); /* ---------- * detoast_attr() - @@ -50,7 +50,7 @@ extern struct varlena *detoast_external_attr(struct varlena *attr); * it as needed. * ---------- */ -extern struct varlena *detoast_attr(struct varlena *attr); +extern varlena *detoast_attr(varlena *attr); /* ---------- * detoast_attr_slice() - @@ -59,9 +59,9 @@ extern struct varlena *detoast_attr(struct varlena *attr); * (Handles all cases for attribute storage) * ---------- */ -extern struct varlena *detoast_attr_slice(struct varlena *attr, - int32 sliceoffset, - int32 slicelength); +extern varlena *detoast_attr_slice(varlena *attr, + int32 sliceoffset, + int32 slicelength); /* ---------- * toast_raw_datum_size - diff --git a/src/include/access/heaptoast.h b/src/include/access/heaptoast.h index 21baa0834b..725c0ce755 100644 --- a/src/include/access/heaptoast.h +++ b/src/include/access/heaptoast.h @@ -144,6 +144,6 @@ extern HeapTuple toast_build_flattened_tuple(TupleDesc tupleDesc, */ extern void heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int32 sliceoffset, - int32 slicelength, struct varlena *result); + int32 slicelength, varlena *result); #endif /* HEAPTOAST_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 7260b7b3d5..251379016b 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -750,7 +750,7 @@ typedef struct TableAmRoutine int32 attrsize, int32 sliceoffset, int32 slicelength, - struct varlena *result); + varlena *result); /* ------------------------------------------------------------------------ @@ -1906,7 +1906,7 @@ table_relation_toast_am(Relation rel) static inline void table_relation_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int32 sliceoffset, - int32 slicelength, struct varlena *result) + int32 slicelength, varlena *result) { toastrel->rd_tableam->relation_fetch_toast_slice(toastrel, valueid, attrsize, diff --git a/src/include/access/toast_compression.h b/src/include/access/toast_compression.h index 4b42f7a047..5f3ffa9ab2 100644 --- a/src/include/access/toast_compression.h +++ b/src/include/access/toast_compression.h @@ -54,19 +54,19 @@ typedef enum ToastCompressionId /* pglz compression/decompression routines */ -extern struct varlena *pglz_compress_datum(const struct varlena *value); -extern struct varlena *pglz_decompress_datum(const struct varlena 
*value); -extern struct varlena *pglz_decompress_datum_slice(const struct varlena *value, - int32 slicelength); +extern varlena *pglz_compress_datum(const varlena *value); +extern varlena *pglz_decompress_datum(const varlena *value); +extern varlena *pglz_decompress_datum_slice(const varlena *value, + int32 slicelength); /* lz4 compression/decompression routines */ -extern struct varlena *lz4_compress_datum(const struct varlena *value); -extern struct varlena *lz4_decompress_datum(const struct varlena *value); -extern struct varlena *lz4_decompress_datum_slice(const struct varlena *value, - int32 slicelength); +extern varlena *lz4_compress_datum(const varlena *value); +extern varlena *lz4_decompress_datum(const varlena *value); +extern varlena *lz4_decompress_datum_slice(const varlena *value, + int32 slicelength); /* other stuff */ -extern ToastCompressionId toast_get_compression_id(struct varlena *attr); +extern ToastCompressionId toast_get_compression_id(varlena *attr); extern char CompressionNameToMethod(const char *compression); extern const char *GetCompressionMethodName(char method); diff --git a/src/include/access/toast_helper.h b/src/include/access/toast_helper.h index 9bd6bfaffe..e8ecb995cb 100644 --- a/src/include/access/toast_helper.h +++ b/src/include/access/toast_helper.h @@ -29,7 +29,7 @@ */ typedef struct { - struct varlena *tai_oldexternal; + varlena *tai_oldexternal; int32 tai_size; uint8 tai_colflags; char tai_compression; diff --git a/src/include/access/toast_internals.h b/src/include/access/toast_internals.h index 75690e0bc8..d382db3426 100644 --- a/src/include/access/toast_internals.h +++ b/src/include/access/toast_internals.h @@ -50,7 +50,7 @@ extern Oid toast_get_valid_index(Oid toastoid, LOCKMODE lock); extern void toast_delete_datum(Relation rel, Datum value, bool is_speculative); extern Datum toast_save_datum(Relation rel, Datum value, - struct varlena *oldexternal, int options); + varlena *oldexternal, int options); extern int toast_open_indexes(Relation toastrel, LOCKMODE lock, diff --git a/src/include/c.h b/src/include/c.h index 063eac9808..3fc09ec1e4 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -689,7 +689,7 @@ typedef uint64 Oid8; #define OID8_MAX UINT64_MAX /* ---------------- - * Variable-length datatypes all share the 'struct varlena' header. + * Variable-length datatypes all share the 'varlena' header. * * NOTE: for TOASTable types, this is an oversimplification, since the value * may be compressed or moved out-of-line. However datatype-specific routines @@ -702,11 +702,11 @@ typedef uint64 Oid8; * See varatt.h for details of the TOASTed form. * ---------------- */ -struct varlena +typedef struct varlena { char vl_len_[4]; /* Do not touch this field directly! */ char vl_dat[FLEXIBLE_ARRAY_MEMBER]; /* Data content is here */ -}; +} varlena; #define VARHDRSZ ((int32) sizeof(int32)) @@ -715,10 +715,10 @@ struct varlena * There is no terminating null or anything like that --- the data length is * always VARSIZE_ANY_EXHDR(ptr). */ -typedef struct varlena bytea; -typedef struct varlena text; -typedef struct varlena BpChar; /* blank-padded char, ie SQL char(n) */ -typedef struct varlena VarChar; /* var-length char, ie SQL varchar(n) */ +typedef varlena bytea; +typedef varlena text; +typedef varlena BpChar; /* blank-padded char, ie SQL char(n) */ +typedef varlena VarChar; /* var-length char, ie SQL varchar(n) */ /* * Specialized array types. 
These are physically laid out just the same diff --git a/src/include/fmgr.h b/src/include/fmgr.h index eabbc78b28..10d02bdb79 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -231,22 +231,22 @@ extern void fmgr_symbol(Oid functionId, char **mod, char **fn); * Note: it'd be nice if these could be macros, but I see no way to do that * without evaluating the arguments multiple times, which is NOT acceptable. */ -extern struct varlena *pg_detoast_datum(struct varlena *datum); -extern struct varlena *pg_detoast_datum_copy(struct varlena *datum); -extern struct varlena *pg_detoast_datum_slice(struct varlena *datum, - int32 first, int32 count); -extern struct varlena *pg_detoast_datum_packed(struct varlena *datum); +extern varlena *pg_detoast_datum(varlena *datum); +extern varlena *pg_detoast_datum_copy(varlena *datum); +extern varlena *pg_detoast_datum_slice(varlena *datum, + int32 first, int32 count); +extern varlena *pg_detoast_datum_packed(varlena *datum); #define PG_DETOAST_DATUM(datum) \ - pg_detoast_datum((struct varlena *) DatumGetPointer(datum)) + pg_detoast_datum((varlena *) DatumGetPointer(datum)) #define PG_DETOAST_DATUM_COPY(datum) \ - pg_detoast_datum_copy((struct varlena *) DatumGetPointer(datum)) + pg_detoast_datum_copy((varlena *) DatumGetPointer(datum)) #define PG_DETOAST_DATUM_SLICE(datum,f,c) \ - pg_detoast_datum_slice((struct varlena *) DatumGetPointer(datum), \ + pg_detoast_datum_slice((varlena *) DatumGetPointer(datum), \ (int32) (f), (int32) (c)) /* WARNING -- unaligned pointer */ #define PG_DETOAST_DATUM_PACKED(datum) \ - pg_detoast_datum_packed((struct varlena *) DatumGetPointer(datum)) + pg_detoast_datum_packed((varlena *) DatumGetPointer(datum)) /* * Support for cleaning up detoasted copies of inputs. This must only @@ -283,7 +283,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum); #define PG_GETARG_FLOAT8(n) DatumGetFloat8(PG_GETARG_DATUM(n)) #define PG_GETARG_INT64(n) DatumGetInt64(PG_GETARG_DATUM(n)) /* use this if you want the raw, possibly-toasted input datum: */ -#define PG_GETARG_RAW_VARLENA_P(n) ((struct varlena *) PG_GETARG_POINTER(n)) +#define PG_GETARG_RAW_VARLENA_P(n) ((varlena *) PG_GETARG_POINTER(n)) /* use this if you want the input datum de-toasted: */ #define PG_GETARG_VARLENA_P(n) PG_DETOAST_DATUM(PG_GETARG_DATUM(n)) /* and this if you can handle 1-byte-header datums: */ diff --git a/src/include/utils/varbit.h b/src/include/utils/varbit.h index 82be976f5c..20cb14d75b 100644 --- a/src/include/utils/varbit.h +++ b/src/include/utils/varbit.h @@ -20,7 +20,7 @@ #include "fmgr.h" /* - * Modeled on struct varlena from c.h, but data type is bits8. + * Modeled on varlena from c.h, but data type is bits8. * * Caution: if bit_len is not a multiple of BITS_PER_BYTE, the low-order * bits of the last byte of bit_dat[] are unused and MUST be zeroes. 
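The c.h hunk above retains the struct tag while adding the typedef, so existing code spelled "struct varlena" keeps compiling and new code can use the bare name. As a minimal sketch of a V1 extension function written against the new spelling -- the function and its name are illustrative only, not part of this series:

#include "postgres.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(varlena_payload_size);

/* Return the payload size, in bytes, of any varlena input. */
Datum
varlena_payload_size(PG_FUNCTION_ARGS)
{
	varlena    *v = PG_DETOAST_DATUM_PACKED(PG_GETARG_DATUM(0));

	/* VARSIZE_ANY_EXHDR handles both 1-byte and 4-byte headers. */
	PG_RETURN_INT32((int32) VARSIZE_ANY_EXHDR(v));
}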
diff --git a/src/include/utils/xml.h b/src/include/utils/xml.h index 03acb25544..023fdeb453 100644 --- a/src/include/utils/xml.h +++ b/src/include/utils/xml.h @@ -20,7 +20,7 @@ #include "nodes/execnodes.h" #include "nodes/primnodes.h" -typedef struct varlena xmltype; +typedef varlena xmltype; typedef enum { diff --git a/src/include/varatt.h b/src/include/varatt.h index fd7d5912f7..000bdf33b9 100644 --- a/src/include/varatt.h +++ b/src/include/varatt.h @@ -16,7 +16,7 @@ #define VARATT_H /* - * struct varatt_external is a traditional "TOAST pointer", that is, the + * varatt_external is a traditional "TOAST pointer", that is, the * information needed to fetch a Datum stored out-of-line in a TOAST table. * The data is compressed if and only if the external size stored in * va_extinfo is less than va_rawsize - VARHDRSZ. @@ -36,7 +36,7 @@ typedef struct varatt_external * compression method */ Oid va_valueid; /* Unique ID of value within TOAST table */ Oid va_toastrelid; /* RelID of TOAST table containing it */ -} varatt_external; +} varatt_external; /* * These macros define the "saved size" portion of va_extinfo. Its remaining @@ -46,27 +46,27 @@ typedef struct varatt_external #define VARLENA_EXTSIZE_MASK ((1U << VARLENA_EXTSIZE_BITS) - 1) /* - * struct varatt_indirect is a "TOAST pointer" representing an out-of-line + * varatt_indirect is a "TOAST pointer" representing an out-of-line * Datum that's stored in memory, not in an external toast relation. * The creator of such a Datum is entirely responsible that the referenced * storage survives for as long as referencing pointer Datums can exist. * - * Note that just as for struct varatt_external, this struct is stored + * Note that just as for varatt_external, this struct is stored * unaligned within any containing tuple. */ typedef struct varatt_indirect { - struct varlena *pointer; /* Pointer to in-memory varlena */ -} varatt_indirect; + varlena *pointer; /* Pointer to in-memory varlena */ +} varatt_indirect; /* - * struct varatt_expanded is a "TOAST pointer" representing an out-of-line + * varatt_expanded is a "TOAST pointer" representing an out-of-line * Datum that is stored in memory, in some type-specific, not necessarily * physically contiguous format that is convenient for computation not * storage. APIs for this, in particular the definition of struct * ExpandedObjectHeader, are in src/include/utils/expandeddatum.h. * - * Note that just as for struct varatt_external, this struct is stored + * Note that just as for varatt_external, this struct is stored * unaligned within any containing tuple. 
*/ typedef struct ExpandedObjectHeader ExpandedObjectHeader; @@ -502,15 +502,15 @@ VARDATA_COMPRESSED_GET_COMPRESS_METHOD(const void *PTR) return ((const varattrib_4b *) PTR)->va_compressed.va_tcinfo >> VARLENA_EXTSIZE_BITS; } -/* Same for external Datums; but note argument is a struct varatt_external */ +/* Same for external Datums; but note argument is a varatt_external */ static inline Size -VARATT_EXTERNAL_GET_EXTSIZE(struct varatt_external toast_pointer) +VARATT_EXTERNAL_GET_EXTSIZE(varatt_external toast_pointer) { return toast_pointer.va_extinfo & VARLENA_EXTSIZE_MASK; } static inline uint32 -VARATT_EXTERNAL_GET_COMPRESS_METHOD(struct varatt_external toast_pointer) +VARATT_EXTERNAL_GET_COMPRESS_METHOD(varatt_external toast_pointer) { return toast_pointer.va_extinfo >> VARLENA_EXTSIZE_BITS; } @@ -533,7 +533,7 @@ VARATT_EXTERNAL_GET_COMPRESS_METHOD(struct varatt_external toast_pointer) * actually saves space, so we expect either equality or less-than. */ static inline bool -VARATT_EXTERNAL_IS_COMPRESSED(struct varatt_external toast_pointer) +VARATT_EXTERNAL_IS_COMPRESSED(varatt_external toast_pointer) { return VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer) < (Size) (toast_pointer.va_rawsize - VARHDRSZ); diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 75325117ec..f80264e184 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -8818,7 +8818,7 @@ assign_simple_var(PLpgSQL_execstate *estate, PLpgSQL_var *var, * pain, but there's little choice. */ oldcxt = MemoryContextSwitchTo(get_eval_mcontext(estate)); - detoasted = PointerGetDatum(detoast_external_attr((struct varlena *) DatumGetPointer(newvalue))); + detoasted = PointerGetDatum(detoast_external_attr((varlena *) DatumGetPointer(newvalue))); MemoryContextSwitchTo(oldcxt); /* Now's a good time to not leak the input value if it's freeable */ if (freeable) diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index bea858f03c..96cf30ac92 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -376,9 +376,9 @@ make_tuple_indirect(PG_FUNCTION_ARGS) for (i = 0; i < ncolumns; i++) { - struct varlena *attr; - struct varlena *new_attr; - struct varatt_indirect redirect_pointer; + varlena *attr; + varlena *new_attr; + varatt_indirect redirect_pointer; /* only work on existing, not-null varlenas */ if (TupleDescAttr(tupdesc, i)->attisdropped || @@ -387,7 +387,7 @@ make_tuple_indirect(PG_FUNCTION_ARGS) TupleDescAttr(tupdesc, i)->attstorage == TYPSTORAGE_PLAIN) continue; - attr = (struct varlena *) DatumGetPointer(values[i]); + attr = (varlena *) DatumGetPointer(values[i]); /* don't recursively indirect */ if (VARATT_IS_EXTERNAL_INDIRECT(attr)) @@ -398,14 +398,14 @@ make_tuple_indirect(PG_FUNCTION_ARGS) attr = detoast_external_attr(attr); else { - struct varlena *oldattr = attr; + varlena *oldattr = attr; attr = palloc0(VARSIZE_ANY(oldattr)); memcpy(attr, oldattr, VARSIZE_ANY(oldattr)); } /* build indirection Datum */ - new_attr = (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + new_attr = (varlena *) palloc0(INDIRECT_POINTER_SIZE); redirect_pointer.pointer = attr; SET_VARTAG_EXTERNAL(new_attr, VARTAG_INDIRECT); memcpy(VARDATA_EXTERNAL(new_attr), &redirect_pointer, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 523977721e..a942d030d2 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4270,9 +4270,12 @@ va_list vacuumingOptions validate_string_relopt varatt_expanded +varatt_external 
+varatt_indirect varattrib_1b varattrib_1b_e varattrib_4b +varlena vartag_external vbits verifier_context From 7358abcc6076f4b2530d10126ab379f8aea612a5 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 10 Feb 2026 17:55:59 -0500 Subject: [PATCH 081/147] Store information about Append node consolidation in the final plan. An extension (or core code) might want to reconstruct the planner's decisions about whether and where to perform partitionwise joins from the final plan. To do so, it must be possible to find all of the RTIs of partitioned tables appearing in the plan. But when an AppendPath or MergeAppendPath pulls up child paths from a subordinate AppendPath or MergeAppendPath, the RTIs of the subordinate path do not appear in the final plan, making this kind of reconstruction impossible. To avoid this, propagate the RTI sets that would have been present in the 'apprelids' field of the subordinate Append or MergeAppend nodes that would have been created into the surviving Append or MergeAppend node, using a new 'child_append_relid_sets' field for that purpose. The value of this field is a list of Bitmapsets, because each relation whose append-list was pulled up had its own set of RTIs: just one, if it was a partitionwise scan, or more than one, if it was a partitionwise join. Since our goal is to see where partitionwise joins were done, it is essential to avoid losing the information about how the RTIs were grouped in the pulled-up relations. This commit also updates pg_overexplain so that EXPLAIN (RANGE_TABLE) will display the saved RTI sets. Co-authored-by: Robert Haas Co-authored-by: Lukas Fittl Reviewed-by: Lukas Fittl Reviewed-by: Jakub Wartak Reviewed-by: Greg Burd Reviewed-by: Jacob Champion Reviewed-by: Amit Langote Reviewed-by: Haibo Yan Reviewed-by: Alexandra Wang Discussion: http://postgr.es/m/CA+TgmoZ-Jh1T6QyWoCODMVQdhTUPYkaZjWztzP1En4=ZHoKPzw@mail.gmail.com --- .../expected/pg_overexplain.out | 10 +- contrib/pg_overexplain/pg_overexplain.c | 56 +++++ src/backend/optimizer/path/allpaths.c | 199 +++++++++++------- src/backend/optimizer/path/joinrels.c | 3 +- src/backend/optimizer/plan/createplan.c | 2 + src/backend/optimizer/plan/planner.c | 7 +- src/backend/optimizer/prep/prepunion.c | 43 ++-- src/backend/optimizer/util/pathnode.c | 26 ++- src/include/nodes/pathnodes.h | 10 + src/include/nodes/plannodes.h | 11 + src/include/optimizer/pathnode.h | 18 +- src/tools/pgindent/typedefs.list | 1 + 12 files changed, 274 insertions(+), 112 deletions(-) diff --git a/contrib/pg_overexplain/expected/pg_overexplain.out b/contrib/pg_overexplain/expected/pg_overexplain.out index 198bbe10d7..f376d2e799 100644 --- a/contrib/pg_overexplain/expected/pg_overexplain.out +++ b/contrib/pg_overexplain/expected/pg_overexplain.out @@ -104,6 +104,7 @@ $$); Parallel Safe: true Plan Node ID: 2 Append RTIs: 1 + Child Append RTIs: none -> Seq Scan on brassica vegetables_1 Disabled Nodes: 0 Parallel Safe: true @@ -142,7 +143,7 @@ $$); Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 3 4 -(53 rows) +(54 rows) -- Test a different output format. 
SELECT explain_filter($$ @@ -197,6 +198,7 @@ $$); none + none + 1 + + none + 0 + + + @@ -507,6 +509,7 @@ SELECT * FROM vegetables v, Elided Node RTIs: 2 -> Append Append RTIs: 1 + Child Append RTIs: none -> Seq Scan on brassica v_1 Scan RTI: 3 -> Seq Scan on daucus v_2 @@ -548,7 +551,7 @@ SELECT * FROM vegetables v, Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 3 4 5 6 -(51 rows) +(52 rows) -- should show "Subplan: unnamed_subquery" EXPLAIN (RANGE_TABLE, COSTS OFF) @@ -566,6 +569,7 @@ SELECT * FROM vegetables v, Elided Node RTIs: 2 -> Append Append RTIs: 1 + Child Append RTIs: none -> Seq Scan on brassica v_1 Scan RTI: 3 -> Seq Scan on daucus v_2 @@ -606,5 +610,5 @@ SELECT * FROM vegetables v, Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 3 4 5 6 -(50 rows) +(51 rows) diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c index e0184ba314..36e6aac0e2 100644 --- a/contrib/pg_overexplain/pg_overexplain.c +++ b/contrib/pg_overexplain/pg_overexplain.c @@ -54,6 +54,8 @@ static void overexplain_alias(const char *qlabel, Alias *alias, ExplainState *es); static void overexplain_bitmapset(const char *qlabel, Bitmapset *bms, ExplainState *es); +static void overexplain_bitmapset_list(const char *qlabel, List *bms_list, + ExplainState *es); static void overexplain_intlist(const char *qlabel, List *list, ExplainState *es); @@ -232,11 +234,17 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, overexplain_bitmapset("Append RTIs", ((Append *) plan)->apprelids, es); + overexplain_bitmapset_list("Child Append RTIs", + ((Append *) plan)->child_append_relid_sets, + es); break; case T_MergeAppend: overexplain_bitmapset("Append RTIs", ((MergeAppend *) plan)->apprelids, es); + overexplain_bitmapset_list("Child Append RTIs", + ((MergeAppend *) plan)->child_append_relid_sets, + es); break; case T_Result: @@ -815,6 +823,54 @@ overexplain_bitmapset(const char *qlabel, Bitmapset *bms, ExplainState *es) pfree(buf.data); } +/* + * Emit a text property describing the contents of a list of bitmapsets. + * If a bitmapset contains exactly 1 member, we just print an integer; + * otherwise, we surround the list of members by parentheses. + * + * If there are no bitmapsets in the list, we print the word "none". 
+ */ +static void +overexplain_bitmapset_list(const char *qlabel, List *bms_list, + ExplainState *es) +{ + StringInfoData buf; + + initStringInfo(&buf); + + foreach_node(Bitmapset, bms, bms_list) + { + if (bms_membership(bms) == BMS_SINGLETON) + appendStringInfo(&buf, " %d", bms_singleton_member(bms)); + else + { + int x = -1; + bool first = true; + + appendStringInfoString(&buf, " ("); + while ((x = bms_next_member(bms, x)) >= 0) + { + if (first) + first = false; + else + appendStringInfoChar(&buf, ' '); + appendStringInfo(&buf, "%d", x); + } + appendStringInfoChar(&buf, ')'); + } + } + + if (buf.len == 0) + { + ExplainPropertyText(qlabel, "none", es); + return; + } + + Assert(buf.data[0] == ' '); + ExplainPropertyText(qlabel, buf.data + 1, es); + pfree(buf.data); +} + /* * Emit a text property describing the contents of a list of integers, OIDs, * or XIDs -- either a space-separated list of integer members, or the word diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index b4581e54d9..90275e2587 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -128,8 +128,10 @@ static Path *get_cheapest_parameterized_child_path(PlannerInfo *root, Relids required_outer); static void accumulate_append_subpath(Path *path, List **subpaths, - List **special_subpaths); -static Path *get_singleton_append_subpath(Path *path); + List **special_subpaths, + List **child_append_relid_sets); +static Path *get_singleton_append_subpath(Path *path, + List **child_append_relid_sets); static void set_dummy_rel_pathlist(RelOptInfo *rel); static void set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); @@ -1404,22 +1406,21 @@ void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *live_childrels) { - List *subpaths = NIL; - bool subpaths_valid = true; - List *startup_subpaths = NIL; - bool startup_subpaths_valid = true; - List *partial_subpaths = NIL; - List *pa_partial_subpaths = NIL; - List *pa_nonpartial_subpaths = NIL; - bool partial_subpaths_valid = true; - bool pa_subpaths_valid; + AppendPathInput unparameterized = {0}; + AppendPathInput startup = {0}; + AppendPathInput partial_only = {0}; + AppendPathInput parallel_append = {0}; + bool unparameterized_valid = true; + bool startup_valid = true; + bool partial_only_valid = true; + bool parallel_append_valid = true; List *all_child_pathkeys = NIL; List *all_child_outers = NIL; ListCell *l; double partial_rows = -1; /* If appropriate, consider parallel append */ - pa_subpaths_valid = enable_parallel_append && rel->consider_parallel; + parallel_append_valid = enable_parallel_append && rel->consider_parallel; /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -1443,9 +1444,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (childrel->pathlist != NIL && childrel->cheapest_total_path->param_info == NULL) accumulate_append_subpath(childrel->cheapest_total_path, - &subpaths, NULL); + &unparameterized.subpaths, NULL, &unparameterized.child_append_relid_sets); else - subpaths_valid = false; + unparameterized_valid = false; /* * When the planner is considering cheap startup plans, we'll also @@ -1471,11 +1472,12 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, /* cheapest_startup_path must not be a parameterized path. 
*/ Assert(cheapest_path->param_info == NULL); accumulate_append_subpath(cheapest_path, - &startup_subpaths, - NULL); + &startup.subpaths, + NULL, + &startup.child_append_relid_sets); } else - startup_subpaths_valid = false; + startup_valid = false; /* Same idea, but for a partial plan. */ @@ -1483,16 +1485,17 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { cheapest_partial_path = linitial(childrel->partial_pathlist); accumulate_append_subpath(cheapest_partial_path, - &partial_subpaths, NULL); + &partial_only.partial_subpaths, NULL, + &partial_only.child_append_relid_sets); } else - partial_subpaths_valid = false; + partial_only_valid = false; /* * Same idea, but for a parallel append mixing partial and non-partial * paths. */ - if (pa_subpaths_valid) + if (parallel_append_valid) { Path *nppath = NULL; @@ -1502,7 +1505,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (cheapest_partial_path == NULL && nppath == NULL) { /* Neither a partial nor a parallel-safe path? Forget it. */ - pa_subpaths_valid = false; + parallel_append_valid = false; } else if (nppath == NULL || (cheapest_partial_path != NULL && @@ -1511,8 +1514,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, /* Partial path is cheaper or the only option. */ Assert(cheapest_partial_path != NULL); accumulate_append_subpath(cheapest_partial_path, - &pa_partial_subpaths, - &pa_nonpartial_subpaths); + ¶llel_append.partial_subpaths, + ¶llel_append.subpaths, + ¶llel_append.child_append_relid_sets); } else { @@ -1530,8 +1534,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * figure that out. */ accumulate_append_subpath(nppath, - &pa_nonpartial_subpaths, - NULL); + ¶llel_append.subpaths, + NULL, + ¶llel_append.child_append_relid_sets); } } @@ -1605,28 +1610,28 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * unparameterized Append path for the rel. (Note: this is correct even * if we have zero or one live subpath due to constraint exclusion.) */ - if (subpaths_valid) - add_path(rel, (Path *) create_append_path(root, rel, subpaths, NIL, + if (unparameterized_valid) + add_path(rel, (Path *) create_append_path(root, rel, unparameterized, NIL, NULL, 0, false, -1)); /* build an AppendPath for the cheap startup paths, if valid */ - if (startup_subpaths_valid) - add_path(rel, (Path *) create_append_path(root, rel, startup_subpaths, - NIL, NIL, NULL, 0, false, -1)); + if (startup_valid) + add_path(rel, (Path *) create_append_path(root, rel, startup, + NIL, NULL, 0, false, -1)); /* * Consider an append of unordered, unparameterized partial paths. Make * it parallel-aware if possible. */ - if (partial_subpaths_valid && partial_subpaths != NIL) + if (partial_only_valid && partial_only.partial_subpaths != NIL) { AppendPath *appendpath; ListCell *lc; int parallel_workers = 0; /* Find the highest number of workers requested for any subpath. */ - foreach(lc, partial_subpaths) + foreach(lc, partial_only.partial_subpaths) { Path *path = lfirst(lc); @@ -1653,7 +1658,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, Assert(parallel_workers > 0); /* Generate a partial append path. 
*/ - appendpath = create_append_path(root, rel, NIL, partial_subpaths, + appendpath = create_append_path(root, rel, partial_only, NIL, NULL, parallel_workers, enable_parallel_append, -1); @@ -1674,7 +1679,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * a non-partial path that is substantially cheaper than any partial path; * otherwise, we should use the append path added in the previous step.) */ - if (pa_subpaths_valid && pa_nonpartial_subpaths != NIL) + if (parallel_append_valid && parallel_append.subpaths != NIL) { AppendPath *appendpath; ListCell *lc; @@ -1684,7 +1689,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * Find the highest number of workers requested for any partial * subpath. */ - foreach(lc, pa_partial_subpaths) + foreach(lc, parallel_append.partial_subpaths) { Path *path = lfirst(lc); @@ -1702,8 +1707,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, max_parallel_workers_per_gather); Assert(parallel_workers > 0); - appendpath = create_append_path(root, rel, pa_nonpartial_subpaths, - pa_partial_subpaths, + appendpath = create_append_path(root, rel, parallel_append, NIL, NULL, parallel_workers, true, partial_rows); add_partial_path(rel, (Path *) appendpath); @@ -1713,7 +1717,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * Also build unparameterized ordered append paths based on the collected * list of child pathkeys. */ - if (subpaths_valid) + if (unparameterized_valid) generate_orderedappend_paths(root, rel, live_childrels, all_child_pathkeys); @@ -1734,10 +1738,10 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { Relids required_outer = (Relids) lfirst(l); ListCell *lcr; + AppendPathInput parameterized = {0}; + bool parameterized_valid = true; /* Select the child paths for an Append with this parameterization */ - subpaths = NIL; - subpaths_valid = true; foreach(lcr, live_childrels) { RelOptInfo *childrel = (RelOptInfo *) lfirst(lcr); @@ -1746,7 +1750,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (childrel->pathlist == NIL) { /* failed to make a suitable path for this child */ - subpaths_valid = false; + parameterized_valid = false; break; } @@ -1756,15 +1760,16 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (subpath == NULL) { /* failed to make a suitable path for this child */ - subpaths_valid = false; + parameterized_valid = false; break; } - accumulate_append_subpath(subpath, &subpaths, NULL); + accumulate_append_subpath(subpath, ¶meterized.subpaths, NULL, + ¶meterized.child_append_relid_sets); } - if (subpaths_valid) + if (parameterized_valid) add_path(rel, (Path *) - create_append_path(root, rel, subpaths, NIL, + create_append_path(root, rel, parameterized, NIL, required_outer, 0, false, -1)); } @@ -1785,13 +1790,14 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { Path *path = (Path *) lfirst(l); AppendPath *appendpath; + AppendPathInput append = {0}; /* skip paths with no pathkeys. 
*/ if (path->pathkeys == NIL) continue; - appendpath = create_append_path(root, rel, NIL, list_make1(path), - NIL, NULL, + append.partial_subpaths = list_make1(path); + appendpath = create_append_path(root, rel, append, NIL, NULL, path->parallel_workers, true, partial_rows); add_partial_path(rel, (Path *) appendpath); @@ -1873,9 +1879,9 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, foreach(lcp, all_child_pathkeys) { List *pathkeys = (List *) lfirst(lcp); - List *startup_subpaths = NIL; - List *total_subpaths = NIL; - List *fractional_subpaths = NIL; + AppendPathInput startup = {0}; + AppendPathInput total = {0}; + AppendPathInput fractional = {0}; bool startup_neq_total = false; bool fraction_neq_total = false; bool match_partition_order; @@ -2038,16 +2044,23 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * just a single subpath (and hence aren't doing anything * useful). */ - cheapest_startup = get_singleton_append_subpath(cheapest_startup); - cheapest_total = get_singleton_append_subpath(cheapest_total); + cheapest_startup = + get_singleton_append_subpath(cheapest_startup, + &startup.child_append_relid_sets); + cheapest_total = + get_singleton_append_subpath(cheapest_total, + &total.child_append_relid_sets); - startup_subpaths = lappend(startup_subpaths, cheapest_startup); - total_subpaths = lappend(total_subpaths, cheapest_total); + startup.subpaths = lappend(startup.subpaths, cheapest_startup); + total.subpaths = lappend(total.subpaths, cheapest_total); if (cheapest_fractional) { - cheapest_fractional = get_singleton_append_subpath(cheapest_fractional); - fractional_subpaths = lappend(fractional_subpaths, cheapest_fractional); + cheapest_fractional = + get_singleton_append_subpath(cheapest_fractional, + &fractional.child_append_relid_sets); + fractional.subpaths = + lappend(fractional.subpaths, cheapest_fractional); } } else @@ -2057,13 +2070,16 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * child paths for the MergeAppend. 
*/ accumulate_append_subpath(cheapest_startup, - &startup_subpaths, NULL); + &startup.subpaths, NULL, + &startup.child_append_relid_sets); accumulate_append_subpath(cheapest_total, - &total_subpaths, NULL); + &total.subpaths, NULL, + &total.child_append_relid_sets); if (cheapest_fractional) accumulate_append_subpath(cheapest_fractional, - &fractional_subpaths, NULL); + &fractional.subpaths, NULL, + &fractional.child_append_relid_sets); } } @@ -2073,8 +2089,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, /* We only need Append */ add_path(rel, (Path *) create_append_path(root, rel, - startup_subpaths, - NIL, + startup, pathkeys, NULL, 0, @@ -2083,19 +2098,17 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, if (startup_neq_total) add_path(rel, (Path *) create_append_path(root, rel, - total_subpaths, - NIL, + total, pathkeys, NULL, 0, false, -1)); - if (fractional_subpaths && fraction_neq_total) + if (fractional.subpaths && fraction_neq_total) add_path(rel, (Path *) create_append_path(root, rel, - fractional_subpaths, - NIL, + fractional, pathkeys, NULL, 0, @@ -2107,20 +2120,23 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, /* We need MergeAppend */ add_path(rel, (Path *) create_merge_append_path(root, rel, - startup_subpaths, + startup.subpaths, + startup.child_append_relid_sets, pathkeys, NULL)); if (startup_neq_total) add_path(rel, (Path *) create_merge_append_path(root, rel, - total_subpaths, + total.subpaths, + total.child_append_relid_sets, pathkeys, NULL)); - if (fractional_subpaths && fraction_neq_total) + if (fractional.subpaths && fraction_neq_total) add_path(rel, (Path *) create_merge_append_path(root, rel, - fractional_subpaths, + fractional.subpaths, + fractional.child_append_relid_sets, pathkeys, NULL)); } @@ -2223,7 +2239,8 @@ get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel, * paths). 
*/ static void -accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) +accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths, + List **child_append_relid_sets) { if (IsA(path, AppendPath)) { @@ -2232,6 +2249,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) if (!apath->path.parallel_aware || apath->first_partial_path == 0) { *subpaths = list_concat(*subpaths, apath->subpaths); + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + apath->child_append_relid_sets); return; } else if (special_subpaths != NULL) @@ -2246,6 +2268,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) apath->first_partial_path); *special_subpaths = list_concat(*special_subpaths, new_special_subpaths); + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + apath->child_append_relid_sets); return; } } @@ -2254,6 +2281,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) MergeAppendPath *mpath = (MergeAppendPath *) path; *subpaths = list_concat(*subpaths, mpath->subpaths); + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + mpath->child_append_relid_sets); return; } @@ -2265,10 +2297,15 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) * Returns the single subpath of an Append/MergeAppend, or just * return 'path' if it's not a single sub-path Append/MergeAppend. * + * As a side effect, whenever we return a single subpath rather than the + * original path, add the relid sets for the original path to + * child_append_relid_sets, so that those relids don't entirely disappear + * from the final plan. + * * Note: 'path' must not be a parallel-aware path. 
*/ static Path * -get_singleton_append_subpath(Path *path) +get_singleton_append_subpath(Path *path, List **child_append_relid_sets) { Assert(!path->parallel_aware); @@ -2277,14 +2314,28 @@ get_singleton_append_subpath(Path *path) AppendPath *apath = (AppendPath *) path; if (list_length(apath->subpaths) == 1) + { + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + apath->child_append_relid_sets); return (Path *) linitial(apath->subpaths); + } } else if (IsA(path, MergeAppendPath)) { MergeAppendPath *mpath = (MergeAppendPath *) path; if (list_length(mpath->subpaths) == 1) + { + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + mpath->child_append_relid_sets); return (Path *) linitial(mpath->subpaths); + } } return path; @@ -2304,6 +2355,8 @@ get_singleton_append_subpath(Path *path) static void set_dummy_rel_pathlist(RelOptInfo *rel) { + AppendPathInput in = {0}; + /* Set dummy size estimates --- we leave attr_widths[] as zeroes */ rel->rows = 0; rel->reltarget->width = 0; @@ -2313,7 +2366,7 @@ set_dummy_rel_pathlist(RelOptInfo *rel) rel->partial_pathlist = NIL; /* Set up the dummy path */ - add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL, + add_path(rel, (Path *) create_append_path(NULL, rel, in, NIL, rel->lateral_relids, 0, false, -1)); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 2615651c07..443e2dca7c 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1513,6 +1513,7 @@ void mark_dummy_rel(RelOptInfo *rel) { MemoryContext oldcontext; + AppendPathInput in = {0}; /* Already marked? */ if (is_dummy_rel(rel)) @@ -1529,7 +1530,7 @@ mark_dummy_rel(RelOptInfo *rel) rel->partial_pathlist = NIL; /* Set up the dummy path */ - add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL, + add_path(rel, (Path *) create_append_path(NULL, rel, in, NIL, rel->lateral_relids, 0, false, -1)); diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a50260290f..959df43c39 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -1263,6 +1263,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) plan->plan.lefttree = NULL; plan->plan.righttree = NULL; plan->apprelids = rel->relids; + plan->child_append_relid_sets = best_path->child_append_relid_sets; if (pathkeys != NIL) { @@ -1475,6 +1476,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, plan->lefttree = NULL; plan->righttree = NULL; node->apprelids = rel->relids; + node->child_append_relid_sets = best_path->child_append_relid_sets; /* * Compute sort column info, and adjust MergeAppend's tlist as needed. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index f68142cfcb..006b328196 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4063,7 +4063,7 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, * might get between 0 and N output rows. Offhand I think that's * desired.) 
*/ - List *paths = NIL; + AppendPathInput append = {0}; while (--nrows >= 0) { @@ -4071,13 +4071,12 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, create_group_result_path(root, grouped_rel, grouped_rel->reltarget, (List *) parse->havingQual); - paths = lappend(paths, path); + append.subpaths = lappend(append.subpaths, path); } path = (Path *) create_append_path(root, grouped_rel, - paths, - NIL, + append, NIL, NULL, 0, diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 78c95c36dd..f50c296e3d 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -696,9 +696,9 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, ListCell *lc; ListCell *lc2; ListCell *lc3; - List *cheapest_pathlist = NIL; - List *ordered_pathlist = NIL; - List *partial_pathlist = NIL; + AppendPathInput cheapest = {0}; + AppendPathInput ordered = {0}; + AppendPathInput partial = {0}; bool partial_paths_valid = true; bool consider_parallel = true; List *rellist; @@ -783,7 +783,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, if (is_dummy_rel(rel)) continue; - cheapest_pathlist = lappend(cheapest_pathlist, + cheapest.subpaths = lappend(cheapest.subpaths, rel->cheapest_total_path); if (try_sorted) @@ -795,7 +795,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, false); if (ordered_path != NULL) - ordered_pathlist = lappend(ordered_pathlist, ordered_path); + ordered.subpaths = lappend(ordered.subpaths, ordered_path); else { /* @@ -818,20 +818,20 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, else if (rel->partial_pathlist == NIL) partial_paths_valid = false; else - partial_pathlist = lappend(partial_pathlist, - linitial(rel->partial_pathlist)); + partial.partial_subpaths = lappend(partial.partial_subpaths, + linitial(rel->partial_pathlist)); } } /* Build result relation. */ result_rel = fetch_upper_rel(root, UPPERREL_SETOP, relids); result_rel->reltarget = create_setop_pathtarget(root, tlist, - cheapest_pathlist); + cheapest.subpaths); result_rel->consider_parallel = consider_parallel; result_rel->consider_startup = (root->tuple_fraction > 0); /* If all UNION children were dummy rels, make the resulting rel dummy */ - if (cheapest_pathlist == NIL) + if (cheapest.subpaths == NIL) { mark_dummy_rel(result_rel); @@ -842,8 +842,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, * Append the child results together using the cheapest paths from each * union child. */ - apath = (Path *) create_append_path(root, result_rel, cheapest_pathlist, - NIL, NIL, NULL, 0, false, -1); + apath = (Path *) create_append_path(root, result_rel, cheapest, + NIL, NULL, 0, false, -1); /* * Estimate number of groups. For now we just assume the output is unique @@ -862,7 +862,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, int parallel_workers = 0; /* Find the highest number of workers requested for any subpath. 
*/ - foreach(lc, partial_pathlist) + foreach(lc, partial.partial_subpaths) { Path *subpath = lfirst(lc); @@ -881,14 +881,14 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, if (enable_parallel_append) { parallel_workers = Max(parallel_workers, - pg_leftmost_one_pos32(list_length(partial_pathlist)) + 1); + pg_leftmost_one_pos32(list_length(partial.partial_subpaths)) + 1); parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); } Assert(parallel_workers > 0); papath = (Path *) - create_append_path(root, result_rel, NIL, partial_pathlist, + create_append_path(root, result_rel, partial, NIL, NULL, parallel_workers, enable_parallel_append, -1); gpath = (Path *) @@ -901,7 +901,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, double dNumGroups; bool can_sort = grouping_is_sortable(groupList); bool can_hash = grouping_is_hashable(groupList); - Path *first_path = linitial(cheapest_pathlist); + Path *first_path = linitial(cheapest.subpaths); /* * Estimate the number of UNION output rows. In the case when only a @@ -911,7 +911,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, * contain Vars with varno==0, which estimate_num_groups() wouldn't * like. */ - if (list_length(cheapest_pathlist) == 1 && + if (list_length(cheapest.subpaths) == 1 && first_path->parent->reloptkind != RELOPT_UPPER_REL) { dNumGroups = estimate_num_groups(root, @@ -1017,7 +1017,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, path = (Path *) create_merge_append_path(root, result_rel, - ordered_pathlist, + ordered.subpaths, + NIL, union_pathkeys, NULL); @@ -1216,6 +1217,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root, if (op->all) { Path *apath; + AppendPathInput append = {0}; + + append.subpaths = list_make1(lpath); /* * EXCEPT ALL: If the right-hand input is dummy then we can @@ -1224,8 +1228,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root, * between the set op targetlist and the targetlist of the * left input. The Append will be removed in setrefs.c. */ - apath = (Path *) create_append_path(root, result_rel, list_make1(lpath), - NIL, NIL, NULL, 0, false, -1); + apath = (Path *) create_append_path(root, result_rel, + append, NIL, NULL, 0, + false, -1); add_path(result_rel, apath); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 7b6c5d51e5..9678c20ff1 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1298,7 +1298,7 @@ create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel, AppendPath * create_append_path(PlannerInfo *root, RelOptInfo *rel, - List *subpaths, List *partial_subpaths, + AppendPathInput input, List *pathkeys, Relids required_outer, int parallel_workers, bool parallel_aware, double rows) @@ -1308,6 +1308,7 @@ create_append_path(PlannerInfo *root, Assert(!parallel_aware || parallel_workers > 0); + pathnode->child_append_relid_sets = input.child_append_relid_sets; pathnode->path.pathtype = T_Append; pathnode->path.parent = rel; pathnode->path.pathtarget = rel->reltarget; @@ -1323,7 +1324,7 @@ create_append_path(PlannerInfo *root, * on the simpler get_appendrel_parampathinfo. There's no point in doing * the more expensive thing for a dummy path, either. 
*/ - if (rel->reloptkind == RELOPT_BASEREL && root && subpaths != NIL) + if (rel->reloptkind == RELOPT_BASEREL && root && input.subpaths != NIL) pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); @@ -1354,11 +1355,11 @@ create_append_path(PlannerInfo *root, */ Assert(pathkeys == NIL); - list_sort(subpaths, append_total_cost_compare); - list_sort(partial_subpaths, append_startup_cost_compare); + list_sort(input.subpaths, append_total_cost_compare); + list_sort(input.partial_subpaths, append_startup_cost_compare); } - pathnode->first_partial_path = list_length(subpaths); - pathnode->subpaths = list_concat(subpaths, partial_subpaths); + pathnode->first_partial_path = list_length(input.subpaths); + pathnode->subpaths = list_concat(input.subpaths, input.partial_subpaths); /* * Apply query-wide LIMIT if known and path is for sole base relation. @@ -1470,6 +1471,7 @@ MergeAppendPath * create_merge_append_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, + List *child_append_relid_sets, List *pathkeys, Relids required_outer) { @@ -1485,6 +1487,7 @@ create_merge_append_path(PlannerInfo *root, */ Assert(bms_is_empty(rel->lateral_relids) && bms_is_empty(required_outer)); + pathnode->child_append_relid_sets = child_append_relid_sets; pathnode->path.pathtype = T_MergeAppend; pathnode->path.parent = rel; pathnode->path.pathtarget = rel->reltarget; @@ -3932,11 +3935,12 @@ reparameterize_path(PlannerInfo *root, Path *path, case T_Append: { AppendPath *apath = (AppendPath *) path; - List *childpaths = NIL; - List *partialpaths = NIL; + AppendPathInput new_append = {0}; int i; ListCell *lc; + new_append.child_append_relid_sets = apath->child_append_relid_sets; + /* Reparameterize the children */ i = 0; foreach(lc, apath->subpaths) @@ -3950,13 +3954,13 @@ reparameterize_path(PlannerInfo *root, Path *path, return NULL; /* We have to re-split the regular and partial paths */ if (i < apath->first_partial_path) - childpaths = lappend(childpaths, spath); + new_append.subpaths = lappend(new_append.subpaths, spath); else - partialpaths = lappend(partialpaths, spath); + new_append.partial_subpaths = lappend(new_append.partial_subpaths, spath); i++; } return (Path *) - create_append_path(root, rel, childpaths, partialpaths, + create_append_path(root, rel, new_append, apath->path.pathkeys, required_outer, apath->path.parallel_workers, apath->path.parallel_aware, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 9cc5d2e741..c175ee95b6 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -2250,6 +2250,12 @@ typedef struct CustomPath * For partial Append, 'subpaths' contains non-partial subpaths followed by * partial subpaths. * + * Whenever accumulate_append_subpath() allows us to consolidate multiple + * levels of Append paths down to one, we store the RTI sets for the omitted + * paths in child_append_relid_sets. This is not necessary for planning or + * execution; we do it for the benefit of code that wants to inspect the + * final plan and understand how it came to be. + * * Note: it is possible for "subpaths" to contain only one, or even no, * elements. These cases are optimized during create_append_plan. 
* In particular, an AppendPath with no subpaths is a "dummy" path that @@ -2265,6 +2271,7 @@ typedef struct AppendPath /* Index of first partial path in subpaths; list_length(subpaths) if none */ int first_partial_path; Cardinality limit_tuples; /* hard limit on output tuples, or -1 */ + List *child_append_relid_sets; } AppendPath; #define IS_DUMMY_APPEND(p) \ @@ -2281,12 +2288,15 @@ extern bool is_dummy_rel(RelOptInfo *rel); /* * MergeAppendPath represents a MergeAppend plan, ie, the merging of sorted * results from several member plans to produce similarly-sorted output. + * + * child_append_relid_sets has the same meaning here as for AppendPath. */ typedef struct MergeAppendPath { Path path; List *subpaths; /* list of component Paths */ Cardinality limit_tuples; /* hard limit on output tuples, or -1 */ + List *child_append_relid_sets; } MergeAppendPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 0ad0ff404c..485bec5aab 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -394,9 +394,16 @@ struct PartitionPruneInfo; /* forward reference to struct below */ typedef struct Append { Plan plan; + /* RTIs of appendrel(s) formed by this node */ Bitmapset *apprelids; + + /* sets of RTIs of appendrels consolidated into this node */ + List *child_append_relid_sets; + + /* plans to run */ List *appendplans; + /* # of asynchronous plans */ int nasyncplans; @@ -426,6 +433,10 @@ typedef struct MergeAppend /* RTIs of appendrel(s) formed by this node */ Bitmapset *apprelids; + /* sets of RTIs of appendrels consolidated into this node */ + List *child_append_relid_sets; + + /* plans to run */ List *mergeplans; /* these fields are just like the sort-key info in struct Sort: */ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 224750859c..cf8a654fa5 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -17,6 +17,20 @@ #include "nodes/bitmapset.h" #include "nodes/pathnodes.h" +/* + * Everything in subpaths or partial_subpaths will become part of the + * Append node's subpaths list. Partial and non-partial subpaths can be + * mixed in the same Append node only if it is parallel-aware. + * + * See the comments for AppendPath for the meaning and purpose of the + * child_append_relid_sets field. 
+ */ +typedef struct AppendPathInput +{ + List *subpaths; + List *partial_subpaths; + List *child_append_relid_sets; +} AppendPathInput; /* Hook for plugins to get control during joinrel setup */ typedef void (*joinrel_setup_hook_type) (PlannerInfo *root, @@ -78,14 +92,16 @@ extern TidRangePath *create_tidrangescan_path(PlannerInfo *root, List *tidrangequals, Relids required_outer, int parallel_workers); + extern AppendPath *create_append_path(PlannerInfo *root, RelOptInfo *rel, - List *subpaths, List *partial_subpaths, + AppendPathInput input, List *pathkeys, Relids required_outer, int parallel_workers, bool parallel_aware, double rows); extern MergeAppendPath *create_merge_append_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, + List *child_append_relid_sets, List *pathkeys, Relids required_outer); extern GroupResultPath *create_group_result_path(PlannerInfo *root, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a942d030d2..39c76691c8 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -125,6 +125,7 @@ AnlIndexData AnyArrayType Append AppendPath +AppendPathInput AppendRelInfo AppendState ApplyErrorCallbackArg From ab32a9e21d37ede830635f502283883592ab0a62 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Feb 2026 11:49:18 +0200 Subject: [PATCH 082/147] Remove useless store to local variable It was a leftover from commit 5764f611e1, which converted the loop to use dclist_foreach. Reviewed-by: Bertrand Drouvot Discussion: https://www.postgresql.org/message-id/3dd6f70c-b94d-4428-8e75-74a7136396be@iki.fi --- src/backend/storage/lmgr/lock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 7f0cd784f7..e1168ad383 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -4148,7 +4148,6 @@ GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) if (queued_proc == blocked_proc) break; data->waiter_pids[data->npids++] = queued_proc->pid; - queued_proc = (PGPROC *) queued_proc->links.next; } bproc->num_locks = data->nlocks - bproc->first_lock; From 227a6ea65740bb8c5b1f37df016d7861fcba11c5 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Wed, 11 Feb 2026 10:25:05 +0000 Subject: [PATCH 083/147] doc: Clarify RLS policies applied for ON CONFLICT DO NOTHING. On the CREATE POLICY page, the description of per-command policies stated that SELECT policies are applied when an INSERT has an ON CONFLICT DO NOTHING clause. However, that is only the case if it includes an arbiter clause, so clarify that. While at it, also clarify the comment in the regression tests that cover this. 
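To illustrate the distinction being documented, here is a sketch using a
hypothetical table rls_t (primary key on column a, row-level security
enabled); the names are illustrative, not taken from the patch:

    -- no arbiter clause: SELECT policies are not applied to the new row
    INSERT INTO rls_t VALUES (1, 'x') ON CONFLICT DO NOTHING;

    -- arbiter clause present: SELECT policies are applied to the new row,
    -- whether or not it conflicts
    INSERT INTO rls_t VALUES (1, 'x') ON CONFLICT (a) DO NOTHING;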
Author: Dean Rasheed Reviewed-by: Viktor Holmberg Discussion: https://postgr.es/m/CAEZATCXGwMQ+x00YY9XYG46T0kCajH=21QaYL9Xatz0dLKii+g@mail.gmail.com Backpatch-through: 14 --- doc/src/sgml/ref/create_policy.sgml | 10 ++++++++-- src/test/regress/expected/rowsecurity.out | 5 +++-- src/test/regress/sql/rowsecurity.sql | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/src/sgml/ref/create_policy.sgml b/doc/src/sgml/ref/create_policy.sgml index 42d43ad7bf..9065ccb65f 100644 --- a/doc/src/sgml/ref/create_policy.sgml +++ b/doc/src/sgml/ref/create_policy.sgml @@ -294,7 +294,9 @@ CREATE POLICY name ON If an INSERT has an ON CONFLICT DO - NOTHING/UPDATE clause, SELECT + UPDATE clause, or an ON CONFLICT DO + NOTHING clause with an arbiter index or constraint + specification, then SELECT permissions are required on the relation, and the rows proposed for insertion are checked using the relation's SELECT policies. If a row proposed for insertion does not satisfy the @@ -540,7 +542,11 @@ CREATE POLICY name ON INSERT ... ON CONFLICT - Check new row  + Check new row  + + If an arbiter index or constraint is specified. + + Row proposed for insertion is checked regardless of whether or not a conflict occurs. diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index c958ef4d70..e17f9188df 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -170,8 +170,9 @@ NOTICE: SELECT USING on rls_test_tgt.(1,"tgt d","TGT D") 1 | tgt d | TGT D (1 row) --- INSERT ... ON CONFLICT DO NOTHING should apply INSERT CHECK and SELECT USING --- policy clauses (to new value, whether it conflicts or not) +-- INSERT ... ON CONFLICT DO NOTHING with an arbiter clause should apply +-- INSERT CHECK and SELECT USING policy clauses (to new value, whether it +-- conflicts or not) INSERT INTO rls_test_tgt VALUES (1, 'tgt a') ON CONFLICT (a) DO NOTHING; NOTICE: INSERT CHECK on rls_test_tgt.(1,"tgt a","TGT A") NOTICE: SELECT USING on rls_test_tgt.(1,"tgt a","TGT A") diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 5d923c5ca3..fb6502d497 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -121,8 +121,9 @@ BEGIN; DELETE FROM rls_test_tgt; ROLLBACK; BEGIN; DELETE FROM rls_test_tgt WHERE a = 1; ROLLBACK; DELETE FROM rls_test_tgt RETURNING *; --- INSERT ... ON CONFLICT DO NOTHING should apply INSERT CHECK and SELECT USING --- policy clauses (to new value, whether it conflicts or not) +-- INSERT ... ON CONFLICT DO NOTHING with an arbiter clause should apply +-- INSERT CHECK and SELECT USING policy clauses (to new value, whether it +-- conflicts or not) INSERT INTO rls_test_tgt VALUES (1, 'tgt a') ON CONFLICT (a) DO NOTHING; INSERT INTO rls_test_tgt VALUES (1, 'tgt b') ON CONFLICT (a) DO NOTHING; From bc953bf52314ca881a18703f86b68743ef6f3a32 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Wed, 11 Feb 2026 10:52:58 +0000 Subject: [PATCH 084/147] doc: Mention all SELECT privileges required by INSERT ... ON CONFLICT. On the INSERT page, mention that SELECT privileges are also required for any columns mentioned in the arbiter clause, including those referred to by the constraint, and clarify that this applies to all forms of ON CONFLICT, not just ON CONFLICT DO UPDATE. 
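As a sketch of the rule (hypothetical table t(a int UNIQUE, b text); the
table and constraint names are illustrative only), each of these statements
reads column a and therefore requires SELECT privilege on it; the DO UPDATE
form additionally reads b through EXCLUDED:

    INSERT INTO t VALUES (1, 'x') ON CONFLICT (a) DO NOTHING;
    INSERT INTO t VALUES (1, 'x') ON CONFLICT ON CONSTRAINT t_a_key DO NOTHING;
    INSERT INTO t VALUES (1, 'x')
        ON CONFLICT (a) DO UPDATE SET b = EXCLUDED.b WHERE t.a > 0;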
Author: Dean Rasheed Reviewed-by: Viktor Holmberg Discussion: https://postgr.es/m/CAEZATCXGwMQ+x00YY9XYG46T0kCajH=21QaYL9Xatz0dLKii+g@mail.gmail.com Backpatch-through: 14 --- doc/src/sgml/ref/insert.sgml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/ref/insert.sgml b/doc/src/sgml/ref/insert.sgml index 04962e39e1..42eec5f4ed 100644 --- a/doc/src/sgml/ref/insert.sgml +++ b/doc/src/sgml/ref/insert.sgml @@ -114,10 +114,13 @@ INSERT INTO table_name [ AS INSERT privilege on the listed columns. Similarly, when ON CONFLICT DO UPDATE is specified, you only need UPDATE privilege on the column(s) that are - listed to be updated. However, ON CONFLICT DO UPDATE - also requires SELECT privilege on any column whose - values are read in the ON CONFLICT DO UPDATE - expressions or condition. + listed to be updated. However, all forms of ON CONFLICT + also require SELECT privilege on any column whose values + are read. This includes any column mentioned in + conflict_target (including columns referred to + by the arbiter constraint), and any column mentioned in an + ON CONFLICT DO UPDATE expression, + or a WHERE clause condition. From 7984ce7a1d21819865e473f17cb6b928cf58a10d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Feb 2026 16:48:45 +0200 Subject: [PATCH 085/147] Move ProcStructLock to the ProcGlobal struct It protects the freeProcs and some other fields in ProcGlobal, so let's move it there. It's good for cache locality to have it next to the thing it protects, and just makes more sense anyway. I believe it was allocated as a separate shared memory area just for historical reasons. Reviewed-by: Chao Li Reviewed-by: Ashutosh Bapat Discussion: https://www.postgresql.org/message-id/b78719db-0c54-409f-b185-b0d59261143f@iki.fi --- src/backend/postmaster/launch_backend.c | 3 -- src/backend/storage/lmgr/proc.c | 50 +++++++++---------------- src/include/storage/proc.h | 12 +++++- 3 files changed, 29 insertions(+), 36 deletions(-) diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 05b1feef3c..e9134b9751 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -104,7 +104,6 @@ typedef struct char **LWLockTrancheNames; int *LWLockCounter; LWLockPadded *MainLWLockArray; - slock_t *ProcStructLock; PROC_HDR *ProcGlobal; PGPROC *AuxiliaryProcs; PGPROC *PreparedXactProcs; @@ -735,7 +734,6 @@ save_backend_variables(BackendParameters *param, param->LWLockTrancheNames = LWLockTrancheNames; param->LWLockCounter = LWLockCounter; param->MainLWLockArray = MainLWLockArray; - param->ProcStructLock = ProcStructLock; param->ProcGlobal = ProcGlobal; param->AuxiliaryProcs = AuxiliaryProcs; param->PreparedXactProcs = PreparedXactProcs; @@ -995,7 +993,6 @@ restore_backend_variables(BackendParameters *param) LWLockTrancheNames = param->LWLockTrancheNames; LWLockCounter = param->LWLockCounter; MainLWLockArray = param->MainLWLockArray; - ProcStructLock = param->ProcStructLock; ProcGlobal = param->ProcGlobal; AuxiliaryProcs = param->AuxiliaryProcs; PreparedXactProcs = param->PreparedXactProcs; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 31ccdb1ef8..fd8318bdf3 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -66,15 +66,6 @@ bool log_lock_waits = true; /* Pointer to this process's PGPROC struct, if any */ PGPROC *MyProc = NULL; -/* - * This spinlock protects the freelist of recycled PGPROC structures. 
- * We cannot use an LWLock because the LWLock manager depends on already - * having a PGPROC and a wait semaphore! But these structures are touched - * relatively infrequently (only at backend startup or shutdown) and not for - * very long, so a spinlock is okay. - */ -NON_EXEC_STATIC slock_t *ProcStructLock = NULL; - /* Pointers to shared-memory structures */ PROC_HDR *ProcGlobal = NULL; NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL; @@ -214,6 +205,7 @@ InitProcGlobal(void) * Initialize the data structures. */ ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY; + SpinLockInit(&ProcGlobal->freeProcsLock); dlist_init(&ProcGlobal->freeProcs); dlist_init(&ProcGlobal->autovacFreeProcs); dlist_init(&ProcGlobal->bgworkerFreeProcs); @@ -378,12 +370,6 @@ InitProcGlobal(void) */ AuxiliaryProcs = &procs[MaxBackends]; PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS]; - - /* Create ProcStructLock spinlock, too */ - ProcStructLock = (slock_t *) ShmemInitStruct("ProcStructLock spinlock", - sizeof(slock_t), - &found); - SpinLockInit(ProcStructLock); } /* @@ -429,17 +415,17 @@ InitProcess(void) * Try to get a proc struct from the appropriate free list. If this * fails, we must be out of PGPROC structures (not to mention semaphores). * - * While we are holding the ProcStructLock, also copy the current shared + * While we are holding the spinlock, also copy the current shared * estimate of spins_per_delay to local storage. */ - SpinLockAcquire(ProcStructLock); + SpinLockAcquire(&ProcGlobal->freeProcsLock); set_spins_per_delay(ProcGlobal->spins_per_delay); if (!dlist_is_empty(procgloballist)) { MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); } else { @@ -449,7 +435,7 @@ InitProcess(void) * error message. XXX do we need to give a different failure message * in the autovacuum case? */ - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); if (AmWalSenderProcess()) ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), @@ -634,13 +620,13 @@ InitAuxiliaryProcess(void) RegisterPostmasterChildActive(); /* - * We use the ProcStructLock to protect assignment and releasing of + * We use the freeProcsLock to protect assignment and releasing of * AuxiliaryProcs entries. * - * While we are holding the ProcStructLock, also copy the current shared + * While we are holding the spinlock, also copy the current shared * estimate of spins_per_delay to local storage. 
*/ - SpinLockAcquire(ProcStructLock); + SpinLockAcquire(&ProcGlobal->freeProcsLock); set_spins_per_delay(ProcGlobal->spins_per_delay); @@ -655,7 +641,7 @@ InitAuxiliaryProcess(void) } if (proctype >= NUM_AUXILIARY_PROCS) { - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); elog(FATAL, "all AuxiliaryProcs are in use"); } @@ -663,7 +649,7 @@ InitAuxiliaryProcess(void) /* use volatile pointer to prevent code rearrangement */ ((volatile PGPROC *) auxproc)->pid = MyProcPid; - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); MyProc = auxproc; MyProcNumber = GetNumberFromPGProc(MyProc); @@ -789,7 +775,7 @@ HaveNFreeProcs(int n, int *nfree) Assert(n > 0); Assert(nfree); - SpinLockAcquire(ProcStructLock); + SpinLockAcquire(&ProcGlobal->freeProcsLock); *nfree = 0; dlist_foreach(iter, &ProcGlobal->freeProcs) @@ -799,7 +785,7 @@ HaveNFreeProcs(int n, int *nfree) break; } - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); return (*nfree == n); } @@ -980,9 +966,9 @@ ProcKill(int code, Datum arg) procgloballist = leader->procgloballist; /* Leader exited first; return its PGPROC. */ - SpinLockAcquire(ProcStructLock); + SpinLockAcquire(&ProcGlobal->freeProcsLock); dlist_push_head(procgloballist, &leader->links); - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); } } else if (leader != MyProc) @@ -1013,7 +999,7 @@ ProcKill(int code, Datum arg) proc->vxid.lxid = InvalidTransactionId; procgloballist = proc->procgloballist; - SpinLockAcquire(ProcStructLock); + SpinLockAcquire(&ProcGlobal->freeProcsLock); /* * If we're still a member of a locking group, that means we're a leader @@ -1032,7 +1018,7 @@ ProcKill(int code, Datum arg) /* Update shared estimate of spins_per_delay */ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay); - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); } /* @@ -1072,7 +1058,7 @@ AuxiliaryProcKill(int code, Datum arg) MyProcNumber = INVALID_PROC_NUMBER; DisownLatch(&proc->procLatch); - SpinLockAcquire(ProcStructLock); + SpinLockAcquire(&ProcGlobal->freeProcsLock); /* Mark auxiliary proc no longer in use */ proc->pid = 0; @@ -1082,7 +1068,7 @@ AuxiliaryProcKill(int code, Datum arg) /* Update shared estimate of spins_per_delay */ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay); - SpinLockRelease(ProcStructLock); + SpinLockRelease(&ProcGlobal->freeProcsLock); } /* diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index ac0df4aeaa..23e5cd9816 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -420,6 +420,16 @@ typedef struct PROC_HDR /* Length of allProcs array */ uint32 allProcCount; + + /* + * This spinlock protects the below freelists of PGPROC structures. We + * cannot use an LWLock because the LWLock manager depends on already + * having a PGPROC and a wait semaphore! But these structures are touched + * relatively infrequently (only at backend startup or shutdown) and not + * for very long, so a spinlock is okay. 
+ */ + slock_t freeProcsLock; + /* Head of list of free PGPROC structures */ dlist_head freeProcs; /* Head of list of autovacuum & special worker free PGPROC structures */ @@ -428,6 +438,7 @@ typedef struct PROC_HDR dlist_head bgworkerFreeProcs; /* Head of list of walsender free PGPROC structures */ dlist_head walsenderFreeProcs; + /* First pgproc waiting for group XID clear */ pg_atomic_uint32 procArrayGroupFirst; /* First pgproc waiting for group transaction status update */ @@ -489,7 +500,6 @@ extern PGDLLIMPORT int IdleSessionTimeout; extern PGDLLIMPORT bool log_lock_waits; #ifdef EXEC_BACKEND -extern PGDLLIMPORT slock_t *ProcStructLock; extern PGDLLIMPORT PGPROC *AuxiliaryProcs; #endif From 1efdd7cc630a963e56f34d44877d2097b98166d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Wed, 11 Feb 2026 16:38:18 +0100 Subject: [PATCH 086/147] Cleanup for log_min_messages changes in 38e0190ced71 * Remove an unused variable * Use "default log level" consistently (instead of "generic") * Keep the process types in alphabetical order (missed one place in the SGML docs) * Since log_min_messages type was changed from enum to string, it is a good idea to add single quotes when printing it out. Otherwise it fails if the user copies and pastes from the SHOW output to SET, except in the simplest case. Using single quotes reduces confusion. * Use lowercase string for the burned-in default value, to keep the same output as previous versions. Author: Euler Taveira Author: Man Zeng Author: Noriyoshi Shinoda Reviewed-by: Chao Li Discussion: https://postgr.es/m/202602091250.genyflm2d5dw@alvherre.pgsql --- doc/src/sgml/config.sgml | 2 +- src/backend/utils/error/elog.c | 14 +++++++------- src/backend/utils/misc/guc_parameters.dat | 2 +- src/backend/utils/misc/postgresql.conf.sample | 2 +- src/include/postmaster/proctypelist.h | 2 +- src/include/utils/guc.h | 2 -- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 3734298696..61d39fd336 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7145,9 +7145,9 @@ local0.* /var/log/postgresql checkpointer ioworker postmaster - syslogger slotsyncworker startup + syslogger walreceiver walsender walsummarizer diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 129906e2da..59315e94e3 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -2190,7 +2190,7 @@ check_log_min_messages(char **newval, void **extra, GucSource source) char *result; int newlevel[BACKEND_NUM_TYPES]; bool assigned[BACKEND_NUM_TYPES] = {0}; - int genericlevel = -1; /* -1 means not assigned */ + int defaultlevel = -1; /* -1 means not assigned */ const char *const process_types[] = { #define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ @@ -2228,8 +2228,8 @@ check_log_min_messages(char **newval, void **extra, GucSource source) const struct config_enum_entry *entry; bool found; - /* Reject duplicates for generic log level. */ - if (genericlevel != -1) + /* Reject duplicates for default log level. 
*/ + if (defaultlevel != -1) { GUC_check_errdetail("Redundant specification of default log level."); goto lmm_fail; @@ -2241,7 +2241,7 @@ check_log_min_messages(char **newval, void **extra, GucSource source) { if (pg_strcasecmp(entry->name, elem) == 0) { - genericlevel = entry->val; + defaultlevel = entry->val; found = true; break; } @@ -2331,9 +2331,9 @@ check_log_min_messages(char **newval, void **extra, GucSource source) } /* - * The generic log level must be specified. It is the fallback value. + * The default log level must be specified. It is the fallback value. */ - if (genericlevel == -1) + if (defaultlevel == -1) { GUC_check_errdetail("Default log level was not defined."); guc_free(rawstring); @@ -2345,7 +2345,7 @@ check_log_min_messages(char **newval, void **extra, GucSource source) for (int i = 0; i < BACKEND_NUM_TYPES; i++) { if (!assigned[i]) - newlevel[i] = genericlevel; + newlevel[i] = defaultlevel; } /* diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 762b8efe6b..4b8bd57d1e 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1698,7 +1698,7 @@ long_desc => 'Each level includes all the levels that follow it. The later the level, the fewer messages are sent.', flags => 'GUC_LIST_INPUT', variable => 'log_min_messages_string', - boot_val => '"WARNING"', + boot_val => '"warning"', check_hook => 'check_log_min_messages', assign_hook => 'assign_log_min_messages', }, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 6e82c8e055..12183f6996 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -532,7 +532,7 @@ # - When to Log - -#log_min_messages = warning # comma-separated list of +#log_min_messages = 'warning' # comma-separated list of # process_type:level entries, plus # one freestanding level as default. # Valid process types are: diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h index 4e259e84c2..feac19ba20 100644 --- a/src/include/postmaster/proctypelist.h +++ b/src/include/postmaster/proctypelist.h @@ -26,7 +26,7 @@ /* * List of process types (symbol, category, description, Main function, - * shmem_attach, message level) entries. + * shmem_attach) entries. */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 8acbdba7ff..c46203fabf 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -329,8 +329,6 @@ extern PGDLLIMPORT bool trace_sort; extern PGDLLIMPORT bool optimize_bounded_sort; #endif -extern PGDLLIMPORT const char *const log_min_messages_process_types[]; - /* * Declarations for options for enum values * From a3fd53babb8e8bde688739ec367a6d170495cfb4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 11 Feb 2026 11:03:01 -0500 Subject: [PATCH 087/147] Further stabilize a postgres_fdw test case. The buildfarm occasionally shows a variant row order in the output of this UPDATE ... RETURNING, implying that the preceding INSERT dropped one of the rows into some free space within the table rather than appending them all at the end. It's not entirely clear why that happens some times and not other times, but we have established that it's affected by concurrent activity in other databases of the cluster. In any case, the behavior is not wrong; the test is at fault for presuming that a seqscan will give deterministic row ordering. 
Add an ORDER BY atop the update to stop the buildfarm noise. The buildfarm seems to have shown this only in v18 and master branches, but just in case the cause is older, back-patch to all supported branches. Discussion: https://postgr.es/m/3866274.1770743162@sss.pgh.pa.us Backpatch-through: 14 --- .../postgres_fdw/expected/postgres_fdw.out | 35 ++++++++++++------- contrib/postgres_fdw/sql/postgres_fdw.sql | 11 ++++-- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 7cad5e67d0..2ccb72c539 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -6503,20 +6503,31 @@ UPDATE ft2 d SET c2 = CASE WHEN random() >= 0 THEN d.c2 ELSE 0 END ALTER SERVER loopback OPTIONS (DROP extensions); INSERT INTO ft2 (c1,c2,c3) SELECT id, id % 10, to_char(id, 'FM00000') FROM generate_series(2001, 2010) id; +-- this will do a remote seqscan, causing unstable result order, so sort EXPLAIN (verbose, costs off) -UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING *; -- can't be pushed down - QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Update on public.ft2 - Output: c1, c2, c3, c4, c5, c6, c7, c8 - Remote SQL: UPDATE "S 1"."T 1" SET c3 = $2 WHERE ctid = $1 RETURNING "C 1", c2, c3, c4, c5, c6, c7, c8 - -> Foreign Scan on public.ft2 - Output: 'bar'::text, ctid, ft2.* - Filter: (postgres_fdw_abs(ft2.c1) > 2000) - Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8, ctid FROM "S 1"."T 1" FOR UPDATE -(7 rows) +WITH cte AS ( + UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING * +) SELECT * FROM cte ORDER BY c1; -- can't be pushed down + QUERY PLAN +------------------------------------------------------------------------------------------------------------------ + Sort + Output: cte.c1, cte.c2, cte.c3, cte.c4, cte.c5, cte.c6, cte.c7, cte.c8 + Sort Key: cte.c1 + CTE cte + -> Update on public.ft2 + Output: ft2.c1, ft2.c2, ft2.c3, ft2.c4, ft2.c5, ft2.c6, ft2.c7, ft2.c8 + Remote SQL: UPDATE "S 1"."T 1" SET c3 = $2 WHERE ctid = $1 RETURNING "C 1", c2, c3, c4, c5, c6, c7, c8 + -> Foreign Scan on public.ft2 + Output: 'bar'::text, ft2.ctid, ft2.* + Filter: (postgres_fdw_abs(ft2.c1) > 2000) + Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8, ctid FROM "S 1"."T 1" FOR UPDATE + -> CTE Scan on cte + Output: cte.c1, cte.c2, cte.c3, cte.c4, cte.c5, cte.c6, cte.c7, cte.c8 +(13 rows) -UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING *; +WITH cte AS ( + UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING * +) SELECT * FROM cte ORDER BY c1; c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 ------+----+-----+----+----+----+------------+---- 2001 | 1 | bar | | | | ft2 | diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index eff25bd2ba..72d2d9c311 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -1613,9 +1613,16 @@ UPDATE ft2 d SET c2 = CASE WHEN random() >= 0 THEN d.c2 ELSE 0 END ALTER SERVER loopback OPTIONS (DROP extensions); INSERT INTO ft2 (c1,c2,c3) SELECT id, id % 10, to_char(id, 'FM00000') FROM generate_series(2001, 2010) id; + +-- this will do a remote seqscan, causing unstable result order, so sort EXPLAIN (verbose, costs off) -UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING *; -- can't be 
pushed down -UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING *; +WITH cte AS ( + UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING * +) SELECT * FROM cte ORDER BY c1; -- can't be pushed down +WITH cte AS ( + UPDATE ft2 SET c3 = 'bar' WHERE postgres_fdw_abs(c1) > 2000 RETURNING * +) SELECT * FROM cte ORDER BY c1; + EXPLAIN (verbose, costs off) UPDATE ft2 SET c3 = 'baz' FROM ft4 INNER JOIN ft5 ON (ft4.c1 = ft5.c1) From 1d92e0c2cc4789255c630d8776bbe85ca9ebc27f Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 11 Feb 2026 10:36:15 -0600 Subject: [PATCH 088/147] Add password expiration warnings. This commit adds a new parameter called password_expiration_warning_threshold that controls when the server begins emitting imminent-password-expiration warnings upon successful password authentication. By default, this parameter is set to 7 days, but this functionality can be disabled by setting it to 0. This patch also introduces a new "connection warning" infrastructure that can be reused elsewhere. For example, we may want to warn about the use of MD5 passwords for a couple of releases before removing MD5 password support. Author: Gilles Darold Co-authored-by: Nathan Bossart Reviewed-by: Japin Li Reviewed-by: songjinzhou Reviewed-by: liu xiaohui Reviewed-by: Yuefei Shi Reviewed-by: Steven Niu Reviewed-by: Soumya S Murali Reviewed-by: Euler Taveira Reviewed-by: Zsolt Parragi Reviewed-by: Chao Li Reviewed-by: Greg Sabino Mullane Reviewed-by: Peter Eisentraut Discussion: https://postgr.es/m/129bcfbf-47a6-e58a-190a-62fc21a17d03%40migops.com --- doc/src/sgml/config.sgml | 22 ++++++ src/backend/libpq/crypt.c | 73 +++++++++++++++++-- src/backend/utils/init/postinit.c | 69 ++++++++++++++++++ src/backend/utils/misc/guc_parameters.dat | 10 +++ src/backend/utils/misc/postgresql.conf.sample | 3 +- src/include/libpq/crypt.h | 3 + src/include/miscadmin.h | 1 + src/test/authentication/t/001_password.pl | 34 +++++++++ 8 files changed, 209 insertions(+), 6 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 61d39fd336..6bc2690ce0 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1157,6 +1157,28 @@ include_dir 'conf.d' + + password_expiration_warning_threshold (integer) + + password_expiration_warning_threshold configuration parameter + + + + + When this parameter is greater than zero, the server will emit a + WARNING upon successful password authentication if + less than this amount of time remains until the authenticated role's + password expires. Note that a role's password only expires if a date + was specified in a VALID UNTIL clause for + CREATE ROLE or ALTER ROLE. If + this value is specified without units, it is taken as seconds. The + default is 7 days. This parameter can only be set in the + postgresql.conf file or on the server command + line. + + + + md5_password_warnings (boolean) diff --git a/src/backend/libpq/crypt.c b/src/backend/libpq/crypt.c index 5272206045..dbdd0e40f4 100644 --- a/src/backend/libpq/crypt.c +++ b/src/backend/libpq/crypt.c @@ -20,10 +20,15 @@ #include "common/scram-common.h" #include "libpq/crypt.h" #include "libpq/scram.h" +#include "miscadmin.h" #include "utils/builtins.h" +#include "utils/memutils.h" #include "utils/syscache.h" #include "utils/timestamp.h" +/* Threshold for password expiration warnings. */ +int password_expiration_warning_threshold = 604800; + /* Enables deprecation warnings for MD5 passwords. 
*/ bool md5_password_warnings = true; @@ -71,13 +76,71 @@ get_role_password(const char *role, const char **logdetail) ReleaseSysCache(roleTup); /* - * Password OK, but check to be sure we are not past rolvaliduntil + * Password OK, but check to be sure we are not past rolvaliduntil or + * password_expiration_warning_threshold. */ - if (!isnull && vuntil < GetCurrentTimestamp()) + if (!isnull) { - *logdetail = psprintf(_("User \"%s\" has an expired password."), - role); - return NULL; + TimestampTz now = GetCurrentTimestamp(); + uint64 expire_time = TimestampDifferenceMicroseconds(now, vuntil); + + /* + * If we're past rolvaliduntil, the connection attempt should fail, so + * update logdetail and return NULL. + */ + if (vuntil < now) + { + *logdetail = psprintf(_("User \"%s\" has an expired password."), + role); + return NULL; + } + + /* + * If we're past the warning threshold, the connection attempt should + * succeed, but we still want to emit a warning. To do so, we queue + * the warning message using StoreConnectionWarning() so that it will + * be emitted at the end of InitPostgres(), and we return normally. + */ + if (expire_time / USECS_PER_SEC < password_expiration_warning_threshold) + { + MemoryContext oldcontext; + int days; + int hours; + int minutes; + char *warning; + char *detail; + + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + days = expire_time / USECS_PER_DAY; + hours = (expire_time % USECS_PER_DAY) / USECS_PER_HOUR; + minutes = (expire_time % USECS_PER_HOUR) / USECS_PER_MINUTE; + + warning = pstrdup(_("role password will expire soon")); + + if (days > 0) + detail = psprintf(ngettext("The password for role \"%s\" will expire in %d day.", + "The password for role \"%s\" will expire in %d days.", + days), + role, days); + else if (hours > 0) + detail = psprintf(ngettext("The password for role \"%s\" will expire in %d hour.", + "The password for role \"%s\" will expire in %d hours.", + hours), + role, hours); + else if (minutes > 0) + detail = psprintf(ngettext("The password for role \"%s\" will expire in %d minute.", + "The password for role \"%s\" will expire in %d minutes.", + minutes), + role, minutes); + else + detail = psprintf(_("The password for role \"%s\" will expire in less than 1 minute."), + role); + + StoreConnectionWarning(warning, detail); + + MemoryContextSwitchTo(oldcontext); + } } return shadow_pass; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 3f401faf3d..b59e08605c 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -70,6 +70,13 @@ #include "utils/syscache.h" #include "utils/timeout.h" +/* has this backend called EmitConnectionWarnings()? 
*/ +static bool ConnectionWarningsEmitted; + +/* content of warnings to send via EmitConnectionWarnings() */ +static List *ConnectionWarningMessages; +static List *ConnectionWarningDetails; + static HeapTuple GetDatabaseTuple(const char *dbname); static HeapTuple GetDatabaseTupleByOid(Oid dboid); static void PerformAuthentication(Port *port); @@ -85,6 +92,7 @@ static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); +static void EmitConnectionWarnings(void); /*** InitPostgres support ***/ @@ -987,6 +995,9 @@ InitPostgres(const char *in_dbname, Oid dboid, /* close the transaction we started above */ CommitTransactionCommand(); + /* send any WARNINGs we've accumulated during initialization */ + EmitConnectionWarnings(); + return; } @@ -1232,6 +1243,9 @@ InitPostgres(const char *in_dbname, Oid dboid, /* close the transaction we started above */ if (!bootstrap) CommitTransactionCommand(); + + /* send any WARNINGs we've accumulated during initialization */ + EmitConnectionWarnings(); } /* @@ -1446,3 +1460,58 @@ ThereIsAtLeastOneRole(void) return result; } + +/* + * Stores a warning message to be sent later via EmitConnectionWarnings(). + * Both msg and detail must be non-NULL. + * + * NB: Caller should ensure the strings are allocated in a long-lived context + * like TopMemoryContext. + */ +void +StoreConnectionWarning(char *msg, char *detail) +{ + MemoryContext oldcontext; + + Assert(msg); + Assert(detail); + + if (ConnectionWarningsEmitted) + elog(ERROR, "StoreConnectionWarning() called after EmitConnectionWarnings()"); + + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + ConnectionWarningMessages = lappend(ConnectionWarningMessages, msg); + ConnectionWarningDetails = lappend(ConnectionWarningDetails, detail); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Sends the warning messages saved via StoreConnectionWarning() and frees the + * strings and lists. + * + * NB: This can only be called once per backend. 
+ */ +static void +EmitConnectionWarnings(void) +{ + ListCell *lc_msg; + ListCell *lc_detail; + + if (ConnectionWarningsEmitted) + elog(ERROR, "EmitConnectionWarnings() called more than once"); + else + ConnectionWarningsEmitted = true; + + forboth(lc_msg, ConnectionWarningMessages, + lc_detail, ConnectionWarningDetails) + { + ereport(WARNING, + (errmsg("%s", (char *) lfirst(lc_msg)), + errdetail("%s", (char *) lfirst(lc_detail)))); + } + + list_free_deep(ConnectionWarningMessages); + list_free_deep(ConnectionWarningDetails); +} diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 4b8bd57d1e..271c033952 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2251,6 +2251,16 @@ options => 'password_encryption_options', }, +{ name => 'password_expiration_warning_threshold', type => 'int', context => 'PGC_SIGHUP', group => 'CONN_AUTH_AUTH', + short_desc => 'Threshold for password expiration warnings.', + long_desc => '0 means not to emit these warnings.', + flags => 'GUC_UNIT_S', + variable => 'password_expiration_warning_threshold', + boot_val => '604800', + min => '0', + max => 'INT_MAX', +}, + { name => 'plan_cache_mode', type => 'enum', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', short_desc => 'Controls the planner\'s selection of custom or generic plan.', long_desc => 'Prepared statements can have custom and generic plans, and the planner will attempt to choose which is better. This can be set to override the default behavior.', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 12183f6996..f938cc65a3 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -96,7 +96,8 @@ #authentication_timeout = 1min # 1s-600s #password_encryption = scram-sha-256 # scram-sha-256 or (deprecated) md5 #scram_iterations = 4096 -#md5_password_warnings = on # display md5 deprecation warnings? +#password_expiration_warning_threshold = 7d # threshold for expiration warnings +#md5_password_warnings = on # display md5 deprecation warnings? #oauth_validator_libraries = '' # comma-separated list of trusted validator modules # GSSAPI using Kerberos diff --git a/src/include/libpq/crypt.h b/src/include/libpq/crypt.h index f01886e109..ebef0d0f78 100644 --- a/src/include/libpq/crypt.h +++ b/src/include/libpq/crypt.h @@ -25,6 +25,9 @@ */ #define MAX_ENCRYPTED_PASSWORD_LEN (512) +/* Threshold for password expiration warnings. */ +extern PGDLLIMPORT int password_expiration_warning_threshold; + /* Enables deprecation warnings for MD5 passwords. 
*/ extern PGDLLIMPORT bool md5_password_warnings; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index db559b39c4..f16f35659b 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -507,6 +507,7 @@ extern void InitPostgres(const char *in_dbname, Oid dboid, bits32 flags, char *out_dbname); extern void BaseInit(void); +extern void StoreConnectionWarning(char *msg, char *detail); /* in utils/init/miscinit.c */ extern PGDLLIMPORT bool IgnoreSystemIndexes; diff --git a/src/test/authentication/t/001_password.pl b/src/test/authentication/t/001_password.pl index f4d65ba7ba..0ec9aa9f4e 100644 --- a/src/test/authentication/t/001_password.pl +++ b/src/test/authentication/t/001_password.pl @@ -68,8 +68,24 @@ sub test_conn $node->append_conf('postgresql.conf', "log_connections = on\n"); # Needed to allow connect_fails to inspect postmaster log: $node->append_conf('postgresql.conf', "log_min_messages = debug2"); +$node->append_conf('postgresql.conf', "password_expiration_warning_threshold = '1100d'"); $node->start; +# Set up roles for password_expiration_warning_threshold test +my $current_year = 1900 + ${ [ localtime(time) ] }[5]; +my $expire_year = $current_year - 1; +$node->safe_psql( + 'postgres', + "CREATE ROLE expired LOGIN VALID UNTIL '$expire_year-01-01' PASSWORD 'pass'"); +$expire_year = $current_year + 2; +$node->safe_psql( + 'postgres', + "CREATE ROLE expiration_warnings LOGIN VALID UNTIL '$expire_year-01-01' PASSWORD 'pass'"); +$expire_year = $current_year + 5; +$node->safe_psql( + 'postgres', + "CREATE ROLE no_warnings LOGIN VALID UNTIL '$expire_year-01-01' PASSWORD 'pass'"); + # Test behavior of log_connections GUC # # There wasn't another test file where these tests obviously fit, and we don't @@ -531,6 +547,24 @@ sub test_conn qr/authentication method requirement "!password,!md5,!scram-sha-256" failed: server requested SCRAM-SHA-256 authentication/ ); +# Test password_expiration_warning_threshold +$node->connect_fails( + "user=expired dbname=postgres", + "connection fails due to expired password", + expected_stderr => + qr/password authentication failed for user "expired"/ +); +$node->connect_ok( + "user=expiration_warnings dbname=postgres", + "connection succeeds with password expiration warning", + expected_stderr => + qr/role password will expire soon/ +); +$node->connect_ok( + "user=no_warnings dbname=postgres", + "connection succeeds with no password expiration warning" +); + # Test SYSTEM_USER <> NULL with parallel workers. $node->safe_psql( 'postgres', From 78a5e3074b824b4bbcb75ea4dd565ce735f54293 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Feb 2026 18:50:57 +0200 Subject: [PATCH 089/147] Fix pg_stat_get_backend_wait_event() for aux processes The pg_stat_activity view shows information for aux processes, but the pg_stat_get_backend_wait_event() and pg_stat_get_backend_wait_event_type() functions did not. To fix, call AuxiliaryPidGetProc(pid) if BackendPidGetProc(pid) returns NULL, like we do in pg_stat_get_activity(). In version 17 and above, it's a little silly to use those functions when we already have the ProcNumber at hand, but it was necessary before v17 because the backend ID was different from ProcNumber. I have other plans for wait_event_info on master, so it doesn't seem worth applying a different fix on different versions now. 
Reviewed-by: Sami Imseih Reviewed-by: Chao Li Reviewed-by: Kyotaro Horiguchi Discussion: https://www.postgresql.org/message-id/c0320e04-6e85-4c49-80c5-27cfb3a58108@iki.fi Backpatch-through: 14 --- src/backend/utils/adt/pgstatfuncs.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 73ca0bb0b7..b1df96e7b0 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -824,8 +824,14 @@ pg_stat_get_backend_wait_event_type(PG_FUNCTION_ARGS) wait_event_type = ""; else if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid)) wait_event_type = ""; - else if ((proc = BackendPidGetProc(beentry->st_procpid)) != NULL) - wait_event_type = pgstat_get_wait_event_type(proc->wait_event_info); + else + { + proc = BackendPidGetProc(beentry->st_procpid); + if (!proc) + proc = AuxiliaryPidGetProc(beentry->st_procpid); + if (proc) + wait_event_type = pgstat_get_wait_event_type(proc->wait_event_info); + } if (!wait_event_type) PG_RETURN_NULL(); @@ -845,8 +851,14 @@ pg_stat_get_backend_wait_event(PG_FUNCTION_ARGS) wait_event = ""; else if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid)) wait_event = ""; - else if ((proc = BackendPidGetProc(beentry->st_procpid)) != NULL) - wait_event = pgstat_get_wait_event(proc->wait_event_info); + else + { + proc = BackendPidGetProc(beentry->st_procpid); + if (!proc) + proc = AuxiliaryPidGetProc(beentry->st_procpid); + if (proc) + wait_event = pgstat_get_wait_event(proc->wait_event_info); + } if (!wait_event) PG_RETURN_NULL(); From 9863c90759ecb3c200520db9a8b02c33eaec6e17 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 11 Feb 2026 16:53:14 -0500 Subject: [PATCH 090/147] Fix plpgsql's handling of "return simple_record_variable". If the variable's value is null, exec_stmt_return() missed filling in estate->rettype. This is a pretty old bug, but we'd managed not to notice because that value isn't consulted for a null result ... unless we have to cast it to a domain. That case led to a failure with "cache lookup failed for type 0". The correct way to assign the data type is known by exec_eval_datum. While we could copy-and-paste that logic, it seems like a better idea to just invoke exec_eval_datum, as the ROW case already does. 
Reported-by: Pavel Stehule Author: Tom Lane Discussion: https://postgr.es/m/CAFj8pRBT_ahexDf-zT-cyH8bMR_qcySKM8D5nv5MvTWPiatYGA@mail.gmail.com Backpatch-through: 14 --- .../plpgsql/src/expected/plpgsql_domain.out | 13 ++++++++++++ src/pl/plpgsql/src/pl_exec.c | 20 +++---------------- src/pl/plpgsql/src/sql/plpgsql_domain.sql | 10 ++++++++++ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/pl/plpgsql/src/expected/plpgsql_domain.out b/src/pl/plpgsql/src/expected/plpgsql_domain.out index 516c2b9e08..11c012ea02 100644 --- a/src/pl/plpgsql/src/expected/plpgsql_domain.out +++ b/src/pl/plpgsql/src/expected/plpgsql_domain.out @@ -395,3 +395,16 @@ SELECT * FROM test_assign_ordered_named_pairs(1,2,0); -- should fail someday {"(1,2)"} (1 row) +CREATE FUNCTION test_null_ordered_named_pair() + RETURNS ordered_named_pair AS $$ +declare v ordered_named_pair; +begin +return v; +end +$$ LANGUAGE plpgsql; +SELECT * FROM test_null_ordered_named_pair(); + i | j +---+--- + | +(1 row) + diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index f80264e184..723048ab83 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -3255,28 +3255,14 @@ exec_stmt_return(PLpgSQL_execstate *estate, PLpgSQL_stmt_return *stmt) } break; - case PLPGSQL_DTYPE_REC: - { - PLpgSQL_rec *rec = (PLpgSQL_rec *) retvar; - - /* If record is empty, we return NULL not a row of nulls */ - if (rec->erh && !ExpandedRecordIsEmpty(rec->erh)) - { - estate->retval = ExpandedRecordGetDatum(rec->erh); - estate->retisnull = false; - estate->rettype = rec->rectypeid; - } - } - break; - case PLPGSQL_DTYPE_ROW: + case PLPGSQL_DTYPE_REC: { - PLpgSQL_row *row = (PLpgSQL_row *) retvar; + /* exec_eval_datum can handle these cases */ int32 rettypmod; - /* We get here if there are multiple OUT parameters */ exec_eval_datum(estate, - (PLpgSQL_datum *) row, + retvar, &estate->rettype, &rettypmod, &estate->retval, diff --git a/src/pl/plpgsql/src/sql/plpgsql_domain.sql b/src/pl/plpgsql/src/sql/plpgsql_domain.sql index 8f99aae5a9..4c5dd7dc70 100644 --- a/src/pl/plpgsql/src/sql/plpgsql_domain.sql +++ b/src/pl/plpgsql/src/sql/plpgsql_domain.sql @@ -277,3 +277,13 @@ $$ LANGUAGE plpgsql; SELECT * FROM test_assign_ordered_named_pairs(1,2,3); SELECT * FROM test_assign_ordered_named_pairs(2,1,3); SELECT * FROM test_assign_ordered_named_pairs(1,2,0); -- should fail someday + +CREATE FUNCTION test_null_ordered_named_pair() + RETURNS ordered_named_pair AS $$ +declare v ordered_named_pair; +begin +return v; +end +$$ LANGUAGE plpgsql; + +SELECT * FROM test_null_ordered_named_pair(); From cf74558feb8f41b2bc459f59ed3f991024d04893 Mon Sep 17 00:00:00 2001 From: Richard Guo Date: Thu, 12 Feb 2026 15:30:13 +0900 Subject: [PATCH 091/147] Reduce LEFT JOIN to ANTI JOIN using NOT NULL constraints For a LEFT JOIN, if any var from the right-hand side (RHS) is forced to null by upper-level quals but is known to be non-null for any matching row, the only way the upper quals can be satisfied is if the join fails to match, producing a null-extended row. Thus, we can treat this left join as an anti-join. Previously, this transformation was limited to cases where the join's own quals were strict for the var forced to null by upper qual levels. This patch extends the logic to check table constraints, leveraging the NOT NULL attribute information already available thanks to the infrastructure introduced by e2debb643. 
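For example (a sketch; the table and column names are illustrative,
mirroring the regression test added below):

    CREATE TABLE t1 (id int);
    CREATE TABLE t2 (a int NOT NULL, b int);

    -- t2.a is not the join key, so the join quals are not strict for
    -- it; previously this remained a left join with a post-join
    -- filter, but t2.a's NOT NULL constraint now lets the planner
    -- treat it as an anti-join:
    SELECT * FROM t1 LEFT JOIN t2 ON t1.id = t2.b
    WHERE t2.a IS NULL;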
If a forced-null var belongs to the RHS and is defined as NOT NULL in the schema (and is not nullable due to lower-level outer joins), we know that the left join can be reduced to an anti-join. Note that to ensure the var is not nullable by any lower-level outer joins within the current subtree, we collect the relids of base rels that are nullable within each subtree during the first pass of the reduce-outer-joins process. This allows us to verify in the second pass that a NOT NULL var is indeed safe to treat as non-nullable. Based on a proposal by Nicolas Adenis-Lamarre, but this is not the original patch. Suggested-by: Nicolas Adenis-Lamarre Author: Tender Wang Co-authored-by: Richard Guo Discussion: https://postgr.es/m/CACPGbctKMDP50PpRH09in+oWbHtZdahWSroRstLPOoSDKwoFsw@mail.gmail.com --- src/backend/optimizer/prep/prepjointree.c | 204 ++++++++++++++++++---- src/test/regress/expected/join.out | 62 +++++++ src/test/regress/sql/join.sql | 27 +++ 3 files changed, 260 insertions(+), 33 deletions(-) diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index c80bfc88d8..c90f4b3273 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -88,6 +88,8 @@ typedef struct reduce_outer_joins_pass1_state { Relids relids; /* base relids within this subtree */ bool contains_outer; /* does subtree contain outer join(s)? */ + Relids nullable_rels; /* base relids that are nullable within this + * subtree */ List *sub_states; /* List of states for subtree components */ } reduce_outer_joins_pass1_state; @@ -161,6 +163,8 @@ static void reduce_outer_joins_pass2(Node *jtnode, List *forced_null_vars); static void report_reduced_full_join(reduce_outer_joins_pass2_state *state2, int rtindex, Relids relids); +static bool has_notnull_forced_var(PlannerInfo *root, List *forced_null_vars, + reduce_outer_joins_pass1_state *right_state); static Node *remove_useless_results_recurse(PlannerInfo *root, Node *jtnode, Node **parent_quals, Relids *dropped_outer_joins); @@ -3144,13 +3148,16 @@ flatten_simple_union_all(PlannerInfo *root) * to each side separately.) * * Another transformation we apply here is to recognize cases like - * SELECT ... FROM a LEFT JOIN b ON (a.x = b.y) WHERE b.y IS NULL; - * If the join clause is strict for b.y, then only null-extended rows could - * pass the upper WHERE, and we can conclude that what the query is really - * specifying is an anti-semijoin. We change the join type from JOIN_LEFT - * to JOIN_ANTI. The IS NULL clause then becomes redundant, and must be - * removed to prevent bogus selectivity calculations, but we leave it to - * distribute_qual_to_rels to get rid of such clauses. + * SELECT ... FROM a LEFT JOIN b ON (a.x = b.y) WHERE b.z IS NULL; + * If we can prove that b.z must be non-null for any matching row, either + * because the join clause is strict for b.z and b.z happens to be the join + * key b.y, or because b.z is defined NOT NULL by table constraints and is + * not nullable due to lower-level outer joins, then only null-extended rows + * could pass the upper WHERE, and we can conclude that what the query is + * really specifying is an anti-semijoin. We change the join type from + * JOIN_LEFT to JOIN_ANTI. The IS NULL clause then becomes redundant, and + * must be removed to prevent bogus selectivity calculations, but we leave + * it to distribute_qual_to_rels to get rid of such clauses. * * Also, we get rid of JOIN_RIGHT cases by flipping them around to become * JOIN_LEFT. 
This saves some code here and in some later planner routines; @@ -3174,8 +3181,9 @@ reduce_outer_joins(PlannerInfo *root) * to stop descending the jointree as soon as there are no outer joins * below our current point. This consideration forces a two-pass process. * The first pass gathers information about which base rels appear below - * each side of each join clause, and about whether there are outer - * join(s) below each side of each join clause. The second pass examines + * each side of each join clause, about whether there are outer join(s) + * below each side of each join clause, and about which base rels are from + * the nullable side of those outer join(s). The second pass examines * qual clauses and changes join types as it descends the tree. */ state1 = reduce_outer_joins_pass1((Node *) root->parse->jointree); @@ -3243,6 +3251,7 @@ reduce_outer_joins_pass1(Node *jtnode) result = palloc_object(reduce_outer_joins_pass1_state); result->relids = NULL; result->contains_outer = false; + result->nullable_rels = NULL; result->sub_states = NIL; if (jtnode == NULL) @@ -3266,29 +3275,62 @@ reduce_outer_joins_pass1(Node *jtnode) result->relids = bms_add_members(result->relids, sub_state->relids); result->contains_outer |= sub_state->contains_outer; + result->nullable_rels = bms_add_members(result->nullable_rels, + sub_state->nullable_rels); result->sub_states = lappend(result->sub_states, sub_state); } } else if (IsA(jtnode, JoinExpr)) { JoinExpr *j = (JoinExpr *) jtnode; - reduce_outer_joins_pass1_state *sub_state; + reduce_outer_joins_pass1_state *left_state; + reduce_outer_joins_pass1_state *right_state; + + /* Recurse to children */ + left_state = reduce_outer_joins_pass1(j->larg); + right_state = reduce_outer_joins_pass1(j->rarg); /* join's own RT index is not wanted in result->relids */ - if (IS_OUTER_JOIN(j->jointype)) - result->contains_outer = true; - - sub_state = reduce_outer_joins_pass1(j->larg); - result->relids = bms_add_members(result->relids, - sub_state->relids); - result->contains_outer |= sub_state->contains_outer; - result->sub_states = lappend(result->sub_states, sub_state); - - sub_state = reduce_outer_joins_pass1(j->rarg); - result->relids = bms_add_members(result->relids, - sub_state->relids); - result->contains_outer |= sub_state->contains_outer; - result->sub_states = lappend(result->sub_states, sub_state); + result->relids = bms_union(left_state->relids, right_state->relids); + + /* Store children's states for pass 2 */ + result->sub_states = list_make2(left_state, right_state); + + /* Collect outer join information */ + switch (j->jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + /* No new nullability; propagate state from children */ + result->contains_outer = left_state->contains_outer || + right_state->contains_outer; + result->nullable_rels = bms_union(left_state->nullable_rels, + right_state->nullable_rels); + break; + case JOIN_LEFT: + case JOIN_ANTI: + /* RHS is nullable; LHS keeps existing status */ + result->contains_outer = true; + result->nullable_rels = bms_union(left_state->nullable_rels, + right_state->relids); + break; + case JOIN_RIGHT: + /* LHS is nullable; RHS keeps existing status */ + result->contains_outer = true; + result->nullable_rels = bms_union(left_state->relids, + right_state->nullable_rels); + break; + case JOIN_FULL: + /* Both sides are nullable */ + result->contains_outer = true; + result->nullable_rels = bms_union(left_state->relids, + right_state->relids); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) 
j->jointype); + break; + } } else elog(ERROR, "unrecognized node type: %d", @@ -3440,15 +3482,16 @@ reduce_outer_joins_pass2(Node *jtnode, /* * See if we can reduce JOIN_LEFT to JOIN_ANTI. This is the case if - * the join's own quals are strict for any var that was forced null by - * higher qual levels. NOTE: there are other ways that we could - * detect an anti-join, in particular if we were to check whether Vars - * coming from the RHS must be non-null because of table constraints. - * That seems complicated and expensive though (in particular, one - * would have to be wary of lower outer joins). For the moment this - * seems sufficient. + * any var from the RHS was forced null by higher qual levels, but is + * known to be non-nullable. We detect this either by seeing if the + * join's own quals are strict for the var, or by checking if the var + * is defined NOT NULL by table constraints (being careful to exclude + * vars that are nullable due to lower-level outer joins). In either + * case, the only way the higher qual clause's requirement for NULL + * can be met is if the join fails to match, producing a null-extended + * row. Thus, we can treat this as an anti-join. */ - if (jointype == JOIN_LEFT) + if (jointype == JOIN_LEFT && forced_null_vars != NIL) { List *nonnullable_vars; Bitmapset *overlap; @@ -3460,9 +3503,13 @@ reduce_outer_joins_pass2(Node *jtnode, * It's not sufficient to check whether nonnullable_vars and * forced_null_vars overlap: we need to know if the overlap * includes any RHS variables. + * + * Also check if any forced-null var is defined NOT NULL by table + * constraints. */ overlap = mbms_overlap_sets(nonnullable_vars, forced_null_vars); - if (bms_overlap(overlap, right_state->relids)) + if (bms_overlap(overlap, right_state->relids) || + has_notnull_forced_var(root, forced_null_vars, right_state)) jointype = JOIN_ANTI; } @@ -3598,6 +3645,97 @@ report_reduced_full_join(reduce_outer_joins_pass2_state *state2, state2->partial_reduced = lappend(state2->partial_reduced, statep); } +/* + * has_notnull_forced_var + * Check if "forced_null_vars" contains any Vars belonging to the subtree + * indicated by "right_state" that are known to be non-nullable due to + * table constraints. + * + * Note that we must also consider the situation where a NOT NULL Var can be + * nulled by lower-level outer joins. + * + * Helper for reduce_outer_joins_pass2. + */ +static bool +has_notnull_forced_var(PlannerInfo *root, List *forced_null_vars, + reduce_outer_joins_pass1_state *right_state) +{ + int varno = -1; + + foreach_node(Bitmapset, attrs, forced_null_vars) + { + RangeTblEntry *rte; + Bitmapset *notnullattnums; + Bitmapset *forcednullattnums = NULL; + int attno; + + varno++; + + /* Skip empty bitmaps */ + if (bms_is_empty(attrs)) + continue; + + /* Skip Vars that do not belong to the target relations */ + if (!bms_is_member(varno, right_state->relids)) + continue; + + /* + * Skip Vars that can be nulled by lower-level outer joins within the + * given subtree. These Vars might be NULL even if the schema defines + * them as NOT NULL. + */ + if (bms_is_member(varno, right_state->nullable_rels)) + continue; + + /* + * Iterate over attributes and adjust the bitmap indexes by + * FirstLowInvalidHeapAttributeNumber to get the actual attribute + * numbers. 
+ */ + attno = -1; + while ((attno = bms_next_member(attrs, attno)) >= 0) + { + AttrNumber real_attno = attno + FirstLowInvalidHeapAttributeNumber; + + /* system columns cannot be NULL */ + if (real_attno < 0) + return true; + + forcednullattnums = bms_add_member(forcednullattnums, real_attno); + } + + rte = rt_fetch(varno, root->parse->rtable); + + /* + * We must skip inheritance parent tables, as some child tables may + * have a NOT NULL constraint for a column while others may not. This + * cannot happen with partitioned tables, though. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + { + bms_free(forcednullattnums); + continue; + } + + /* Get the column not-null constraint information for this relation */ + notnullattnums = find_relation_notnullatts(root, rte->relid); + + /* + * Check if any forced-null attributes are defined as NOT NULL by + * table constraints. + */ + if (bms_overlap(notnullattnums, forcednullattnums)) + { + bms_free(forcednullattnums); + return true; + } + + bms_free(forcednullattnums); + } + + return false; +} + /* * remove_useless_result_rtes diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index d05a0ca037..63d3c5d3ac 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3273,6 +3273,68 @@ where (hundred, thousand) in (select twothousand, twothousand from onek); reset enable_memoize; -- +-- more antijoin recognition tests using NOT NULL constraints +-- +begin; +create temp table tbl_anti(a int not null, b int, c int); +-- this is an antijoin, as t2.a is non-null for any matching row +explain (costs off) +select * from tenk1 t1 left join tbl_anti t2 on t1.unique1 = t2.b +where t2.a is null; + QUERY PLAN +---------------------------------- + Hash Right Anti Join + Hash Cond: (t2.b = t1.unique1) + -> Seq Scan on tbl_anti t2 + -> Hash + -> Seq Scan on tenk1 t1 +(5 rows) + +-- this is an antijoin, as t2.a is non-null for any matching row +explain (costs off) +select * from tenk1 t1 left join + (tbl_anti t2 left join tbl_anti t3 on t2.c = t3.c) on t1.unique1 = t2.b +where t2.a is null; + QUERY PLAN +------------------------------------------- + Hash Right Anti Join + Hash Cond: (t2.b = t1.unique1) + -> Merge Left Join + Merge Cond: (t2.c = t3.c) + -> Sort + Sort Key: t2.c + -> Seq Scan on tbl_anti t2 + -> Sort + Sort Key: t3.c + -> Seq Scan on tbl_anti t3 + -> Hash + -> Seq Scan on tenk1 t1 +(12 rows) + +-- this is not an antijoin, as t3.a can be nulled by t2/t3 join +explain (costs off) +select * from tenk1 t1 left join + (tbl_anti t2 left join tbl_anti t3 on t2.c = t3.c) on t1.unique1 = t2.b +where t3.a is null; + QUERY PLAN +------------------------------------------- + Hash Right Join + Hash Cond: (t2.b = t1.unique1) + Filter: (t3.a IS NULL) + -> Merge Left Join + Merge Cond: (t2.c = t3.c) + -> Sort + Sort Key: t2.c + -> Seq Scan on tbl_anti t2 + -> Sort + Sort Key: t3.c + -> Seq Scan on tbl_anti t3 + -> Hash + -> Seq Scan on tenk1 t1 +(13 rows) + +rollback; +-- -- regression test for bogus RTE_GROUP entries -- explain (costs off) diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index b91fb7574d..14cbec2876 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -866,6 +866,33 @@ select 1 from tenk1 where (hundred, thousand) in (select twothousand, twothousand from onek); reset enable_memoize; +-- +-- more antijoin recognition tests using NOT NULL constraints +-- + +begin; + +create temp table tbl_anti(a int not null, b 
int, c int); + +-- this is an antijoin, as t2.a is non-null for any matching row +explain (costs off) +select * from tenk1 t1 left join tbl_anti t2 on t1.unique1 = t2.b +where t2.a is null; + +-- this is an antijoin, as t2.a is non-null for any matching row +explain (costs off) +select * from tenk1 t1 left join + (tbl_anti t2 left join tbl_anti t3 on t2.c = t3.c) on t1.unique1 = t2.b +where t2.a is null; + +-- this is not an antijoin, as t3.a can be nulled by t2/t3 join +explain (costs off) +select * from tenk1 t1 left join + (tbl_anti t2 left join tbl_anti t3 on t2.c = t3.c) on t1.unique1 = t2.b +where t3.a is null; + +rollback; + -- -- regression test for bogus RTE_GROUP entries -- From 706cadde3239842a41a3375d50dda8b33325c008 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Thu, 12 Feb 2026 09:01:42 +0000 Subject: [PATCH 092/147] Remove p_is_insert from struct ParseState. The only place that used p_is_insert was transformAssignedExpr(), which used it to distinguish INSERT from UPDATE when handling indirection on assignment target columns -- see commit c1ca3a19df3. However, this information is already available to transformAssignedExpr() via its exprKind parameter, which is always either EXPR_KIND_INSERT_TARGET or EXPR_KIND_UPDATE_TARGET. As noted in the commit message for c1ca3a19df3, this use of p_is_insert isn't particularly pretty, so have transformAssignedExpr() use the exprKind parameter instead. This then allows p_is_insert to be removed entirely, which simplifies state management in a few other places across the parser. Author: Viktor Holmberg Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/badc3b4c-da73-4000-b8d3-638a6f53a769@Spark --- src/backend/parser/analyze.c | 9 --------- src/backend/parser/parse_merge.c | 11 +++-------- src/backend/parser/parse_target.c | 6 ++++-- src/include/parser/parse_node.h | 5 ----- 4 files changed, 7 insertions(+), 24 deletions(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 029ca3b68c..50d51c880d 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -657,7 +657,6 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) Assert(pstate->p_ctenamespace == NIL); qry->commandType = CMD_INSERT; - pstate->p_is_insert = true; /* process the WITH clause independently of all else */ if (stmt->withClause) @@ -1222,13 +1221,6 @@ transformOnConflictClause(ParseState *pstate, /* Process DO UPDATE */ if (onConflictClause->action == ONCONFLICT_UPDATE) { - /* - * Expressions in the UPDATE targetlist need to be handled like UPDATE - * not INSERT. We don't need to save/restore this because all INSERT - * expressions have been parsed already. - */ - pstate->p_is_insert = false; - /* * Add the EXCLUDED pseudo relation to the query namespace, making it * available in the UPDATE subexpressions. 
@@ -2495,7 +2487,6 @@ transformUpdateStmt(ParseState *pstate, UpdateStmt *stmt) Node *qual; qry->commandType = CMD_UPDATE; - pstate->p_is_insert = false; /* process the WITH clause independently of all else */ if (stmt->withClause) diff --git a/src/backend/parser/parse_merge.c b/src/backend/parser/parse_merge.c index e08dc18dd7..0a70d48fd4 100644 --- a/src/backend/parser/parse_merge.c +++ b/src/backend/parser/parse_merge.c @@ -307,8 +307,6 @@ transformMergeStmt(ParseState *pstate, MergeStmt *stmt) List *icolumns; List *attrnos; - pstate->p_is_insert = true; - icolumns = checkInsertTargets(pstate, mergeWhenClause->targetList, &attrnos); @@ -381,12 +379,9 @@ transformMergeStmt(ParseState *pstate, MergeStmt *stmt) } break; case CMD_UPDATE: - { - pstate->p_is_insert = false; - action->targetList = - transformUpdateTargetList(pstate, - mergeWhenClause->targetList); - } + action->targetList = + transformUpdateTargetList(pstate, + mergeWhenClause->targetList); break; case CMD_DELETE: break; diff --git a/src/backend/parser/parse_target.c b/src/backend/parser/parse_target.c index b5a2f915b6..dbf5b2b5c0 100644 --- a/src/backend/parser/parse_target.c +++ b/src/backend/parser/parse_target.c @@ -438,6 +438,7 @@ markTargetListOrigin(ParseState *pstate, TargetEntry *tle, * pstate parse state * expr expression to be modified * exprKind indicates which type of statement we're dealing with + * (EXPR_KIND_INSERT_TARGET or EXPR_KIND_UPDATE_TARGET) * colname target column name (ie, name of attribute to be assigned to) * attrno target attribute number * indirection subscripts/field names for target column, if any @@ -471,7 +472,8 @@ transformAssignedExpr(ParseState *pstate, * set p_expr_kind here because we can parse subscripts without going * through transformExpr(). */ - Assert(exprKind != EXPR_KIND_NONE); + Assert(exprKind == EXPR_KIND_INSERT_TARGET || + exprKind == EXPR_KIND_UPDATE_TARGET); sv_expr_kind = pstate->p_expr_kind; pstate->p_expr_kind = exprKind; @@ -530,7 +532,7 @@ transformAssignedExpr(ParseState *pstate, { Node *colVar; - if (pstate->p_is_insert) + if (exprKind == EXPR_KIND_INSERT_TARGET) { /* * The command is INSERT INTO table (col.something) ... so there diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index a9bffb8a78..f23e21f318 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -153,10 +153,6 @@ typedef Node *(*CoerceParamHook) (ParseState *pstate, Param *param, * * p_grouping_nsitem: the ParseNamespaceItem that represents the grouping step. * - * p_is_insert: true to process assignment expressions like INSERT, false - * to process them like UPDATE. (Note this can change intra-statement, for - * cases like INSERT ON CONFLICT UPDATE.) - * * p_windowdefs: list of WindowDefs representing WINDOW and OVER clauses. 
* We collect these while transforming expressions and then transform them * afterwards (so that any resjunk tlist items needed for the sort/group @@ -209,7 +205,6 @@ struct ParseState Relation p_target_relation; /* INSERT/UPDATE/DELETE/MERGE target rel */ ParseNamespaceItem *p_target_nsitem; /* target rel's NSItem, or NULL */ ParseNamespaceItem *p_grouping_nsitem; /* NSItem for grouping, or NULL */ - bool p_is_insert; /* process assignment like INSERT not UPDATE */ List *p_windowdefs; /* raw representations of window clauses */ ParseExprKind p_expr_kind; /* what kind of expression we're parsing */ int p_next_resno; /* next targetlist resno to assign */ From 788ec96d591d0a7c916f2f4332765f46410d73b5 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Thu, 12 Feb 2026 14:38:31 +0530 Subject: [PATCH 093/147] Refactor slot synchronization logic in slotsync.c. Following e68b6adad9, the reason for skipping slot synchronization is stored as a slot property. This commit removes redundant function parameters that previously tracked this state, instead relying directly on the slot property. Additionally, this change centralizes the logic for skipping synchronization when required WAL has not yet been received or flushed. By consolidating this check, we reduce code duplication and the risk of inconsistent state updates across different code paths. In passing, add an assertion to ensure a slot is marked as temporary if a consistent point has not been reached during synchronization. Author: Zhijie Hou Reviewed-by: Shveta Malik Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/TY4PR01MB16907DD16098BE3B20486D4569463A@TY4PR01MB16907.jpnprd01.prod.outlook.com Discussion: https://postgr.es/m/CAFPTHDZAA+gWDntpa5ucqKKba41=tXmoXqN3q4rpjO9cdxgQrw@mail.gmail.com --- src/backend/replication/logical/slotsync.c | 163 +++++++-------------- 1 file changed, 51 insertions(+), 112 deletions(-) diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index d02d44d26a..062a08ccb8 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -194,31 +194,40 @@ update_slotsync_skip_stats(SlotSyncSkipReason skip_reason) * * If no update was needed (the data of the remote slot is the same as the * local slot) return false, otherwise true. - * - * *found_consistent_snapshot will be true iff the remote slot's LSN or xmin is - * modified, and decoding from the corresponding LSN's can reach a - * consistent snapshot. - * - * *remote_slot_precedes will be true if the remote slot's LSN or xmin - * precedes locally reserved position. */ static bool -update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, - bool *found_consistent_snapshot, - bool *remote_slot_precedes) +update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) { ReplicationSlot *slot = MyReplicationSlot; bool updated_xmin_or_lsn = false; bool updated_config = false; SlotSyncSkipReason skip_reason = SS_SKIP_NONE; + XLogRecPtr latestFlushPtr = GetStandbyFlushRecPtr(NULL); Assert(slot->data.invalidated == RS_INVAL_NONE); - if (found_consistent_snapshot) - *found_consistent_snapshot = false; + /* + * Make sure that concerned WAL is received and flushed before syncing + * slot to target lsn received from the primary server. 
+ */ + if (remote_slot->confirmed_lsn > latestFlushPtr) + { + update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED); - if (remote_slot_precedes) - *remote_slot_precedes = false; + /* + * Can get here only if GUC 'synchronized_standby_slots' on the + * primary server was not configured correctly. + */ + ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("skipping slot synchronization because the received slot sync" + " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X", + LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), + remote_slot->name, + LSN_FORMAT_ARGS(latestFlushPtr))); + + return false; + } /* * Don't overwrite if we already have a newer catalog_xmin and @@ -262,9 +271,6 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, LSN_FORMAT_ARGS(slot->data.restart_lsn), slot->data.catalog_xmin)); - if (remote_slot_precedes) - *remote_slot_precedes = true; - /* * Skip updating the configuration. This is required to avoid syncing * two_phase_at without syncing confirmed_lsn. Otherwise, the prepared @@ -304,14 +310,13 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, slot->data.confirmed_flush = remote_slot->confirmed_lsn; slot->data.catalog_xmin = remote_slot->catalog_xmin; SpinLockRelease(&slot->mutex); - - if (found_consistent_snapshot) - *found_consistent_snapshot = true; } else { + bool found_consistent_snapshot; + LogicalSlotAdvanceAndCheckSnapState(remote_slot->confirmed_lsn, - found_consistent_snapshot); + &found_consistent_snapshot); /* Sanity check */ if (slot->data.confirmed_flush != remote_slot->confirmed_lsn) @@ -326,8 +331,18 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, * If we can't reach a consistent snapshot, the slot won't be * persisted. See update_and_persist_local_synced_slot(). */ - if (found_consistent_snapshot && !(*found_consistent_snapshot)) + if (!found_consistent_snapshot) + { + Assert(MyReplicationSlot->data.persistency == RS_TEMPORARY); + + ereport(LOG, + errmsg("could not synchronize replication slot \"%s\"", + remote_slot->name), + errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%08X.", + LSN_FORMAT_ARGS(slot->data.restart_lsn))); + skip_reason = SS_SKIP_NO_CONSISTENT_SNAPSHOT; + } } updated_xmin_or_lsn = true; @@ -619,27 +634,27 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, bool *slot_persistence_pending) { ReplicationSlot *slot = MyReplicationSlot; - bool found_consistent_snapshot = false; - bool remote_slot_precedes = false; /* Slotsync skip stats are handled in function update_local_synced_slot() */ - (void) update_local_synced_slot(remote_slot, remote_dbid, - &found_consistent_snapshot, - &remote_slot_precedes); + (void) update_local_synced_slot(remote_slot, remote_dbid); /* - * Check if the primary server has caught up. Refer to the comment atop - * the file for details on this check. + * Check if the slot cannot be synchronized. Refer to the comment atop the + * file for details on this check. */ - if (remote_slot_precedes) + if (slot->slotsync_skip_reason != SS_SKIP_NONE) { /* - * The remote slot didn't catch up to locally reserved position. + * We reach this point when the remote slot didn't catch up to locally + * reserved position, or it cannot reach the consistent point from the + * restart_lsn, or the WAL prior to the remote confirmed flush LSN has + * not been received and flushed. 
* - * We do not drop the slot because the restart_lsn can be ahead of the - * current location when recreating the slot in the next cycle. It may - * take more time to create such a slot. Therefore, we keep this slot - * and attempt the synchronization in the next cycle. + * We do not drop the slot because the restart_lsn and confirmed_lsn + * can be ahead of the current location when recreating the slot in + * the next cycle. It may take more time to create such a slot or + * reach the consistent point. Therefore, we keep this slot and + * attempt the synchronization in the next cycle. * * We also update the slot_persistence_pending parameter, so the SQL * function can retry. @@ -650,24 +665,6 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, return false; } - /* - * Don't persist the slot if it cannot reach the consistent point from the - * restart_lsn. See comments atop this file. - */ - if (!found_consistent_snapshot) - { - ereport(LOG, - errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%08X.", - LSN_FORMAT_ARGS(slot->data.restart_lsn))); - - /* Set this, so that SQL function can retry */ - if (slot_persistence_pending) - *slot_persistence_pending = true; - - return false; - } - ReplicationSlotPersist(); ereport(LOG, @@ -698,7 +695,6 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid, bool *slot_persistence_pending) { ReplicationSlot *slot; - XLogRecPtr latestFlushPtr = GetStandbyFlushRecPtr(NULL); bool slot_updated = false; /* Search for the named slot */ @@ -765,34 +761,6 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid, return slot_updated; } - /* - * Make sure that concerned WAL is received and flushed before syncing - * slot to target lsn received from the primary server. - * - * Report statistics only after the slot has been acquired, ensuring - * it cannot be dropped during the reporting process. - */ - if (remote_slot->confirmed_lsn > latestFlushPtr) - { - update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED); - - /* - * Can get here only if GUC 'synchronized_standby_slots' on the - * primary server was not configured correctly. - */ - ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("skipping slot synchronization because the received slot sync" - " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X", - LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), - remote_slot->name, - LSN_FORMAT_ARGS(latestFlushPtr))); - - ReplicationSlotRelease(); - - return slot_updated; - } - /* Slot not ready yet, let's attempt to make it sync-ready now. */ if (slot->data.persistency == RS_TEMPORARY) { @@ -819,8 +787,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid, LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(remote_slot->confirmed_lsn))); - slot_updated = update_local_synced_slot(remote_slot, remote_dbid, - NULL, NULL); + slot_updated = update_local_synced_slot(remote_slot, remote_dbid); } } /* Otherwise create the slot first. */ @@ -869,34 +836,6 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid, LWLockRelease(ProcArrayLock); LWLockRelease(ReplicationSlotControlLock); - /* - * Make sure that concerned WAL is received and flushed before syncing - * slot to target lsn received from the primary server. 
- *
- * Report statistics only after the slot has been acquired, ensuring
- * it cannot be dropped during the reporting process.
- */
- if (remote_slot->confirmed_lsn > latestFlushPtr)
- {
- update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED);
-
- /*
- * Can get here only if GUC 'synchronized_standby_slots' on the
- * primary server was not configured correctly.
- */
- ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR,
- errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("skipping slot synchronization because the received slot sync"
- " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X",
- LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
- remote_slot->name,
- LSN_FORMAT_ARGS(latestFlushPtr)));
-
- ReplicationSlotRelease();
-
- return false;
- }
-
 update_and_persist_local_synced_slot(remote_slot, remote_dbid,
 slot_persistence_pending);

From 88327092ff06c48676d2a603420089bf493770f3 Mon Sep 17 00:00:00 2001
From: Dean Rasheed
Date: Thu, 12 Feb 2026 09:55:06 +0000
Subject: [PATCH 094/147] Add support for INSERT ... ON CONFLICT DO SELECT.

This adds a new ON CONFLICT action DO SELECT [FOR UPDATE/SHARE], which
returns the pre-existing rows when conflicts are detected. The INSERT
statement must have a RETURNING clause when DO SELECT is specified. The
optional FOR UPDATE/SHARE clause allows the rows to be locked before
they are returned. As with a DO UPDATE conflict action, an optional
WHERE clause may be used to prevent rows from being selected for return
(but as with a DO UPDATE action, rows filtered out by the WHERE clause
are still locked).

Bumps catversion as stored rules change.

Author: Andreas Karlsson
Author: Marko Tiikkaja
Author: Viktor Holmberg
Reviewed-by: Joel Jacobson
Reviewed-by: Kirill Reshke
Reviewed-by: Dean Rasheed
Reviewed-by: Jian He
Discussion: https://postgr.es/m/d631b406-13b7-433e-8c0b-c6040c4b4663@Spark
Discussion: https://postgr.es/m/5fca222d-62ae-4a2f-9fcb-0eca56277094@Spark
Discussion: https://postgr.es/m/2b5db2e6-8ece-44d0-9890-f256fdca9f7e@proxel.se
Discussion: https://postgr.es/m/CAL9smLCdV-v3KgOJX3mU19FYK82N7yzqJj2HAwWX70E=P98kgQ@mail.gmail.com
---
 contrib/postgres_fdw/postgres_fdw.c | 2 +-
 doc/src/sgml/dml.sgml | 2 +-
 doc/src/sgml/fdwhandler.sgml | 2 +-
 doc/src/sgml/mvcc.sgml | 12 +
 doc/src/sgml/postgres-fdw.sgml | 2 +-
 doc/src/sgml/ref/create_policy.sgml | 29 +-
 doc/src/sgml/ref/create_view.sgml | 4 +-
 doc/src/sgml/ref/insert.sgml | 135 +++++--
 doc/src/sgml/ref/merge.sgml | 3 +-
 src/backend/access/heap/heapam.c | 8 +-
 src/backend/commands/explain.c | 36 +-
 src/backend/executor/execIndexing.c | 6 +-
 src/backend/executor/execPartition.c | 134 ++++---
 src/backend/executor/nodeModifyTable.c | 375 ++++++++++++++----
 src/backend/optimizer/plan/createplan.c | 4 +
 src/backend/optimizer/plan/setrefs.c | 5 +-
 src/backend/optimizer/util/plancat.c | 27 +-
 src/backend/parser/analyze.c | 54 ++-
 src/backend/parser/gram.y | 20 +-
 src/backend/parser/parse_clause.c | 14 +-
 src/backend/rewrite/rewriteHandler.c | 27 +-
 src/backend/rewrite/rowsecurity.c | 111 +++---
 src/backend/utils/adt/ruleutils.c | 77 ++--
 src/include/catalog/catversion.h | 2 +-
 src/include/nodes/execnodes.h | 13 +-
 src/include/nodes/lockoptions.h | 3 +-
 src/include/nodes/nodes.h | 1 +
 src/include/nodes/parsenodes.h | 10 +-
 src/include/nodes/plannodes.h | 4 +-
 src/include/nodes/primnodes.h | 12 +-
 .../expected/insert-conflict-do-select.out | 138 +++++++
 src/test/isolation/isolation_schedule | 1 +
 .../specs/insert-conflict-do-select.spec | 53 +++
src/test/regress/expected/constraints.out | 4 + src/test/regress/expected/insert_conflict.out | 216 +++++++++- src/test/regress/expected/privileges.out | 26 ++ src/test/regress/expected/rowsecurity.out | 94 ++++- src/test/regress/expected/rules.out | 55 +++ src/test/regress/expected/triggers.out | 10 +- src/test/regress/expected/updatable_views.out | 82 +++- src/test/regress/sql/constraints.sql | 3 + src/test/regress/sql/insert_conflict.sql | 74 +++- src/test/regress/sql/privileges.sql | 18 + src/test/regress/sql/rowsecurity.sql | 57 ++- src/test/regress/sql/rules.sql | 26 ++ src/test/regress/sql/triggers.sql | 3 +- src/test/regress/sql/updatable_views.sql | 31 +- src/tools/pgindent/typedefs.list | 2 +- 48 files changed, 1675 insertions(+), 352 deletions(-) create mode 100644 src/test/isolation/expected/insert-conflict-do-select.out create mode 100644 src/test/isolation/specs/insert-conflict-do-select.spec diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 3572689e33..60d90329a6 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -1856,7 +1856,7 @@ postgresPlanForeignModify(PlannerInfo *root, returningList = (List *) list_nth(plan->returningLists, subplan_index); /* - * ON CONFLICT DO UPDATE and DO NOTHING case with inference specification + * ON CONFLICT DO NOTHING/SELECT/UPDATE with inference specification * should have already been rejected in the optimizer, as presently there * is no way to recognize an arbiter index on a foreign table. Only DO * NOTHING is supported without an inference specification. diff --git a/doc/src/sgml/dml.sgml b/doc/src/sgml/dml.sgml index 61c64cf6c4..cd348d5773 100644 --- a/doc/src/sgml/dml.sgml +++ b/doc/src/sgml/dml.sgml @@ -385,7 +385,7 @@ UPDATE products SET price = price * 1.10 for a DELETE. However, there are situations where it can still be useful for those commands. For example, in an INSERT with an - ON CONFLICT DO UPDATE + ON CONFLICT DO SELECT/UPDATE clause, the old values will be non-NULL for conflicting rows. Similarly, if a DELETE is turned into an UPDATE by a rewrite rule, diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index c6d66414b8..9826e09f98 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -2045,7 +2045,7 @@ GetForeignServerByName(const char *name, bool missing_ok); INSERT with an ON CONFLICT clause does not support specifying the conflict target, as unique constraints or exclusion constraints on remote tables are not locally known. This - in turn implies that ON CONFLICT DO UPDATE is not supported, + in turn implies that ON CONFLICT DO SELECT/UPDATE is not supported, since the specification is mandatory there. diff --git a/doc/src/sgml/mvcc.sgml b/doc/src/sgml/mvcc.sgml index 049ee75a4b..e775260936 100644 --- a/doc/src/sgml/mvcc.sgml +++ b/doc/src/sgml/mvcc.sgml @@ -366,6 +366,18 @@ conventionally visible to the command. + + INSERT with an ON CONFLICT DO + SELECT clause behaves similarly to ON CONFLICT DO + UPDATE. In Read Committed mode, each row proposed for insertion + is guaranteed to either insert or return the conflicting row (unless there are + unrelated errors). If a conflict originates in another transaction whose + effects are not yet visible to the INSERT, the command + will wait for that transaction to commit or roll back, then return the + conflicting row if it was committed (even though that row was not visible + when the command started). 
+ + INSERT with an ON CONFLICT DO NOTHING clause may have insertion not proceed for a row due to diff --git a/doc/src/sgml/postgres-fdw.sgml b/doc/src/sgml/postgres-fdw.sgml index 9b032fbf67..fcf10e4317 100644 --- a/doc/src/sgml/postgres-fdw.sgml +++ b/doc/src/sgml/postgres-fdw.sgml @@ -82,7 +82,7 @@ Note that postgres_fdw currently lacks support for INSERT statements with an ON CONFLICT DO - UPDATE clause. However, the ON CONFLICT DO NOTHING + SELECT/UPDATE clause. However, the ON CONFLICT DO NOTHING clause is supported, provided a unique index inference specification is omitted. Note also that postgres_fdw supports row movement diff --git a/doc/src/sgml/ref/create_policy.sgml b/doc/src/sgml/ref/create_policy.sgml index 9065ccb65f..d8a036739c 100644 --- a/doc/src/sgml/ref/create_policy.sgml +++ b/doc/src/sgml/ref/create_policy.sgml @@ -294,7 +294,7 @@ CREATE POLICY name ON If an INSERT has an ON CONFLICT DO - UPDATE clause, or an ON CONFLICT DO + SELECT/UPDATE clause, or an ON CONFLICT DO NOTHING clause with an arbiter index or constraint specification, then SELECT permissions are required on the relation, and the rows proposed for @@ -338,8 +338,8 @@ CREATE POLICY name ON - Note that an INSERT with an ON CONFLICT - DO NOTHING/UPDATE clause will check the + Note that an INSERT with an + ON CONFLICT clause will check the INSERT policies' WITH CHECK expressions for all rows proposed for insertion, regardless of whether or not they end up being inserted. @@ -352,9 +352,10 @@ CREATE POLICY name ON Using UPDATE for a policy means that it will apply - to UPDATE, SELECT FOR UPDATE, - and SELECT FOR SHARE commands, as well as - auxiliary ON CONFLICT DO UPDATE clauses of + to UPDATE and + SELECT FOR UPDATE/SHARE commands, as well as + auxiliary ON CONFLICT DO UPDATE and + ON CONFLICT DO SELECT FOR UPDATE/SHARE clauses of INSERT commands, and MERGE commands containing UPDATE actions. Since an UPDATE command @@ -578,6 +579,22 @@ CREATE POLICY name ON + + ON CONFLICT DO SELECT + Check existing row + + + + + + + ON CONFLICT DO SELECT FOR UPDATE/SHARE + Check existing row + + Check existing row + + + MERGE Filter source & target rows diff --git a/doc/src/sgml/ref/create_view.sgml b/doc/src/sgml/ref/create_view.sgml index f8a4740608..60215eba3b 100644 --- a/doc/src/sgml/ref/create_view.sgml +++ b/doc/src/sgml/ref/create_view.sgml @@ -415,7 +415,7 @@ CREATE VIEW vista AS SELECT text 'Hello World' AS hello; DELETE, or MERGE statement on the view into the corresponding statement on the underlying base relation. INSERT statements that have an ON - CONFLICT DO UPDATE clause are fully supported. + CONFLICT clause are fully supported. @@ -430,7 +430,7 @@ CREATE VIEW vista AS SELECT text 'Hello World' AS hello; an INSERT or MERGE command can potentially insert base-relation rows that do not satisfy the WHERE condition and thus are not - visible through the view (ON CONFLICT DO UPDATE may + visible through the view (ON CONFLICT DO SELECT/UPDATE may similarly affect an existing row not visible through the view). The CHECK OPTION may be used to prevent INSERT, UPDATE, and diff --git a/doc/src/sgml/ref/insert.sgml b/doc/src/sgml/ref/insert.sgml index 42eec5f4ed..121a9edcb9 100644 --- a/doc/src/sgml/ref/insert.sgml +++ b/doc/src/sgml/ref/insert.sgml @@ -37,6 +37,7 @@ INSERT INTO table_name [ AS and conflict_action is one of: DO NOTHING + DO SELECT [ FOR { UPDATE | NO KEY UPDATE | SHARE | KEY SHARE } ] [ WHERE condition ] DO UPDATE SET { column_name = { expression | DEFAULT } | ( column_name [, ...] 
) = [ ROW ] ( { expression | DEFAULT } [, ...] ) | ( column_name [, ...] ) = ( sub-SELECT ) @@ -89,24 +90,27 @@ INSERT INTO table_name [ AS The optional RETURNING clause causes INSERT to compute and return value(s) based on each row actually inserted - (or updated, if an ON CONFLICT DO UPDATE clause was - used). This is primarily useful for obtaining values that were + (or selected or updated, if an ON CONFLICT DO SELECT/UPDATE + clause was used). This is primarily useful for obtaining values that were supplied by defaults, such as a serial sequence number. However, any expression using the table's columns is allowed. The syntax of the RETURNING list is identical to that of the output list of SELECT. Only rows that were successfully - inserted or updated will be returned. For example, if a row was - locked but not updated because an ON CONFLICT DO UPDATE - ... WHERE clause condition was not satisfied, the - row will not be returned. + inserted, updated, or selected will be returned. For example, if a row was + locked but not updated or selected because an ON CONFLICT ... + WHERE clause condition + was not satisfied, the row will not be returned. You must have INSERT privilege on a table in order to insert into it. If ON CONFLICT DO UPDATE is present, UPDATE privilege on the table is also - required. + required. If ON CONFLICT DO SELECT is present, + SELECT privilege on the table is required. + If ON CONFLICT DO SELECT FOR UPDATE/SHARE is used, + UPDATE privilege is required on at least one + column, in addition to SELECT privilege. @@ -343,8 +347,11 @@ INSERT INTO table_name [ AS For a simple INSERT, all old values will be NULL. However, for an INSERT - with an ON CONFLICT DO UPDATE clause, the old - values may be non-NULL. + with an ON CONFLICT DO SELECT/UPDATE clause, the + old values may be non-NULL (when the row proposed + for insertion conflicts with an existing row). If the + SELECT path is taken, the new values will be + identical to the old values, since no modification takes place. @@ -380,6 +387,9 @@ INSERT INTO table_name [ AS ON CONFLICT DO UPDATE updates the existing row that conflicts with the row proposed for insertion as its alternative action. + ON CONFLICT DO SELECT returns the existing row + that conflicts with the row proposed for insertion, optionally + with row-level locking. @@ -411,6 +421,15 @@ INSERT INTO table_name [ AS . + + ON CONFLICT DO SELECT similarly allows an atomic + INSERT or SELECT outcome. This + is also known as idempotent insert or + get or create. For ON CONFLICT DO + SELECT, a RETURNING clause + must be provided. + + conflict_target @@ -424,7 +443,8 @@ INSERT INTO table_name [ AS conflict_target; when omitted, conflicts with all usable constraints (and unique indexes) are handled. For ON CONFLICT DO - UPDATE, a conflict_target + UPDATE and ON CONFLICT DO SELECT, + a conflict_target must be provided. @@ -434,19 +454,23 @@ INSERT INTO table_name [ AS conflict_action - conflict_action specifies an - alternative ON CONFLICT action. It can be - either DO NOTHING, or a DO - UPDATE clause specifying the exact details of the - UPDATE action to be performed in case of a - conflict. The SET and - WHERE clauses in ON CONFLICT DO - UPDATE have access to the existing row using the - table's name (or an alias), and to the row proposed for insertion - using the special excluded table. - SELECT privilege is required on any column in the - target table where corresponding excluded - columns are read. + conflict_action specifies an alternative + ON CONFLICT action. 
It can be + DO NOTHING, a DO SELECT + clause that allows conflicting rows to be returned, or a + DO UPDATE clause specifying the exact details + of the UPDATE action to be performed in case + of a conflict. + + + The SET clause in DO UPDATE + and the WHERE clause in both + DO SELECT and DO UPDATE have + access to the existing row using the table's name (or an alias), + and to the row proposed for insertion using the special + excluded table. SELECT + privilege is required on any column in the target table where + corresponding excluded columns are read. Note that the effects of all per-row BEFORE @@ -545,24 +569,41 @@ INSERT INTO table_name [ AS + + FOR UPDATE + FOR NO KEY UPDATE + FOR SHARE + FOR KEY SHARE + + + When specified in an ON CONFLICT DO SELECT clause, + conflicting table rows are locked against concurrent updates. + See in the + documentation. + + + + condition An expression that returns a value of type boolean. Only rows for which this expression - returns true will be updated, although all - rows will be locked when the ON CONFLICT DO UPDATE - action is taken. Note that - condition is evaluated last, after - a conflict has been identified as a candidate to update. + returns true will be updated or selected for + return, although all conflicting rows will be locked when + ON CONFLICT DO UPDATE or + ON CONFLICT DO SELECT FOR UPDATE/SHARE is + specified. Note that condition is + evaluated last, after a conflict has been identified as a candidate + to update or select. Note that exclusion constraints are not supported as arbiters with - ON CONFLICT DO UPDATE. In all cases, only + ON CONFLICT DO SELECT/UPDATE. In all cases, only NOT DEFERRABLE constraints and unique indexes are supported as arbiters. @@ -610,7 +651,7 @@ INSERT INTO table_name [ AS oid count The count is the number of - rows inserted or updated. oid is always 0 (it + rows inserted, updated, or selected for return. oid is always 0 (it used to be the OID assigned to the inserted row if count was exactly one and the target table was declared WITH OIDS and 0 otherwise, but creating a table @@ -621,8 +662,7 @@ INSERT oid countINSERT command contains a RETURNING clause, the result will be similar to that of a SELECT statement containing the columns and values defined in the - RETURNING list, computed over the row(s) inserted or - updated by the command. + RETURNING list, computed over the row(s) affected by the command.
@@ -796,6 +836,35 @@ INSERT INTO distributors AS d (did, dname) VALUES (8, 'Anvil Distribution') -- index to arbitrate taking the DO NOTHING action) INSERT INTO distributors (did, dname) VALUES (9, 'Antwerp Design') ON CONFLICT ON CONSTRAINT distributors_pkey DO NOTHING; + +
+ + Insert new distributor if possible, otherwise return the existing + distributor row. Example assumes a unique index has been defined + that constrains values appearing in the did column. + This is useful for get-or-create patterns: + +INSERT INTO distributors (did, dname) VALUES (11, 'Global Electronics') + ON CONFLICT (did) DO SELECT + RETURNING *; + + + + Insert a new distributor if the ID doesn't match, otherwise return + the existing row, if its name doesn't match: + +INSERT INTO distributors AS d (did, dname) VALUES (12, 'Micro Devices Inc') + ON CONFLICT (did) DO SELECT WHERE d.dname != EXCLUDED.dname + RETURNING *; + + + + Insert a new distributor or return and lock the existing row for update. + This is useful when you need to ensure exclusive access to the row: + +INSERT INTO distributors (did, dname) VALUES (13, 'Advanced Systems') + ON CONFLICT (did) DO SELECT FOR UPDATE + RETURNING *; diff --git a/doc/src/sgml/ref/merge.sgml b/doc/src/sgml/ref/merge.sgml index c2e181066a..765fe7a7d6 100644 --- a/doc/src/sgml/ref/merge.sgml +++ b/doc/src/sgml/ref/merge.sgml @@ -714,7 +714,8 @@ MERGE total_count on the behavior at each isolation level. You may also wish to consider using INSERT ... ON CONFLICT as an alternative statement which offers the ability to run an - UPDATE if a concurrent INSERT + UPDATE or return the existing row (with + DO SELECT) if a concurrent INSERT occurs. There are a variety of differences and restrictions between the two statement types and they are not interchangeable. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ff85009930..98d53caeea 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -4689,10 +4689,10 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (result == TM_Invisible) { /* - * This is possible, but only when locking a tuple for ON CONFLICT - * UPDATE. We return this value here rather than throwing an error in - * order to give that case the opportunity to throw a more specific - * error. + * This is possible, but only when locking a tuple for ON CONFLICT DO + * SELECT/UPDATE. We return this value here rather than throwing an + * error in order to give that case the opportunity to throw a more + * specific error. */ result = TM_Invisible; goto out_locked; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index b7bb111688..b9587983f8 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -4672,10 +4672,36 @@ show_modifytable_info(ModifyTableState *mtstate, List *ancestors, if (node->onConflictAction != ONCONFLICT_NONE) { - ExplainPropertyText("Conflict Resolution", - node->onConflictAction == ONCONFLICT_NOTHING ? 
- "NOTHING" : "UPDATE", - es); + const char *resolution = NULL; + + if (node->onConflictAction == ONCONFLICT_NOTHING) + resolution = "NOTHING"; + else if (node->onConflictAction == ONCONFLICT_UPDATE) + resolution = "UPDATE"; + else + { + Assert(node->onConflictAction == ONCONFLICT_SELECT); + switch (node->onConflictLockStrength) + { + case LCS_NONE: + resolution = "SELECT"; + break; + case LCS_FORKEYSHARE: + resolution = "SELECT FOR KEY SHARE"; + break; + case LCS_FORSHARE: + resolution = "SELECT FOR SHARE"; + break; + case LCS_FORNOKEYUPDATE: + resolution = "SELECT FOR NO KEY UPDATE"; + break; + case LCS_FORUPDATE: + resolution = "SELECT FOR UPDATE"; + break; + } + } + + ExplainPropertyText("Conflict Resolution", resolution, es); /* * Don't display arbiter indexes at all when DO NOTHING variant @@ -4684,7 +4710,7 @@ show_modifytable_info(ModifyTableState *mtstate, List *ancestors, if (idxNames) ExplainPropertyList("Conflict Arbiter Indexes", idxNames, es); - /* ON CONFLICT DO UPDATE WHERE qual is specially displayed */ + /* ON CONFLICT DO SELECT/UPDATE WHERE qual is specially displayed */ if (node->onConflictWhere) { show_upper_qual((List *) node->onConflictWhere, "Conflict Filter", diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 6ae0f95959..f0ba7eac87 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -54,9 +54,9 @@ * --------------------- * * Speculative insertion is a two-phase mechanism used to implement - * INSERT ... ON CONFLICT DO UPDATE/NOTHING. The tuple is first inserted - * to the heap and update the indexes as usual, but if a constraint is - * violated, we can still back out the insertion without aborting the whole + * INSERT ... ON CONFLICT. The tuple is first inserted into the heap + * and the indexes are updated as usual, but if a constraint is violated, + * we can still back out of the insertion without aborting the whole * transaction. In an INSERT ... ON CONFLICT statement, if a conflict is * detected, the inserted tuple is backed out and the ON CONFLICT action is * executed instead. diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d13e786cf1..bab294f5e9 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -883,20 +883,27 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; /* - * In the DO UPDATE case, we have some more state to initialize. + * In the DO UPDATE and DO SELECT cases, we have some more state to + * initialize. 
*/ - if (node->onConflictAction == ONCONFLICT_UPDATE) + if (node->onConflictAction == ONCONFLICT_UPDATE || + node->onConflictAction == ONCONFLICT_SELECT) { - OnConflictSetState *onconfl = makeNode(OnConflictSetState); + OnConflictActionState *onconfl = makeNode(OnConflictActionState); TupleConversionMap *map; map = ExecGetRootToChildMap(leaf_part_rri, estate); - Assert(node->onConflictSet != NIL); + Assert(node->onConflictSet != NIL || + node->onConflictAction == ONCONFLICT_SELECT); Assert(rootResultRelInfo->ri_onConflict != NULL); leaf_part_rri->ri_onConflict = onconfl; + /* Lock strength for DO SELECT [FOR UPDATE/SHARE] */ + onconfl->oc_LockStrength = + rootResultRelInfo->ri_onConflict->oc_LockStrength; + /* * Need a separate existing slot for each partition, as the * partition could be of a different AM, even if the tuple @@ -909,7 +916,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, /* * If the partition's tuple descriptor matches exactly the root * parent (the common case), we can re-use most of the parent's ON - * CONFLICT SET state, skipping a bunch of work. Otherwise, we + * CONFLICT action state, skipping a bunch of work. Otherwise, we * need to create state specific to this partition. */ if (map == NULL) @@ -917,7 +924,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, /* * It's safe to reuse these from the partition root, as we * only process one tuple at a time (therefore we won't - * overwrite needed data in slots), and the results of + * overwrite needed data in slots), and the results of any * projections are independent of the underlying storage. * Projections and where clauses themselves don't store state * / are independent of the underlying storage. @@ -931,66 +938,81 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, } else { - List *onconflset; - List *onconflcols; - /* - * Translate expressions in onConflictSet to account for - * different attribute numbers. For that, map partition - * varattnos twice: first to catch the EXCLUDED - * pseudo-relation (INNER_VAR), and second to handle the main - * target relation (firstVarno). + * For ON CONFLICT DO UPDATE, translate expressions in + * onConflictSet to account for different attribute numbers. + * For that, map partition varattnos twice: first to catch the + * EXCLUDED pseudo-relation (INNER_VAR), and second to handle + * the main target relation (firstVarno). */ - onconflset = copyObject(node->onConflictSet); - if (part_attmap == NULL) - part_attmap = - build_attrmap_by_name(RelationGetDescr(partrel), - RelationGetDescr(firstResultRel), - false); - onconflset = (List *) - map_variable_attnos((Node *) onconflset, - INNER_VAR, 0, - part_attmap, - RelationGetForm(partrel)->reltype, - &found_whole_row); - /* We ignore the value of found_whole_row. */ - onconflset = (List *) - map_variable_attnos((Node *) onconflset, - firstVarno, 0, - part_attmap, - RelationGetForm(partrel)->reltype, - &found_whole_row); - /* We ignore the value of found_whole_row. */ - - /* Finally, adjust the target colnos to match the partition. 
*/ - onconflcols = adjust_partition_colnos(node->onConflictCols, - leaf_part_rri); - - /* create the tuple slot for the UPDATE SET projection */ - onconfl->oc_ProjSlot = - table_slot_create(partrel, - &mtstate->ps.state->es_tupleTable); + if (node->onConflictAction == ONCONFLICT_UPDATE) + { + List *onconflset; + List *onconflcols; + + onconflset = copyObject(node->onConflictSet); + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel), + false); + onconflset = (List *) + map_variable_attnos((Node *) onconflset, + INNER_VAR, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ + onconflset = (List *) + map_variable_attnos((Node *) onconflset, + firstVarno, 0, + part_attmap, + RelationGetForm(partrel)->reltype, + &found_whole_row); + /* We ignore the value of found_whole_row. */ - /* build UPDATE SET projection state */ - onconfl->oc_ProjInfo = - ExecBuildUpdateProjection(onconflset, - true, - onconflcols, - partrelDesc, - econtext, - onconfl->oc_ProjSlot, - &mtstate->ps); + /* + * Finally, adjust the target colnos to match the + * partition. + */ + onconflcols = adjust_partition_colnos(node->onConflictCols, + leaf_part_rri); + + /* create the tuple slot for the UPDATE SET projection */ + onconfl->oc_ProjSlot = + table_slot_create(partrel, + &mtstate->ps.state->es_tupleTable); + + /* build UPDATE SET projection state */ + onconfl->oc_ProjInfo = + ExecBuildUpdateProjection(onconflset, + true, + onconflcols, + partrelDesc, + econtext, + onconfl->oc_ProjSlot, + &mtstate->ps); + } /* - * If there is a WHERE clause, initialize state where it will - * be evaluated, mapping the attribute numbers appropriately. - * As with onConflictSet, we need to map partition varattnos - * to the partition's tupdesc. + * For both ON CONFLICT DO UPDATE and ON CONFLICT DO SELECT, + * there may be a WHERE clause. If so, initialize state where + * it will be evaluated, mapping the attribute numbers + * appropriately. As with onConflictSet, we need to map + * partition varattnos twice, to catch both the EXCLUDED + * pseudo-relation (INNER_VAR), and the main target relation + * (firstVarno). 
*/ if (node->onConflictWhere) { List *clause; + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(partrel), + RelationGetDescr(firstResultRel), + false); + clause = copyObject((List *) node->onConflictWhere); clause = (List *) map_variable_attnos((Node *) clause, diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index f5e9d36994..6802fc13e9 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -147,12 +147,24 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ItemPointer tupleid, TupleTableSlot *oldslot, TupleTableSlot *newslot); +static bool ExecOnConflictLockRow(ModifyTableContext *context, + TupleTableSlot *existing, + ItemPointer conflictTid, + Relation relation, + LockTupleMode lockmode, + bool isUpdate); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning); +static bool ExecOnConflictSelect(ModifyTableContext *context, + ResultRelInfo *resultRelInfo, + ItemPointer conflictTid, + TupleTableSlot *excludedSlot, + bool canSetTag, + TupleTableSlot **returning); static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, EState *estate, PartitionTupleRouting *proute, @@ -274,7 +286,7 @@ ExecCheckPlanOutput(Relation resultRel, List *targetList) * * context: context for the ModifyTable operation * resultRelInfo: current result rel - * cmdType: operation/merge action performed (INSERT, UPDATE, or DELETE) + * isDelete: true if the operation/merge action is a DELETE * oldSlot: slot holding old tuple deleted or updated * newSlot: slot holding new tuple inserted or updated * planSlot: slot holding tuple returned by top subplan node @@ -283,12 +295,15 @@ ExecCheckPlanOutput(Relation resultRel, List *targetList) * econtext's scan tuple and its old & new tuples are not needed (FDW direct- * modify is disabled if the RETURNING list refers to any OLD/NEW values). * + * Note: For the SELECT path of INSERT ... ON CONFLICT DO SELECT, oldSlot and + * newSlot are both the existing tuple, since it's not changed. + * * Returns a slot holding the result tuple */ static TupleTableSlot * ExecProcessReturning(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - CmdType cmdType, + bool isDelete, TupleTableSlot *oldSlot, TupleTableSlot *newSlot, TupleTableSlot *planSlot) @@ -298,23 +313,17 @@ ExecProcessReturning(ModifyTableContext *context, ExprContext *econtext = projectReturning->pi_exprContext; /* Make tuple and any needed join variables available to ExecProject */ - switch (cmdType) + if (isDelete) { - case CMD_INSERT: - case CMD_UPDATE: - /* return new tuple by default */ - if (newSlot) - econtext->ecxt_scantuple = newSlot; - break; - - case CMD_DELETE: - /* return old tuple by default */ - if (oldSlot) - econtext->ecxt_scantuple = oldSlot; - break; - - default: - elog(ERROR, "unrecognized commandType: %d", (int) cmdType); + /* return old tuple by default */ + if (oldSlot) + econtext->ecxt_scantuple = oldSlot; + } + else + { + /* return new tuple by default */ + if (newSlot) + econtext->ecxt_scantuple = newSlot; } econtext->ecxt_outertuple = planSlot; @@ -1158,6 +1167,26 @@ ExecInsert(ModifyTableContext *context, else goto vlock; } + else if (onconflict == ONCONFLICT_SELECT) + { + /* + * In case of ON CONFLICT DO SELECT, optionally lock the + * conflicting tuple, fetch it and project RETURNING on + * it. 
Be prepared to retry if locking fails because of a + * concurrent UPDATE/DELETE to the conflict tuple. + */ + TupleTableSlot *returning = NULL; + + if (ExecOnConflictSelect(context, resultRelInfo, + &conflictTid, slot, canSetTag, + &returning)) + { + InstrCountTuples2(&mtstate->ps, 1); + return returning; + } + else + goto vlock; + } else { /* @@ -1329,7 +1358,7 @@ ExecInsert(ModifyTableContext *context, } } - result = ExecProcessReturning(context, resultRelInfo, CMD_INSERT, + result = ExecProcessReturning(context, resultRelInfo, false, oldSlot, slot, planSlot); /* @@ -1890,7 +1919,7 @@ ExecDelete(ModifyTableContext *context, return NULL; } - rslot = ExecProcessReturning(context, resultRelInfo, CMD_DELETE, + rslot = ExecProcessReturning(context, resultRelInfo, true, slot, NULL, context->planSlot); /* @@ -2692,56 +2721,37 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) - return ExecProcessReturning(context, resultRelInfo, CMD_UPDATE, + return ExecProcessReturning(context, resultRelInfo, false, oldSlot, slot, context->planSlot); return NULL; } /* - * ExecOnConflictUpdate --- execute UPDATE of INSERT ON CONFLICT DO UPDATE + * ExecOnConflictLockRow --- lock the row for ON CONFLICT DO SELECT/UPDATE * - * Try to lock tuple for update as part of speculative insertion. If - * a qual originating from ON CONFLICT DO UPDATE is satisfied, update - * (but still lock row, even though it may not satisfy estate's - * snapshot). + * Try to lock tuple for update as part of speculative insertion for ON + * CONFLICT DO UPDATE or ON CONFLICT DO SELECT FOR UPDATE/SHARE. * - * Returns true if we're done (with or without an update), or false if - * the caller must retry the INSERT from scratch. + * Returns true if the row is successfully locked, or false if the caller must + * retry the INSERT from scratch. */ static bool -ExecOnConflictUpdate(ModifyTableContext *context, - ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, - TupleTableSlot *excludedSlot, - bool canSetTag, - TupleTableSlot **returning) +ExecOnConflictLockRow(ModifyTableContext *context, + TupleTableSlot *existing, + ItemPointer conflictTid, + Relation relation, + LockTupleMode lockmode, + bool isUpdate) { - ModifyTableState *mtstate = context->mtstate; - ExprContext *econtext = mtstate->ps.ps_ExprContext; - Relation relation = resultRelInfo->ri_RelationDesc; - ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; - TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; TM_FailureData tmfd; - LockTupleMode lockmode; TM_Result test; Datum xminDatum; TransactionId xmin; bool isnull; /* - * Parse analysis should have blocked ON CONFLICT for all system - * relations, which includes these. There's no fundamental obstacle to - * supporting this; we'd just need to handle LOCKTAG_TUPLE like the other - * ExecUpdate() caller. - */ - Assert(!resultRelInfo->ri_needLockTagTuple); - - /* Determine lock mode to use */ - lockmode = ExecUpdateLockMode(context->estate, resultRelInfo); - - /* - * Lock tuple for update. Don't follow updates when tuple cannot be + * Lock tuple with lockmode. Don't follow updates when tuple cannot be * locked without doing so. A row locking conflict here means our * previous conclusion that the tuple is conclusively committed is not * true anymore. 
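A hedged SQL sketch of the cardinality rule that the locking path enforces (the table name is illustrative; the error text matches the message used in ExecOnConflictLockRow in the next hunk):

    CREATE TABLE t (key int PRIMARY KEY, val text);

    -- The second proposed row conflicts with the row this same command just
    -- inserted, so the attempt to lock it a second time is rejected:
    INSERT INTO t VALUES (1, 'a'), (1, 'b')
        ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *;
    -- ERROR:  ON CONFLICT DO SELECT command cannot affect row a second time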
@@ -2786,7 +2796,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, (errcode(ERRCODE_CARDINALITY_VIOLATION), /* translator: %s is a SQL command name */ errmsg("%s command cannot affect row a second time", - "ON CONFLICT DO UPDATE"), + isUpdate ? "ON CONFLICT DO UPDATE" : "ON CONFLICT DO SELECT"), errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); /* This shouldn't happen */ @@ -2834,6 +2844,50 @@ ExecOnConflictUpdate(ModifyTableContext *context, } /* Success, the tuple is locked. */ + return true; +} + +/* + * ExecOnConflictUpdate --- execute UPDATE of INSERT ON CONFLICT DO UPDATE + * + * Try to lock tuple for update as part of speculative insertion. If + * a qual originating from ON CONFLICT DO UPDATE is satisfied, update + * (but still lock row, even though it may not satisfy estate's + * snapshot). + * + * Returns true if we're done (with or without an update), or false if + * the caller must retry the INSERT from scratch. + */ +static bool +ExecOnConflictUpdate(ModifyTableContext *context, + ResultRelInfo *resultRelInfo, + ItemPointer conflictTid, + TupleTableSlot *excludedSlot, + bool canSetTag, + TupleTableSlot **returning) +{ + ModifyTableState *mtstate = context->mtstate; + ExprContext *econtext = mtstate->ps.ps_ExprContext; + Relation relation = resultRelInfo->ri_RelationDesc; + ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; + TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; + LockTupleMode lockmode; + + /* + * Parse analysis should have blocked ON CONFLICT for all system + * relations, which includes these. There's no fundamental obstacle to + * supporting this; we'd just need to handle LOCKTAG_TUPLE like the other + * ExecUpdate() caller. + */ + Assert(!resultRelInfo->ri_needLockTagTuple); + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(context->estate, resultRelInfo); + + /* Lock tuple for update */ + if (!ExecOnConflictLockRow(context, existing, conflictTid, + resultRelInfo->ri_RelationDesc, lockmode, true)) + return false; /* * Verify that the tuple is visible to our MVCC snapshot if the current @@ -2875,11 +2929,13 @@ ExecOnConflictUpdate(ModifyTableContext *context, * security barrier quals (if any), enforced here as RLS checks/WCOs. * * The rewriter creates UPDATE RLS checks/WCOs for UPDATE security - * quals, and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK, - * but that's almost the extent of its special handling for ON - * CONFLICT DO UPDATE. + * quals, and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK. + * Since SELECT permission on the target table is always required for + * INSERT ... ON CONFLICT DO UPDATE, the rewriter also adds SELECT RLS + * checks/WCOs for SELECT security quals, using WCOs of the same kind, + * and this check enforces them too. * - * The rewriter will also have associated UPDATE applicable straight + * The rewriter will also have associated UPDATE-applicable straight * RLS checks/WCOs for the benefit of the ExecUpdate() call that * follows. INSERTs and UPDATEs naturally have mutually exclusive WCO * kinds, so there is no danger of spurious over-enforcement in the @@ -2924,6 +2980,141 @@ ExecOnConflictUpdate(ModifyTableContext *context, return true; } +/* + * ExecOnConflictSelect --- execute SELECT of INSERT ON CONFLICT DO SELECT + * + * If SELECT FOR UPDATE/SHARE is specified, try to lock tuple as part of + * speculative insertion. 
If a qual originating from ON CONFLICT DO SELECT is
+ * satisfied, select (but still lock row, even though it may not satisfy
+ * estate's snapshot).
+ *
+ * Returns true if we're done (with or without a select), or false if the
+ * caller must retry the INSERT from scratch.
+ */
+static bool
+ExecOnConflictSelect(ModifyTableContext *context,
+                     ResultRelInfo *resultRelInfo,
+                     ItemPointer conflictTid,
+                     TupleTableSlot *excludedSlot,
+                     bool canSetTag,
+                     TupleTableSlot **returning)
+{
+    ModifyTableState *mtstate = context->mtstate;
+    ExprContext *econtext = mtstate->ps.ps_ExprContext;
+    Relation    relation = resultRelInfo->ri_RelationDesc;
+    ExprState  *onConflictSelectWhere = resultRelInfo->ri_onConflict->oc_WhereClause;
+    TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing;
+    LockClauseStrength lockStrength = resultRelInfo->ri_onConflict->oc_LockStrength;
+
+    /*
+     * Parse analysis should have blocked ON CONFLICT for all system
+     * relations, which includes these.  There's no fundamental obstacle to
+     * supporting this; we'd just need to handle LOCKTAG_TUPLE appropriately.
+     */
+    Assert(!resultRelInfo->ri_needLockTagTuple);
+
+    /* Fetch/lock existing tuple according to the requested lock strength */
+    if (lockStrength == LCS_NONE)
+    {
+        if (!table_tuple_fetch_row_version(relation,
+                                           conflictTid,
+                                           SnapshotAny,
+                                           existing))
+            elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT");
+    }
+    else
+    {
+        LockTupleMode lockmode;
+
+        switch (lockStrength)
+        {
+            case LCS_FORKEYSHARE:
+                lockmode = LockTupleKeyShare;
+                break;
+            case LCS_FORSHARE:
+                lockmode = LockTupleShare;
+                break;
+            case LCS_FORNOKEYUPDATE:
+                lockmode = LockTupleNoKeyExclusive;
+                break;
+            case LCS_FORUPDATE:
+                lockmode = LockTupleExclusive;
+                break;
+            default:
+                elog(ERROR, "unexpected lock strength %d", (int) lockStrength);
+        }
+
+        if (!ExecOnConflictLockRow(context, existing, conflictTid,
+                                   resultRelInfo->ri_RelationDesc, lockmode, false))
+            return false;
+    }
+
+    /*
+     * Verify that the tuple is visible to our MVCC snapshot if the current
+     * isolation level mandates that.  See comments in ExecOnConflictUpdate().
+     */
+    ExecCheckTupleVisible(context->estate, relation, existing);
+
+    /*
+     * Make tuple and any needed join variables available to ExecQual.  The
+     * EXCLUDED tuple is installed in ecxt_innertuple, while the target's
+     * existing tuple is installed in the scantuple.  EXCLUDED has been made
+     * to reference INNER_VAR in setrefs.c, but there is no other redirection.
+     */
+    econtext->ecxt_scantuple = existing;
+    econtext->ecxt_innertuple = excludedSlot;
+    econtext->ecxt_outertuple = NULL;
+
+    if (!ExecQual(onConflictSelectWhere, econtext))
+    {
+        ExecClearTuple(existing);   /* see return below */
+        InstrCountFiltered1(&mtstate->ps, 1);
+        return true;            /* done with the tuple */
+    }
+
+    if (resultRelInfo->ri_WithCheckOptions != NIL)
+    {
+        /*
+         * Check target's existing tuple against SELECT-applicable USING
+         * security barrier quals (if any), enforced here as RLS checks/WCOs.
+         *
+         * The rewriter creates WCOs from the USING quals of SELECT policies,
+         * and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK.  If FOR
+         * UPDATE/SHARE was specified, UPDATE permissions are required on the
+         * target table, and the rewriter also adds WCOs built from the USING
+         * quals of UPDATE policies, using WCOs of the same kind, and this
+         * check enforces them too.
+ */ + ExecWithCheckOptions(WCO_RLS_CONFLICT_CHECK, resultRelInfo, + existing, + mtstate->ps.state); + } + + /* RETURNING is required for DO SELECT */ + Assert(resultRelInfo->ri_projectReturning); + + *returning = ExecProcessReturning(context, resultRelInfo, false, + existing, existing, context->planSlot); + + if (canSetTag) + context->estate->es_processed++; + + /* + * Before releasing the existing tuple, make sure that the returning slot + * has a local copy of any pass-by-reference values. + */ + ExecMaterializeSlot(*returning); + + /* + * Clear out existing tuple, as there might not be another conflict among + * the next input rows. Don't want to hold resources till the end of the + * query. + */ + ExecClearTuple(existing); + + return true; +} + /* * Perform MERGE. */ @@ -3549,7 +3740,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, case CMD_UPDATE: rslot = ExecProcessReturning(context, resultRelInfo, - CMD_UPDATE, + false, resultRelInfo->ri_oldTupleSlot, newslot, context->planSlot); @@ -3558,7 +3749,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, case CMD_DELETE: rslot = ExecProcessReturning(context, resultRelInfo, - CMD_DELETE, + true, resultRelInfo->ri_oldTupleSlot, NULL, context->planSlot); @@ -4329,7 +4520,8 @@ ExecModifyTable(PlanState *pstate) Assert((resultRelInfo->ri_projectReturning->pi_state.flags & EEO_FLAG_HAS_OLD) == 0 && (resultRelInfo->ri_projectReturning->pi_state.flags & EEO_FLAG_HAS_NEW) == 0); - slot = ExecProcessReturning(&context, resultRelInfo, operation, + slot = ExecProcessReturning(&context, resultRelInfo, + operation == CMD_DELETE, NULL, NULL, context.planSlot); return slot; @@ -5031,49 +5223,60 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) } /* - * If needed, Initialize target list, projection and qual for ON CONFLICT - * DO UPDATE. + * For ON CONFLICT DO SELECT/UPDATE, initialize the ON CONFLICT action + * state. */ - if (node->onConflictAction == ONCONFLICT_UPDATE) + if (node->onConflictAction == ONCONFLICT_UPDATE || + node->onConflictAction == ONCONFLICT_SELECT) { - OnConflictSetState *onconfl = makeNode(OnConflictSetState); - ExprContext *econtext; - TupleDesc relationDesc; + OnConflictActionState *onconfl = makeNode(OnConflictActionState); /* already exists if created by RETURNING processing above */ if (mtstate->ps.ps_ExprContext == NULL) ExecAssignExprContext(estate, &mtstate->ps); - econtext = mtstate->ps.ps_ExprContext; - relationDesc = resultRelInfo->ri_RelationDesc->rd_att; - - /* create state for DO UPDATE SET operation */ + /* action state for DO SELECT/UPDATE */ resultRelInfo->ri_onConflict = onconfl; + /* lock strength for DO SELECT [FOR UPDATE/SHARE] */ + onconfl->oc_LockStrength = node->onConflictLockStrength; + /* initialize slot for the existing tuple */ onconfl->oc_Existing = table_slot_create(resultRelInfo->ri_RelationDesc, &mtstate->ps.state->es_tupleTable); /* - * Create the tuple slot for the UPDATE SET projection. We want a slot - * of the table's type here, because the slot will be used to insert - * into the table, and for RETURNING processing - which may access - * system attributes. + * For ON CONFLICT DO UPDATE, initialize target list and projection. 
*/ - onconfl->oc_ProjSlot = - table_slot_create(resultRelInfo->ri_RelationDesc, - &mtstate->ps.state->es_tupleTable); + if (node->onConflictAction == ONCONFLICT_UPDATE) + { + ExprContext *econtext; + TupleDesc relationDesc; + + econtext = mtstate->ps.ps_ExprContext; + relationDesc = resultRelInfo->ri_RelationDesc->rd_att; - /* build UPDATE SET projection state */ - onconfl->oc_ProjInfo = - ExecBuildUpdateProjection(node->onConflictSet, - true, - node->onConflictCols, - relationDesc, - econtext, - onconfl->oc_ProjSlot, - &mtstate->ps); + /* + * Create the tuple slot for the UPDATE SET projection. We want a + * slot of the table's type here, because the slot will be used to + * insert into the table, and for RETURNING processing - which may + * access system attributes. + */ + onconfl->oc_ProjSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &mtstate->ps.state->es_tupleTable); + + /* build UPDATE SET projection state */ + onconfl->oc_ProjInfo = + ExecBuildUpdateProjection(node->onConflictSet, + true, + node->onConflictCols, + relationDesc, + econtext, + onconfl->oc_ProjSlot, + &mtstate->ps); + } /* initialize state to evaluate the WHERE clause, if any */ if (node->onConflictWhere) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 959df43c39..21f1988cf2 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -7043,6 +7043,7 @@ make_modifytable(PlannerInfo *root, Plan *subplan, if (!onconflict) { node->onConflictAction = ONCONFLICT_NONE; + node->onConflictLockStrength = LCS_NONE; node->onConflictSet = NIL; node->onConflictCols = NIL; node->onConflictWhere = NULL; @@ -7054,6 +7055,9 @@ make_modifytable(PlannerInfo *root, Plan *subplan, { node->onConflictAction = onconflict->action; + /* Lock strength for ON CONFLICT DO SELECT [FOR UPDATE/SHARE] */ + node->onConflictLockStrength = onconflict->lockStrength; + /* * Here we convert the ON CONFLICT UPDATE tlist, if any, to the * executor's convention of having consecutive resno's. The actual diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 5ad6c13830..1b5b9b5ed9 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1163,7 +1163,8 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) * those are already used by RETURNING and it seems better to * be non-conflicting. */ - if (splan->onConflictSet) + if (splan->onConflictAction == ONCONFLICT_UPDATE || + splan->onConflictAction == ONCONFLICT_SELECT) { indexed_tlist *itlist; @@ -3146,7 +3147,7 @@ search_indexed_tlist_for_sortgroupref(Expr *node, * other-relation Vars by OUTER_VAR references, while leaving target Vars * alone. Thus inner_itlist = NULL and acceptable_rel = the ID of the * target relation should be passed. - * 3) ON CONFLICT UPDATE SET/WHERE clauses. Here references to EXCLUDED are + * 3) ON CONFLICT SET and WHERE clauses. Here references to EXCLUDED are * to be replaced with INNER_VAR references, while leaving target Vars (the * to-be-updated relation) alone. 
Correspondingly inner_itlist is to be * EXCLUDED elements, outer_itlist = NULL and acceptable_rel the target diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 1b20bc805e..d63e7390be 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -841,9 +841,9 @@ infer_arbiter_indexes(PlannerInfo *root) /* * Quickly return NIL for ON CONFLICT DO NOTHING without an inference - * specification or named constraint. ON CONFLICT DO UPDATE statements - * must always provide one or the other (but parser ought to have caught - * that already). + * specification or named constraint. ON CONFLICT DO SELECT/UPDATE + * statements must always provide one or the other (but parser ought to + * have caught that already). */ if (onconflict->arbiterElems == NIL && onconflict->constraint == InvalidOid) @@ -1024,10 +1024,17 @@ infer_arbiter_indexes(PlannerInfo *root) */ if (indexOidFromConstraint == idxForm->indexrelid) { - if (idxForm->indisexclusion && onconflict->action == ONCONFLICT_UPDATE) + /* + * ON CONFLICT DO UPDATE and ON CONFLICT DO SELECT are not + * supported with exclusion constraints. + */ + if (idxForm->indisexclusion && + (onconflict->action == ONCONFLICT_UPDATE || + onconflict->action == ONCONFLICT_SELECT)) ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("ON CONFLICT DO UPDATE not supported with exclusion constraints"))); + errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("ON CONFLICT DO %s not supported with exclusion constraints", + onconflict->action == ONCONFLICT_UPDATE ? "UPDATE" : "SELECT")); /* Consider this one a match already */ results = lappend_oid(results, idxForm->indexrelid); @@ -1037,10 +1044,12 @@ infer_arbiter_indexes(PlannerInfo *root) else if (indexOidFromConstraint != InvalidOid) { /* - * In the case of "ON constraint_name DO UPDATE" we need to skip - * non-unique candidates. + * In the case of "ON constraint_name DO SELECT/UPDATE" we need to + * skip non-unique candidates. */ - if (!idxForm->indisunique && onconflict->action == ONCONFLICT_UPDATE) + if (!idxForm->indisunique && + (onconflict->action == ONCONFLICT_UPDATE || + onconflict->action == ONCONFLICT_SELECT)) continue; } else diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 50d51c880d..539c16c4f7 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -650,7 +650,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) ListCell *icols; ListCell *attnos; ListCell *lc; - bool isOnConflictUpdate; + bool requiresUpdatePerm; AclMode targetPerms; /* There can't be any outer WITH to worry about */ @@ -668,8 +668,14 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->override = stmt->override; - isOnConflictUpdate = (stmt->onConflictClause && - stmt->onConflictClause->action == ONCONFLICT_UPDATE); + /* + * ON CONFLICT DO UPDATE and ON CONFLICT DO SELECT FOR UPDATE/SHARE + * require UPDATE permission on the target relation. + */ + requiresUpdatePerm = (stmt->onConflictClause && + (stmt->onConflictClause->action == ONCONFLICT_UPDATE || + (stmt->onConflictClause->action == ONCONFLICT_SELECT && + stmt->onConflictClause->lockStrength != LCS_NONE))); /* * We have three cases to deal with: DEFAULT VALUES (selectStmt == NULL), @@ -719,7 +725,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) * to the joinlist or namespace. 
*/ targetPerms = ACL_INSERT; - if (isOnConflictUpdate) + if (requiresUpdatePerm) targetPerms |= ACL_UPDATE; qry->resultRelation = setTargetTable(pstate, stmt->relation, false, false, targetPerms); @@ -1026,6 +1032,15 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) false, true, true); } + /* ON CONFLICT DO SELECT requires a RETURNING clause */ + if (stmt->onConflictClause && + stmt->onConflictClause->action == ONCONFLICT_SELECT && + !stmt->returningClause) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("ON CONFLICT DO SELECT requires a RETURNING clause"), + parser_errposition(pstate, stmt->onConflictClause->location)); + /* Process ON CONFLICT, if any. */ if (stmt->onConflictClause) qry->onConflict = transformOnConflictClause(pstate, @@ -1184,12 +1199,13 @@ transformOnConflictClause(ParseState *pstate, OnConflictExpr *result; /* - * If this is ON CONFLICT ... UPDATE, first create the range table entry - * for the EXCLUDED pseudo relation, so that that will be present while - * processing arbiter expressions. (You can't actually reference it from - * there, but this provides a useful error message if you try.) + * If this is ON CONFLICT DO SELECT/UPDATE, first create the range table + * entry for the EXCLUDED pseudo relation, so that that will be present + * while processing arbiter expressions. (You can't actually reference it + * from there, but this provides a useful error message if you try.) */ - if (onConflictClause->action == ONCONFLICT_UPDATE) + if (onConflictClause->action == ONCONFLICT_UPDATE || + onConflictClause->action == ONCONFLICT_SELECT) { Relation targetrel = pstate->p_target_relation; RangeTblEntry *exclRte; @@ -1218,21 +1234,22 @@ transformOnConflictClause(ParseState *pstate, transformOnConflictArbiter(pstate, onConflictClause, &arbiterElems, &arbiterWhere, &arbiterConstraint); - /* Process DO UPDATE */ - if (onConflictClause->action == ONCONFLICT_UPDATE) + /* Process DO SELECT/UPDATE */ + if (onConflictClause->action == ONCONFLICT_UPDATE || + onConflictClause->action == ONCONFLICT_SELECT) { /* * Add the EXCLUDED pseudo relation to the query namespace, making it - * available in the UPDATE subexpressions. + * available in SET and WHERE subexpressions. */ addNSItemToQuery(pstate, exclNSItem, false, true, true); - /* - * Now transform the UPDATE subexpressions. 
- */ - onConflictSet = - transformUpdateTargetList(pstate, onConflictClause->targetList); + /* Process the UPDATE SET clause */ + if (onConflictClause->action == ONCONFLICT_UPDATE) + onConflictSet = + transformUpdateTargetList(pstate, onConflictClause->targetList); + /* Process the SELECT/UPDATE WHERE clause */ onConflictWhere = transformWhereClause(pstate, onConflictClause->whereClause, EXPR_KIND_WHERE, "WHERE"); @@ -1246,13 +1263,14 @@ transformOnConflictClause(ParseState *pstate, pstate->p_namespace = list_delete_last(pstate->p_namespace); } - /* Finally, build ON CONFLICT DO [NOTHING | UPDATE] expression */ + /* Finally, build ON CONFLICT DO [NOTHING | SELECT | UPDATE] expression */ result = makeNode(OnConflictExpr); result->action = onConflictClause->action; result->arbiterElems = arbiterElems; result->arbiterWhere = arbiterWhere; result->constraint = arbiterConstraint; + result->lockStrength = onConflictClause->lockStrength; result->onConflictSet = onConflictSet; result->onConflictWhere = onConflictWhere; result->exclRelIndex = exclRelIndex; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 713ee5c10a..c567252acc 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -481,7 +481,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type OptNoLog %type OnCommitOption -%type for_locking_strength +%type for_locking_strength opt_for_locking_strength %type for_locking_item %type for_locking_clause opt_for_locking_clause for_locking_items %type locked_rels_list @@ -12496,12 +12496,24 @@ insert_column_item: ; opt_on_conflict: + ON CONFLICT opt_conf_expr DO SELECT opt_for_locking_strength where_clause + { + $$ = makeNode(OnConflictClause); + $$->action = ONCONFLICT_SELECT; + $$->infer = $3; + $$->targetList = NIL; + $$->lockStrength = $6; + $$->whereClause = $7; + $$->location = @1; + } + | ON CONFLICT opt_conf_expr DO UPDATE SET set_clause_list where_clause { $$ = makeNode(OnConflictClause); $$->action = ONCONFLICT_UPDATE; $$->infer = $3; $$->targetList = $7; + $$->lockStrength = LCS_NONE; $$->whereClause = $8; $$->location = @1; } @@ -12512,6 +12524,7 @@ opt_on_conflict: $$->action = ONCONFLICT_NOTHING; $$->infer = $3; $$->targetList = NIL; + $$->lockStrength = LCS_NONE; $$->whereClause = NULL; $$->location = @1; } @@ -13741,6 +13754,11 @@ for_locking_strength: | FOR KEY SHARE { $$ = LCS_FORKEYSHARE; } ; +opt_for_locking_strength: + for_locking_strength { $$ = $1; } + | /* EMPTY */ { $$ = LCS_NONE; } + ; + locked_rels_list: OF qualified_name_list { $$ = $2; } | /* EMPTY */ { $$ = NIL; } diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index e35fd25c9b..06b65d4a60 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -3373,13 +3373,15 @@ transformOnConflictArbiter(ParseState *pstate, *arbiterWhere = NULL; *constraint = InvalidOid; - if (onConflictClause->action == ONCONFLICT_UPDATE && !infer) + if ((onConflictClause->action == ONCONFLICT_UPDATE || + onConflictClause->action == ONCONFLICT_SELECT) && !infer) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("ON CONFLICT DO UPDATE requires inference specification or constraint name"), - errhint("For example, ON CONFLICT (column_name)."), - parser_errposition(pstate, - exprLocation((Node *) onConflictClause)))); + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("ON CONFLICT DO %s requires inference specification or constraint name", + onConflictClause->action == ONCONFLICT_UPDATE ? 
"UPDATE" : "SELECT"), + errhint("For example, ON CONFLICT (column_name)."), + parser_errposition(pstate, + exprLocation((Node *) onConflictClause))); /* * To simplify certain aspects of its design, speculative insertion into diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 19dcce80ec..7c99290be4 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -658,6 +658,19 @@ rewriteRuleAction(Query *parsetree, rule_action = sub_action; } + /* + * If rule_action is INSERT .. ON CONFLICT DO SELECT, the parser should + * have verified that it has a RETURNING clause, but we must also check + * that the triggering query has a RETURNING clause. + */ + if (rule_action->onConflict && + rule_action->onConflict->action == ONCONFLICT_SELECT && + (!rule_action->returningList || !parsetree->returningList)) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("ON CONFLICT DO SELECT requires a RETURNING clause"), + errdetail("A rule action is INSERT ... ON CONFLICT DO SELECT, which requires a RETURNING clause.")); + /* * If rule_action has a RETURNING clause, then either throw it away if the * triggering query has no RETURNING clause, or rewrite it to emit what @@ -3643,11 +3656,12 @@ rewriteTargetView(Query *parsetree, Relation view) } /* - * For INSERT .. ON CONFLICT .. DO UPDATE, we must also update assorted - * stuff in the onConflict data structure. + * For INSERT .. ON CONFLICT .. DO SELECT/UPDATE, we must also update + * assorted stuff in the onConflict data structure. */ if (parsetree->onConflict && - parsetree->onConflict->action == ONCONFLICT_UPDATE) + (parsetree->onConflict->action == ONCONFLICT_UPDATE || + parsetree->onConflict->action == ONCONFLICT_SELECT)) { Index old_exclRelIndex, new_exclRelIndex; @@ -3656,9 +3670,8 @@ rewriteTargetView(Query *parsetree, Relation view) List *tmp_tlist; /* - * Like the INSERT/UPDATE code above, update the resnos in the - * auxiliary UPDATE targetlist to refer to columns of the base - * relation. + * For ON CONFLICT DO UPDATE, update the resnos in the auxiliary + * UPDATE targetlist to refer to columns of the base relation. */ foreach(lc, parsetree->onConflict->onConflictSet) { @@ -3677,7 +3690,7 @@ rewriteTargetView(Query *parsetree, Relation view) } /* - * Also, create a new RTE for the EXCLUDED pseudo-relation, using the + * Create a new RTE for the EXCLUDED pseudo-relation, using the * query's new base rel (which may well have a different column list * from the view, hence we need a new column alias list). This should * match transformOnConflictClause. In particular, note that the diff --git a/src/backend/rewrite/rowsecurity.c b/src/backend/rewrite/rowsecurity.c index 93a205d02b..e88a1bc1a8 100644 --- a/src/backend/rewrite/rowsecurity.c +++ b/src/backend/rewrite/rowsecurity.c @@ -301,40 +301,48 @@ get_row_security_policies(Query *root, RangeTblEntry *rte, int rt_index, } /* - * For INSERT ... ON CONFLICT DO UPDATE we need additional policy - * checks for the UPDATE which may be applied to the same RTE. + * For INSERT ... ON CONFLICT DO SELECT/UPDATE we need additional + * policy checks for the SELECT/UPDATE which may be applied to the + * same RTE. 
*/ - if (commandType == CMD_INSERT && - root->onConflict && root->onConflict->action == ONCONFLICT_UPDATE) + if (commandType == CMD_INSERT && root->onConflict && + (root->onConflict->action == ONCONFLICT_UPDATE || + root->onConflict->action == ONCONFLICT_SELECT)) { - List *conflict_permissive_policies; - List *conflict_restrictive_policies; + List *conflict_permissive_policies = NIL; + List *conflict_restrictive_policies = NIL; List *conflict_select_permissive_policies = NIL; List *conflict_select_restrictive_policies = NIL; - /* Get the policies that apply to the auxiliary UPDATE */ - get_policies_for_relation(rel, CMD_UPDATE, user_id, - &conflict_permissive_policies, - &conflict_restrictive_policies); - - /* - * Enforce the USING clauses of the UPDATE policies using WCOs - * rather than security quals. This ensures that an error is - * raised if the conflicting row cannot be updated due to RLS, - * rather than the change being silently dropped. - */ - add_with_check_options(rel, rt_index, - WCO_RLS_CONFLICT_CHECK, - conflict_permissive_policies, - conflict_restrictive_policies, - withCheckOptions, - hasSubLinks, - true); + if (perminfo->requiredPerms & ACL_UPDATE) + { + /* + * Get the policies that apply to the auxiliary UPDATE or + * SELECT FOR UPDATE/SHARE. + */ + get_policies_for_relation(rel, CMD_UPDATE, user_id, + &conflict_permissive_policies, + &conflict_restrictive_policies); + + /* + * Enforce the USING clauses of the UPDATE policies using WCOs + * rather than security quals. This ensures that an error is + * raised if the conflicting row cannot be updated/locked due + * to RLS, rather than the change being silently dropped. + */ + add_with_check_options(rel, rt_index, + WCO_RLS_CONFLICT_CHECK, + conflict_permissive_policies, + conflict_restrictive_policies, + withCheckOptions, + hasSubLinks, + true); + } /* * Get and add ALL/SELECT policies, as WCO_RLS_CONFLICT_CHECK WCOs - * to ensure they are considered when taking the UPDATE path of an - * INSERT .. ON CONFLICT DO UPDATE, if SELECT rights are required + * to ensure they are considered when taking the SELECT/UPDATE + * path of an INSERT .. ON CONFLICT, if SELECT rights are required * for this relation, also as WCO policies, again, to avoid * silently dropping data. See above. */ @@ -352,29 +360,36 @@ get_row_security_policies(Query *root, RangeTblEntry *rte, int rt_index, true); } - /* Enforce the WITH CHECK clauses of the UPDATE policies */ - add_with_check_options(rel, rt_index, - WCO_RLS_UPDATE_CHECK, - conflict_permissive_policies, - conflict_restrictive_policies, - withCheckOptions, - hasSubLinks, - false); - /* - * Add ALL/SELECT policies as WCO_RLS_UPDATE_CHECK WCOs, to ensure - * that the final updated row is visible when taking the UPDATE - * path of an INSERT .. ON CONFLICT DO UPDATE, if SELECT rights - * are required for this relation. + * For INSERT .. ON CONFLICT DO UPDATE, add additional policies to + * be checked when the auxiliary UPDATE is executed. 
*/ - if (perminfo->requiredPerms & ACL_SELECT) + if (root->onConflict->action == ONCONFLICT_UPDATE) + { + /* Enforce the WITH CHECK clauses of the UPDATE policies */ add_with_check_options(rel, rt_index, WCO_RLS_UPDATE_CHECK, - conflict_select_permissive_policies, - conflict_select_restrictive_policies, + conflict_permissive_policies, + conflict_restrictive_policies, withCheckOptions, hasSubLinks, - true); + false); + + /* + * Add ALL/SELECT policies as WCO_RLS_UPDATE_CHECK WCOs, to + * ensure that the final updated row is visible when taking + * the UPDATE path of an INSERT .. ON CONFLICT, if SELECT + * rights are required for this relation. + */ + if (perminfo->requiredPerms & ACL_SELECT) + add_with_check_options(rel, rt_index, + WCO_RLS_UPDATE_CHECK, + conflict_select_permissive_policies, + conflict_select_restrictive_policies, + withCheckOptions, + hasSubLinks, + true); + } } } @@ -398,8 +413,8 @@ get_row_security_policies(Query *root, RangeTblEntry *rte, int rt_index, * XXX We are setting up USING quals as WITH CHECK. If RLS prohibits * UPDATE/DELETE on the target row, we shall throw an error instead of * silently ignoring the row. This is different than how normal - * UPDATE/DELETE works and more in line with INSERT ON CONFLICT DO UPDATE - * handling. + * UPDATE/DELETE works and more in line with INSERT ON CONFLICT DO + * SELECT/UPDATE handling. */ if (commandType == CMD_MERGE) { @@ -784,9 +799,9 @@ add_security_quals(int rt_index, * added by an INSERT or UPDATE are consistent with the specified RLS * policies. Normally new data must satisfy the WITH CHECK clauses from the * policies. If a policy has no explicit WITH CHECK clause, its USING clause - * is used instead. In the special case of an UPDATE arising from an - * INSERT ... ON CONFLICT DO UPDATE, existing records are first checked using - * a WCO_RLS_CONFLICT_CHECK WithCheckOption, which always uses the USING + * is used instead. In the special case of a SELECT or UPDATE arising from an + * INSERT ... ON CONFLICT DO SELECT/UPDATE, existing records are first checked + * using a WCO_RLS_CONFLICT_CHECK WithCheckOption, which always uses the USING * clauses from RLS policies. * * New WCOs are added to withCheckOptions, and hasSubLinks is set to true if diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index b5a7ad9066..89cbdd3b1e 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -426,6 +426,7 @@ static void get_update_query_targetlist_def(Query *query, List *targetList, static void get_delete_query_def(Query *query, deparse_context *context); static void get_merge_query_def(Query *query, deparse_context *context); static void get_utility_query_def(Query *query, deparse_context *context); +static char *get_lock_clause_strength(LockClauseStrength strength); static void get_basic_select_query(Query *query, deparse_context *context); static void get_target_list(List *targetList, deparse_context *context); static void get_returning_clause(Query *query, deparse_context *context); @@ -5186,10 +5187,10 @@ set_deparse_plan(deparse_namespace *dpns, Plan *plan) * source, and all INNER_VAR Vars in other parts of the query refer to its * targetlist. * - * For ON CONFLICT .. UPDATE we just need the inner tlist to point to the - * excluded expression's tlist. (Similar to the SubqueryScan we don't want - * to reuse OUTER, it's used for RETURNING in some modify table cases, - * although not INSERT .. CONFLICT). 
+ * For ON CONFLICT DO SELECT/UPDATE we just need the inner tlist to point + * to the excluded expression's tlist. (Similar to the SubqueryScan we + * don't want to reuse OUTER, it's used for RETURNING in some modify table + * cases, although not INSERT .. CONFLICT). */ if (IsA(plan, SubqueryScan)) dpns->inner_plan = ((SubqueryScan *) plan)->subplan; @@ -5997,30 +5998,9 @@ get_select_query_def(Query *query, deparse_context *context) if (rc->pushedDown) continue; - switch (rc->strength) - { - case LCS_NONE: - /* we intentionally throw an error for LCS_NONE */ - elog(ERROR, "unrecognized LockClauseStrength %d", - (int) rc->strength); - break; - case LCS_FORKEYSHARE: - appendContextKeyword(context, " FOR KEY SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORSHARE: - appendContextKeyword(context, " FOR SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORNOKEYUPDATE: - appendContextKeyword(context, " FOR NO KEY UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORUPDATE: - appendContextKeyword(context, " FOR UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - } + appendContextKeyword(context, + get_lock_clause_strength(rc->strength), + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); appendStringInfo(buf, " OF %s", quote_identifier(get_rtable_name(rc->rti, @@ -6033,6 +6013,28 @@ get_select_query_def(Query *query, deparse_context *context) } } +static char * +get_lock_clause_strength(LockClauseStrength strength) +{ + switch (strength) + { + case LCS_NONE: + /* we intentionally throw an error for LCS_NONE */ + elog(ERROR, "unrecognized LockClauseStrength %d", + (int) strength); + break; + case LCS_FORKEYSHARE: + return " FOR KEY SHARE"; + case LCS_FORSHARE: + return " FOR SHARE"; + case LCS_FORNOKEYUPDATE: + return " FOR NO KEY UPDATE"; + case LCS_FORUPDATE: + return " FOR UPDATE"; + } + return NULL; /* keep compiler quiet */ +} + /* * Detect whether query looks like SELECT ... FROM VALUES(), * with no need to rename the output columns of the VALUES RTE. 
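As a hedged sketch of what this helper and the deparse hunk that follows produce, a rule action containing the new clause should round-trip through pg_get_ruledef() roughly as shown (object names src, dst, and mirror are illustrative, and the exact layout of the output depends on pretty-printing):

    CREATE TABLE src (id int PRIMARY KEY, val text);
    CREATE TABLE dst (id int PRIMARY KEY, val text);

    CREATE RULE mirror AS ON INSERT TO src DO INSTEAD
        INSERT INTO dst VALUES (NEW.id, NEW.val)
            ON CONFLICT (id) DO SELECT FOR UPDATE RETURNING *;

    SELECT pg_get_ruledef(oid, true) FROM pg_rewrite WHERE rulename = 'mirror';
    -- ... ON CONFLICT (id) DO SELECT FOR UPDATE RETURNING dst.id, dst.val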
@@ -7125,7 +7127,7 @@ get_insert_query_def(Query *query, deparse_context *context) { appendStringInfoString(buf, " DO NOTHING"); } - else + else if (confl->action == ONCONFLICT_UPDATE) { appendStringInfoString(buf, " DO UPDATE SET "); /* Deparse targetlist */ @@ -7140,6 +7142,23 @@ get_insert_query_def(Query *query, deparse_context *context) get_rule_expr(confl->onConflictWhere, context, false); } } + else + { + Assert(confl->action == ONCONFLICT_SELECT); + appendStringInfoString(buf, " DO SELECT"); + + /* Add FOR [KEY] UPDATE/SHARE clause if present */ + if (confl->lockStrength != LCS_NONE) + appendStringInfoString(buf, get_lock_clause_strength(confl->lockStrength)); + + /* Add a WHERE clause if given */ + if (confl->onConflictWhere != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(confl->onConflictWhere, context, false); + } + } } /* Add RETURNING if present */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index a910b3d04e..0bdd42a2b8 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202602101 +#define CATALOG_VERSION_NO 202602121 #endif diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index f8053d9e57..63c067d5aa 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -424,19 +424,20 @@ typedef struct JunkFilter } JunkFilter; /* - * OnConflictSetState + * OnConflictActionState * - * Executor state of an ON CONFLICT DO UPDATE operation. + * Executor state of an ON CONFLICT DO SELECT/UPDATE operation. */ -typedef struct OnConflictSetState +typedef struct OnConflictActionState { NodeTag type; TupleTableSlot *oc_Existing; /* slot to store existing target tuple in */ TupleTableSlot *oc_ProjSlot; /* CONFLICT ... SET ... projection target */ ProjectionInfo *oc_ProjInfo; /* for ON CONFLICT DO UPDATE SET */ + LockClauseStrength oc_LockStrength; /* lock strength for DO SELECT */ ExprState *oc_WhereClause; /* state for the WHERE clause */ -} OnConflictSetState; +} OnConflictActionState; /* ---------------- * MergeActionState information @@ -581,8 +582,8 @@ typedef struct ResultRelInfo /* list of arbiter indexes to use to check conflicts */ List *ri_onConflictArbiterIndexes; - /* ON CONFLICT evaluation state */ - OnConflictSetState *ri_onConflict; + /* ON CONFLICT evaluation state for DO SELECT/UPDATE */ + OnConflictActionState *ri_onConflict; /* for MERGE, lists of MergeActionState (one per MergeMatchKind) */ List *ri_MergeActions[NUM_MERGE_MATCH_KINDS]; diff --git a/src/include/nodes/lockoptions.h b/src/include/nodes/lockoptions.h index 22864454c3..7961444eed 100644 --- a/src/include/nodes/lockoptions.h +++ b/src/include/nodes/lockoptions.h @@ -20,7 +20,8 @@ */ typedef enum LockClauseStrength { - LCS_NONE, /* no such clause - only used in PlanRowMark */ + LCS_NONE, /* no such clause - only used in PlanRowMark + * and ON CONFLICT DO SELECT */ LCS_FORKEYSHARE, /* FOR KEY SHARE */ LCS_FORSHARE, /* FOR SHARE */ LCS_FORNOKEYUPDATE, /* FOR NO KEY UPDATE */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index b6ad28618a..59a7df31ab 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -428,6 +428,7 @@ typedef enum OnConflictAction ONCONFLICT_NONE, /* No "ON CONFLICT" clause */ ONCONFLICT_NOTHING, /* ON CONFLICT ... DO NOTHING */ ONCONFLICT_UPDATE, /* ON CONFLICT ... 
DO UPDATE */ + ONCONFLICT_SELECT, /* ON CONFLICT ... DO SELECT */ } OnConflictAction; /* diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 646d6ced76..0aec49bdd2 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -200,7 +200,7 @@ typedef struct Query /* OVERRIDING clause */ OverridingKind override pg_node_attr(query_jumble_ignore); - OnConflictExpr *onConflict; /* ON CONFLICT DO [NOTHING | UPDATE] */ + OnConflictExpr *onConflict; /* ON CONFLICT DO NOTHING/SELECT/UPDATE */ /* * The following three fields describe the contents of the RETURNING list @@ -1417,7 +1417,8 @@ typedef enum WCOKind WCO_VIEW_CHECK, /* WCO on an auto-updatable view */ WCO_RLS_INSERT_CHECK, /* RLS INSERT WITH CHECK policy */ WCO_RLS_UPDATE_CHECK, /* RLS UPDATE WITH CHECK policy */ - WCO_RLS_CONFLICT_CHECK, /* RLS ON CONFLICT DO UPDATE USING policy */ + WCO_RLS_CONFLICT_CHECK, /* RLS ON CONFLICT DO SELECT/UPDATE USING + * policy */ WCO_RLS_MERGE_UPDATE_CHECK, /* RLS MERGE UPDATE USING policy */ WCO_RLS_MERGE_DELETE_CHECK, /* RLS MERGE DELETE USING policy */ } WCOKind; @@ -1679,9 +1680,10 @@ typedef struct InferClause typedef struct OnConflictClause { NodeTag type; - OnConflictAction action; /* DO NOTHING or UPDATE? */ + OnConflictAction action; /* DO NOTHING, SELECT, or UPDATE */ InferClause *infer; /* Optional index inference clause */ - List *targetList; /* the target list (of ResTarget) */ + LockClauseStrength lockStrength; /* lock strength for DO SELECT */ + List *targetList; /* target list (of ResTarget) for DO UPDATE */ Node *whereClause; /* qualifications */ ParseLoc location; /* token location, or -1 if unknown */ } OnConflictClause; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 485bec5aab..8c9321aab8 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -368,11 +368,13 @@ typedef struct ModifyTable OnConflictAction onConflictAction; /* List of ON CONFLICT arbiter index OIDs */ List *arbiterIndexes; + /* lock strength for ON CONFLICT DO SELECT */ + LockClauseStrength onConflictLockStrength; /* INSERT ON CONFLICT DO UPDATE targetlist */ List *onConflictSet; /* target column numbers for onConflictSet */ List *onConflictCols; - /* WHERE for ON CONFLICT UPDATE */ + /* WHERE for ON CONFLICT DO SELECT/UPDATE */ Node *onConflictWhere; /* RTI of the EXCLUDED pseudo relation */ Index exclRelRTI; diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 5211cadc25..384df50c80 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -20,6 +20,7 @@ #include "access/attnum.h" #include "access/cmptype.h" #include "nodes/bitmapset.h" +#include "nodes/lockoptions.h" #include "nodes/pg_list.h" @@ -2370,7 +2371,7 @@ typedef struct FromExpr typedef struct OnConflictExpr { NodeTag type; - OnConflictAction action; /* DO NOTHING or UPDATE? 
*/ + OnConflictAction action; /* DO NOTHING, SELECT, or UPDATE */ /* Arbiter */ List *arbiterElems; /* unique index arbiter list (of @@ -2378,9 +2379,14 @@ typedef struct OnConflictExpr Node *arbiterWhere; /* unique index arbiter WHERE clause */ Oid constraint; /* pg_constraint OID for arbiter */ - /* ON CONFLICT UPDATE */ + /* ON CONFLICT DO SELECT */ + LockClauseStrength lockStrength; /* strength of lock for DO SELECT */ + + /* ON CONFLICT DO UPDATE */ List *onConflictSet; /* List of ON CONFLICT SET TargetEntrys */ - Node *onConflictWhere; /* qualifiers to restrict UPDATE to */ + + /* both ON CONFLICT DO SELECT and UPDATE */ + Node *onConflictWhere; /* qualifiers to restrict SELECT/UPDATE */ int exclRelIndex; /* RT index of 'excluded' relation */ List *exclRelTlist; /* tlist of the EXCLUDED pseudo relation */ } OnConflictExpr; diff --git a/src/test/isolation/expected/insert-conflict-do-select.out b/src/test/isolation/expected/insert-conflict-do-select.out new file mode 100644 index 0000000000..bccfd47dcf --- /dev/null +++ b/src/test/isolation/expected/insert-conflict-do-select.out @@ -0,0 +1,138 @@ +Parsed test spec with 2 sessions + +starting permutation: insert1 insert2 c1 select2 c2 +step insert1: INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step insert2: INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step c1: COMMIT; +step select2: SELECT * FROM doselect; +key|val +---+-------- + 1|original +(1 row) + +step c2: COMMIT; + +starting permutation: insert1_update insert2_update c1 select2 c2 +step insert1_update: INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step insert2_update: INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +step c1: COMMIT; +step insert2_update: <... completed> +key|val +---+-------- + 1|original +(1 row) + +step select2: SELECT * FROM doselect; +key|val +---+-------- + 1|original +(1 row) + +step c2: COMMIT; + +starting permutation: insert1_update insert2_update a1 select2 c2 +step insert1_update: INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step insert2_update: INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +step a1: ABORT; +step insert2_update: <... completed> +key|val +---+-------- + 1|original +(1 row) + +step select2: SELECT * FROM doselect; +key|val +---+-------- + 1|original +(1 row) + +step c2: COMMIT; + +starting permutation: insert1_keyshare insert2_update c1 select2 c2 +step insert1_keyshare: INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR KEY SHARE RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step insert2_update: INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +step c1: COMMIT; +step insert2_update: <... 
completed> +key|val +---+-------- + 1|original +(1 row) + +step select2: SELECT * FROM doselect; +key|val +---+-------- + 1|original +(1 row) + +step c2: COMMIT; + +starting permutation: insert1_share insert2_update c1 select2 c2 +step insert1_share: INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR SHARE RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step insert2_update: INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +step c1: COMMIT; +step insert2_update: <... completed> +key|val +---+-------- + 1|original +(1 row) + +step select2: SELECT * FROM doselect; +key|val +---+-------- + 1|original +(1 row) + +step c2: COMMIT; + +starting permutation: insert1_nokeyupd insert2_update c1 select2 c2 +step insert1_nokeyupd: INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR NO KEY UPDATE RETURNING *; +key|val +---+-------- + 1|original +(1 row) + +step insert2_update: INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; +step c1: COMMIT; +step insert2_update: <... completed> +key|val +---+-------- + 1|original +(1 row) + +step select2: SELECT * FROM doselect; +key|val +---+-------- + 1|original +(1 row) + +step c2: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 6a4d3532e0..4e466580cd 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -54,6 +54,7 @@ test: insert-conflict-do-update-2 test: insert-conflict-do-update-3 test: insert-conflict-do-update-4 test: insert-conflict-specconflict +test: insert-conflict-do-select test: merge-insert-update test: merge-delete test: merge-update diff --git a/src/test/isolation/specs/insert-conflict-do-select.spec b/src/test/isolation/specs/insert-conflict-do-select.spec new file mode 100644 index 0000000000..dcfd9f8cb5 --- /dev/null +++ b/src/test/isolation/specs/insert-conflict-do-select.spec @@ -0,0 +1,53 @@ +# INSERT...ON CONFLICT DO SELECT test +# +# This test verifies locking behavior of ON CONFLICT DO SELECT with different +# lock strengths: no lock, FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, and +# FOR UPDATE. 
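+#
+# As an illustration of the behavior exercised below (an illustrative
+# interleaving only, not itself one of the permutations in this spec):
+# with DO SELECT FOR UPDATE, a concurrent inserter blocks on the
+# conflicting row until the locking transaction ends.
+#
+#   s1: INSERT INTO doselect VALUES (1, 'a')
+#         ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *;  -- locks row 1
+#   s2: INSERT INTO doselect VALUES (1, 'b')
+#         ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *;  -- blocks
+#   s1: COMMIT;                        -- s2 resumes and returns row key = 1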
+ +setup +{ + CREATE TABLE doselect (key int primary key, val text); + INSERT INTO doselect VALUES (1, 'original'); +} + +teardown +{ + DROP TABLE doselect; +} + +session s1 +setup +{ + BEGIN ISOLATION LEVEL READ COMMITTED; +} +step insert1 { INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT RETURNING *; } +step insert1_keyshare { INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR KEY SHARE RETURNING *; } +step insert1_share { INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR SHARE RETURNING *; } +step insert1_nokeyupd { INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR NO KEY UPDATE RETURNING *; } +step insert1_update { INSERT INTO doselect(key, val) VALUES(1, 'insert1') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; } +step c1 { COMMIT; } +step a1 { ABORT; } + +session s2 +setup +{ + BEGIN ISOLATION LEVEL READ COMMITTED; +} +step insert2 { INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT RETURNING *; } +step insert2_update { INSERT INTO doselect(key, val) VALUES(1, 'insert2') ON CONFLICT (key) DO SELECT FOR UPDATE RETURNING *; } +step select2 { SELECT * FROM doselect; } +step c2 { COMMIT; } + +# Test 1: DO SELECT without locking - should not block +permutation insert1 insert2 c1 select2 c2 + +# Test 2: DO SELECT FOR UPDATE - should block until first transaction commits +permutation insert1_update insert2_update c1 select2 c2 + +# Test 3: DO SELECT FOR UPDATE - should unblock when first transaction aborts +permutation insert1_update insert2_update a1 select2 c2 + +# Test 4: Different lock strengths all properly acquire locks +permutation insert1_keyshare insert2_update c1 select2 c2 +permutation insert1_share insert2_update c1 select2 c2 +permutation insert1_nokeyupd insert2_update c1 select2 c2 diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index ebc892a2a4..a6fa9cacb7 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -780,6 +780,10 @@ INSERT INTO circles VALUES('<(20,20), 10>', '<(0,0), 4>') INSERT INTO circles VALUES('<(20,20), 10>', '<(0,0), 4>') ON CONFLICT ON CONSTRAINT circles_c1_c2_excl DO UPDATE SET c2 = EXCLUDED.c2; ERROR: ON CONFLICT DO UPDATE not supported with exclusion constraints +-- fail, because DO SELECT variant requires unique index +INSERT INTO circles VALUES('<(20,20), 10>', '<(0,0), 4>') + ON CONFLICT ON CONSTRAINT circles_c1_c2_excl DO SELECT RETURNING *; +ERROR: ON CONFLICT DO SELECT not supported with exclusion constraints -- succeed because c1 doesn't overlap INSERT INTO circles VALUES('<(20,20), 1>', '<(0,0), 5>'); -- succeed because c2 doesn't overlap diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index b0e1296208..34e2e7ee35 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -249,6 +249,25 @@ explain (costs off, format json) insert into insertconflicttest values (0, 'Bilb ] (1 row) +-- Should display lock strength, if specified +explain (costs off) insert into insertconflicttest values (1, 'Apple') on conflict (key) do select returning *; + QUERY PLAN +--------------------------------------- + Insert on insertconflicttest + Conflict Resolution: SELECT + Conflict Arbiter Indexes: key_index + -> Result +(4 rows) + +explain (costs off) insert into insertconflicttest values 
(1, 'Apple') on conflict (key) do select for key share returning *; + QUERY PLAN +--------------------------------------------- + Insert on insertconflicttest + Conflict Resolution: SELECT FOR KEY SHARE + Conflict Arbiter Indexes: key_index + -> Result +(4 rows) + -- Fails (no unique index inference specification, required for do update variant): insert into insertconflicttest values (1, 'Apple') on conflict do update set fruit = excluded.fruit; ERROR: ON CONFLICT DO UPDATE requires inference specification or constraint name @@ -304,6 +323,48 @@ ERROR: column "insertconflicttest" of relation "insertconflicttest" does not ex LINE 1: ...3, 'Kiwi') on conflict (key, fruit) do update set insertconf... ^ HINT: SET target columns cannot be qualified with the relation name. +-- +-- DO SELECT tests +-- +delete from insertconflicttest where fruit = 'Apple'; +insert into insertconflicttest values (1, 'Apple') on conflict (key) do select; -- fails +ERROR: ON CONFLICT DO SELECT requires a RETURNING clause +LINE 1: ...nsert into insertconflicttest values (1, 'Apple') on conflic... + ^ +insert into insertconflicttest as i values (1, 'Apple') on conflict (key) do select returning old, new, i; + old | new | i +-----+-----------+----------- + | (1,Apple) | (1,Apple) +(1 row) + +insert into insertconflicttest as i values (1, 'Orange') on conflict (key) do select returning old, new, i; + old | new | i +-----------+-----------+----------- + (1,Apple) | (1,Apple) | (1,Apple) +(1 row) + +insert into insertconflicttest as i values (1, 'Apple') on conflict (key) do select where i.fruit = 'Apple' returning *; + key | fruit +-----+------- + 1 | Apple +(1 row) + +insert into insertconflicttest as i values (1, 'Apple') on conflict (key) do select where i.fruit = 'Orange' returning *; + key | fruit +-----+------- +(0 rows) + +insert into insertconflicttest as i values (1, 'Orange') on conflict (key) do select where excluded.fruit = 'Apple' returning *; + key | fruit +-----+------- +(0 rows) + +insert into insertconflicttest as i values (1, 'Orange') on conflict (key) do select where excluded.fruit = 'Orange' returning *; + key | fruit +-----+------- + 1 | Apple +(1 row) + drop index key_index; -- -- Composite key tests @@ -748,13 +809,58 @@ insert into selfconflict values (6,1), (6,2) on conflict(f1) do update set f2 = ERROR: ON CONFLICT DO UPDATE command cannot affect row a second time HINT: Ensure that no rows proposed for insertion within the same command have duplicate constrained values. commit; +begin transaction isolation level read committed; +insert into selfconflict values (7,1), (7,2) on conflict(f1) do select returning *; + f1 | f2 +----+---- + 7 | 1 + 7 | 1 +(2 rows) + +commit; +begin transaction isolation level repeatable read; +insert into selfconflict values (8,1), (8,2) on conflict(f1) do select returning *; + f1 | f2 +----+---- + 8 | 1 + 8 | 1 +(2 rows) + +commit; +begin transaction isolation level serializable; +insert into selfconflict values (9,1), (9,2) on conflict(f1) do select returning *; + f1 | f2 +----+---- + 9 | 1 + 9 | 1 +(2 rows) + +commit; +begin transaction isolation level read committed; +insert into selfconflict values (10,1), (10,2) on conflict(f1) do select for update returning *; +ERROR: ON CONFLICT DO SELECT command cannot affect row a second time +HINT: Ensure that no rows proposed for insertion within the same command have duplicate constrained values. 
+commit; +begin transaction isolation level repeatable read; +insert into selfconflict values (11,1), (11,2) on conflict(f1) do select for update returning *; +ERROR: ON CONFLICT DO SELECT command cannot affect row a second time +HINT: Ensure that no rows proposed for insertion within the same command have duplicate constrained values. +commit; +begin transaction isolation level serializable; +insert into selfconflict values (12,1), (12,2) on conflict(f1) do select for update returning *; +ERROR: ON CONFLICT DO SELECT command cannot affect row a second time +HINT: Ensure that no rows proposed for insertion within the same command have duplicate constrained values. +commit; select * from selfconflict; f1 | f2 ----+---- 1 | 1 2 | 1 3 | 1 -(3 rows) + 7 | 1 + 8 | 1 + 9 | 1 +(6 rows) drop table selfconflict; -- check ON CONFLICT handling with partitioned tables @@ -765,11 +871,31 @@ insert into parted_conflict_test values (1, 'a') on conflict do nothing; -- index on a required, which does exist in parent insert into parted_conflict_test values (1, 'a') on conflict (a) do nothing; insert into parted_conflict_test values (1, 'a') on conflict (a) do update set b = excluded.b; +insert into parted_conflict_test values (1, 'a') on conflict (a) do select returning *; + a | b +---+--- + 1 | a +(1 row) + +insert into parted_conflict_test values (1, 'a') on conflict (a) do select for update returning *; + a | b +---+--- + 1 | a +(1 row) + -- targeting partition directly will work insert into parted_conflict_test_1 values (1, 'a') on conflict (a) do nothing; insert into parted_conflict_test_1 values (1, 'b') on conflict (a) do update set b = excluded.b; +insert into parted_conflict_test_1 values (1, 'b') on conflict (a) do select returning b; + b +--- + b +(1 row) + -- index on b required, which doesn't exist in parent -insert into parted_conflict_test values (2, 'b') on conflict (b) do update set a = excluded.a; +insert into parted_conflict_test values (2, 'b') on conflict (b) do update set a = excluded.a; -- fail +ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification +insert into parted_conflict_test values (2, 'b') on conflict (b) do select returning b; -- fail ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification -- targeting partition directly will work insert into parted_conflict_test_1 values (2, 'b') on conflict (b) do update set a = excluded.a; @@ -780,13 +906,31 @@ select * from parted_conflict_test order by a; 2 | b (1 row) --- now check that DO UPDATE works correctly for target partition with --- different attribute numbers +-- now check that DO UPDATE and DO SELECT work correctly for target partition +-- with different attribute numbers create table parted_conflict_test_2 (b char, a int unique); alter table parted_conflict_test attach partition parted_conflict_test_2 for values in (3); truncate parted_conflict_test; insert into parted_conflict_test values (3, 'a') on conflict (a) do update set b = excluded.b; insert into parted_conflict_test values (3, 'b') on conflict (a) do update set b = excluded.b; +insert into parted_conflict_test values (3, 'a') on conflict (a) do select returning b; + b +--- + b +(1 row) + +insert into parted_conflict_test values (3, 'a') on conflict (a) do select where excluded.b = 'a' returning parted_conflict_test; + parted_conflict_test +---------------------- + (3,b) +(1 row) + +insert into parted_conflict_test values (3, 'a') on conflict (a) do select where parted_conflict_test.b = 'b' 
returning b; + b +--- + b +(1 row) + -- should see (3, 'b') select * from parted_conflict_test order by a; a | b @@ -800,6 +944,12 @@ create table parted_conflict_test_3 partition of parted_conflict_test for values truncate parted_conflict_test; insert into parted_conflict_test (a, b) values (4, 'a') on conflict (a) do update set b = excluded.b; insert into parted_conflict_test (a, b) values (4, 'b') on conflict (a) do update set b = excluded.b where parted_conflict_test.b = 'a'; +insert into parted_conflict_test (a, b) values (4, 'b') on conflict (a) do select returning b; + b +--- + b +(1 row) + -- should see (4, 'b') select * from parted_conflict_test order by a; a | b @@ -813,6 +963,11 @@ create table parted_conflict_test_4_1 partition of parted_conflict_test_4 for va truncate parted_conflict_test; insert into parted_conflict_test (a, b) values (5, 'a') on conflict (a) do update set b = excluded.b; insert into parted_conflict_test (a, b) values (5, 'b') on conflict (a) do update set b = excluded.b where parted_conflict_test.b = 'a'; +insert into parted_conflict_test (a, b) values (5, 'b') on conflict (a) do select where parted_conflict_test.b = 'a' returning b; + b +--- +(0 rows) + -- should see (5, 'b') select * from parted_conflict_test order by a; a | b @@ -833,6 +988,59 @@ select * from parted_conflict_test order by a; 4 | b (3 rows) +-- test DO SELECT with multiple rows hitting different partitions +truncate parted_conflict_test; +insert into parted_conflict_test (a, b) values (1, 'a'), (2, 'b'), (4, 'c'); +insert into parted_conflict_test (a, b) values (1, 'x'), (2, 'y'), (4, 'z') + on conflict (a) do select returning *, tableoid::regclass; + a | b | tableoid +---+---+------------------------ + 1 | a | parted_conflict_test_1 + 2 | b | parted_conflict_test_1 + 4 | c | parted_conflict_test_3 +(3 rows) + +-- should see original values (1, 'a'), (2, 'b'), (4, 'c') +select * from parted_conflict_test order by a; + a | b +---+--- + 1 | a + 2 | b + 4 | c +(3 rows) + +-- test DO SELECT with WHERE filtering across partitions +insert into parted_conflict_test (a, b) values (1, 'n') on conflict (a) do select where parted_conflict_test.b = 'a' returning *; + a | b +---+--- + 1 | a +(1 row) + +insert into parted_conflict_test (a, b) values (2, 'n') on conflict (a) do select where parted_conflict_test.b = 'x' returning *; + a | b +---+--- +(0 rows) + +-- test DO SELECT with EXCLUDED in WHERE across partitions with different layouts +insert into parted_conflict_test (a, b) values (3, 't') on conflict (a) do select where excluded.b = 't' returning *; + a | b +---+--- + 3 | t +(1 row) + +-- test DO SELECT FOR UPDATE across different partition layouts +insert into parted_conflict_test (a, b) values (1, 'l') on conflict (a) do select for update returning *; + a | b +---+--- + 1 | a +(1 row) + +insert into parted_conflict_test (a, b) values (3, 'l') on conflict (a) do select for update returning *; + a | b +---+--- + 3 | t +(1 row) + drop table parted_conflict_test; -- test behavior of inserting a conflicting tuple into an intermediate -- partitioning level diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index daafaa94fd..84c1c1ca38 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -921,6 +921,32 @@ INSERT INTO atest5(two) VALUES (6) ON CONFLICT (two) DO UPDATE set one = 8; -- f ERROR: permission denied for table atest5 INSERT INTO atest5(three) VALUES (4) ON CONFLICT (two) DO UPDATE set three = 
10; -- fails (due to INSERT) ERROR: permission denied for table atest5 +-- Check that column level privileges are enforced for ON CONFLICT ... WHERE +-- Ok. we may select one +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT WHERE atest5.one = 1 RETURNING atest5.two; + two +----- + 2 +(1 row) + +-- Error. No select rights on three +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT WHERE atest5.three = 1 RETURNING atest5.two; +ERROR: permission denied for table atest5 +-- Check that ON CONFLICT ... SELECT FOR UPDATE/SHARE requires an updatable column +SET SESSION AUTHORIZATION regress_priv_user1; +REVOKE UPDATE (three) ON atest5 FROM regress_priv_user4; +SET SESSION AUTHORIZATION regress_priv_user4; +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT FOR UPDATE RETURNING atest5.two; -- fails +ERROR: permission denied for table atest5 +SET SESSION AUTHORIZATION regress_priv_user1; +GRANT UPDATE (three) ON atest5 TO regress_priv_user4; +SET SESSION AUTHORIZATION regress_priv_user4; +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT FOR UPDATE RETURNING atest5.two; -- ok + two +----- + 2 +(1 row) + -- Check that the columns in the inference require select privileges INSERT INTO atest5(four) VALUES (4); -- fail ERROR: permission denied for table atest5 diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index e17f9188df..07d93e7def 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -218,6 +218,50 @@ NOTICE: SELECT USING on rls_test_tgt.(3,"tgt d","TGT D") 3 | tgt d | TGT D (1 row) +ROLLBACK; +-- INSERT ... ON CONFLICT DO SELECT should apply INSERT CHECK and SELECT USING +-- policy clauses to values proposed for insert. In the event of a conflict it +-- should also apply SELECT USING policy clauses to the existing values. +BEGIN; +INSERT INTO rls_test_tgt VALUES (4, 'tgt a') ON CONFLICT (a) DO SELECT RETURNING *; +NOTICE: INSERT CHECK on rls_test_tgt.(4,"tgt a","TGT A") +NOTICE: SELECT USING on rls_test_tgt.(4,"tgt a","TGT A") + a | b | c +---+-------+------- + 4 | tgt a | TGT A +(1 row) + +INSERT INTO rls_test_tgt VALUES (4, 'tgt b') ON CONFLICT (a) DO SELECT RETURNING *; +NOTICE: INSERT CHECK on rls_test_tgt.(4,"tgt b","TGT B") +NOTICE: SELECT USING on rls_test_tgt.(4,"tgt b","TGT B") +NOTICE: SELECT USING on rls_test_tgt.(4,"tgt a","TGT A") + a | b | c +---+-------+------- + 4 | tgt a | TGT A +(1 row) + +ROLLBACK; +-- INSERT ... ON CONFLICT DO SELECT FOR UPDATE should also apply UPDATE USING +-- policy clauses to the existing values, in the event of a conflict. 
+BEGIN; +INSERT INTO rls_test_tgt VALUES (5, 'tgt a') ON CONFLICT (a) DO SELECT FOR UPDATE RETURNING *; +NOTICE: INSERT CHECK on rls_test_tgt.(5,"tgt a","TGT A") +NOTICE: SELECT USING on rls_test_tgt.(5,"tgt a","TGT A") + a | b | c +---+-------+------- + 5 | tgt a | TGT A +(1 row) + +INSERT INTO rls_test_tgt VALUES (5, 'tgt b') ON CONFLICT (a) DO SELECT FOR UPDATE RETURNING *; +NOTICE: INSERT CHECK on rls_test_tgt.(5,"tgt b","TGT B") +NOTICE: SELECT USING on rls_test_tgt.(5,"tgt b","TGT B") +NOTICE: UPDATE USING on rls_test_tgt.(5,"tgt a","TGT A") +NOTICE: SELECT USING on rls_test_tgt.(5,"tgt a","TGT A") + a | b | c +---+-------+------- + 5 | tgt a | TGT A +(1 row) + ROLLBACK; -- MERGE should always apply SELECT USING policy clauses to both source and -- target rows @@ -2395,10 +2439,58 @@ INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel') ON CONFLICT (did) DO UPDATE SET dauthor = 'regress_rls_carol'; ERROR: new row violates row-level security policy for table "document" -- +-- INSERT ... ON CONFLICT DO SELECT and Row-level security +-- +SET SESSION AUTHORIZATION regress_rls_alice; +DROP POLICY p3_with_all ON document; +CREATE POLICY p1_select_novels ON document FOR SELECT + USING (cid = (SELECT cid from category WHERE cname = 'novel')); +CREATE POLICY p2_insert_own ON document FOR INSERT + WITH CHECK (dauthor = current_user); +CREATE POLICY p3_update_novels ON document FOR UPDATE + USING (cid = (SELECT cid from category WHERE cname = 'novel') AND dlevel = 1) + WITH CHECK (dauthor = current_user); +SET SESSION AUTHORIZATION regress_rls_bob; +-- DO SELECT requires SELECT rights, should succeed for novel +INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT RETURNING did, dauthor, dtitle; + did | dauthor | dtitle +-----+-----------------+---------------- + 1 | regress_rls_bob | my first novel +(1 row) + +-- DO SELECT requires SELECT rights, should fail for non-novel +INSERT INTO document VALUES (33, (SELECT cid from category WHERE cname = 'science fiction'), 1, 'regress_rls_bob', 'another sci-fi') + ON CONFLICT (did) DO SELECT RETURNING did, dauthor, dtitle; +ERROR: new row violates row-level security policy for table "document" +-- DO SELECT with WHERE and EXCLUDED reference +INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT WHERE excluded.dlevel = 1 RETURNING did, dauthor, dtitle; + did | dauthor | dtitle +-----+-----------------+---------------- + 1 | regress_rls_bob | my first novel +(1 row) + +-- DO SELECT FOR UPDATE requires both SELECT and UPDATE rights, should succeed for novel and dlevel = 1 +INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT FOR UPDATE RETURNING did, dauthor, dtitle; + did | dauthor | dtitle +-----+-----------------+---------------- + 1 | regress_rls_bob | my first novel +(1 row) + +-- should fail UPDATE USING policy for novel with dlevel = 2 +INSERT INTO document VALUES (2, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT FOR UPDATE RETURNING did, dauthor, dtitle; +ERROR: new row violates row-level security policy (USING expression) for table "document" +SET SESSION AUTHORIZATION regress_rls_alice; +DROP POLICY p1_select_novels ON document; +DROP POLICY p2_insert_own ON 
document; +DROP POLICY p3_update_novels ON document; +-- -- MERGE -- RESET SESSION AUTHORIZATION; -DROP POLICY p3_with_all ON document; ALTER TABLE document ADD COLUMN dnotes text DEFAULT ''; -- all documents are readable CREATE POLICY p1 ON document FOR SELECT USING (true); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f9bc213e5a..78a37d9fc8 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3608,6 +3608,61 @@ SELECT * FROM hat_data WHERE hat_name IN ('h8', 'h9', 'h7') ORDER BY hat_name; (3 rows) DROP RULE hat_upsert ON hats; +-- DO SELECT with a WHERE clause +CREATE RULE hat_confsel AS ON INSERT TO hats + DO INSTEAD + INSERT INTO hat_data VALUES ( + NEW.hat_name, + NEW.hat_color) + ON CONFLICT (hat_name) + DO SELECT FOR UPDATE + WHERE excluded.hat_color <> 'forbidden' AND hat_data.* != excluded.* + RETURNING *; +SELECT definition FROM pg_rules WHERE tablename = 'hats' ORDER BY rulename; + definition +-------------------------------------------------------------------------------------- + CREATE RULE hat_confsel AS + + ON INSERT TO public.hats DO INSTEAD INSERT INTO hat_data (hat_name, hat_color) + + VALUES (new.hat_name, new.hat_color) ON CONFLICT(hat_name) DO SELECT FOR UPDATE + + WHERE ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*))+ + RETURNING hat_data.hat_name, + + hat_data.hat_color; +(1 row) + +-- fails without RETURNING +INSERT INTO hats VALUES ('h7', 'blue'); +ERROR: ON CONFLICT DO SELECT requires a RETURNING clause +DETAIL: A rule action is INSERT ... ON CONFLICT DO SELECT, which requires a RETURNING clause. +-- works (returns conflicts) +EXPLAIN (costs off) +INSERT INTO hats VALUES ('h7', 'blue') RETURNING *; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Insert on hat_data + Conflict Resolution: SELECT FOR UPDATE + Conflict Arbiter Indexes: hat_data_unique_idx + Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) + -> Result +(5 rows) + +INSERT INTO hats VALUES ('h7', 'blue') RETURNING *; + hat_name | hat_color +------------+------------ + h7 | black +(1 row) + +-- conflicts excluded by WHERE clause +INSERT INTO hats VALUES ('h7', 'forbidden') RETURNING *; + hat_name | hat_color +----------+----------- +(0 rows) + +INSERT INTO hats VALUES ('h7', 'black') RETURNING *; + hat_name | hat_color +----------+----------- +(0 rows) + +DROP RULE hat_confsel ON hats; drop table hats; drop table hat_data; -- test for pg_get_functiondef properly regurgitating SET parameters diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out index 1acdd12d29..98dee63b50 100644 --- a/src/test/regress/expected/triggers.out +++ b/src/test/regress/expected/triggers.out @@ -1670,7 +1670,7 @@ drop table trigger_ddl_table; drop function trigger_ddl_func(); -- -- Verify behavior of before and after triggers with INSERT...ON CONFLICT --- DO UPDATE +-- DO UPDATE and DO SELECT -- create table upsert (key int4 primary key, color text); create function upsert_before_func() @@ -1745,6 +1745,14 @@ insert into upsert values(8, 'yellow') on conflict (key) do update set color = ' WARNING: before insert (new): (8,yellow) WARNING: before insert (new, modified): (9,"yellow trig modified") WARNING: after insert (new): (9,"yellow trig modified") +insert into upsert values(8, 'blue') on conflict (key) do select for update where upsert.color = 'yellow trig 
modified' returning old.*, new.*, upsert.*; +WARNING: before insert (new): (8,blue) +WARNING: before insert (new, modified): (9,"blue trig modified") + key | color | key | color | key | color +-----+----------------------+-----+----------------------+-----+---------------------- + 9 | yellow trig modified | 9 | yellow trig modified | 9 | yellow trig modified +(1 row) + select * from upsert; key | color -----+----------------------------- diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out index 03df7e75b7..9cea538b8e 100644 --- a/src/test/regress/expected/updatable_views.out +++ b/src/test/regress/expected/updatable_views.out @@ -316,6 +316,21 @@ SELECT * FROM rw_view15; 3 | UNSPECIFIED (6 rows) +INSERT INTO rw_view15 (a) VALUES (3) + ON CONFLICT (a) DO UPDATE SET a = excluded.a WHERE excluded.upper = 'UNSPECIFIED' + RETURNING old, new; + old | new +-----------------+----------------- + (3,UNSPECIFIED) | (3,UNSPECIFIED) +(1 row) + +INSERT INTO rw_view15 (a) VALUES (3) + ON CONFLICT (a) DO SELECT WHERE excluded.upper = 'UNSPECIFIED' RETURNING old, new; + old | new +-----------------+----------------- + (3,UNSPECIFIED) | (3,UNSPECIFIED) +(1 row) + SELECT * FROM rw_view15; a | upper ----+------------- @@ -3646,7 +3661,7 @@ ERROR: new row violates check option for view "wcowrtest_v2" DETAIL: Failing row contains (2, no such row in sometable). drop view wcowrtest_v, wcowrtest_v2; drop table wcowrtest, sometable; --- Check INSERT .. ON CONFLICT DO UPDATE works correctly when the view's +-- Check INSERT .. ON CONFLICT DO SELECT/UPDATE works correctly when the view's -- columns are named and ordered differently than the underlying table's. create table uv_iocu_tab (a text unique, b float); insert into uv_iocu_tab values ('xyxyxy', 0); @@ -3668,6 +3683,13 @@ select * from uv_iocu_tab; xyxyxy | 1 (1 row) +insert into uv_iocu_view (a, b) values ('xyxyxy', 1) + on conflict (a) do select where uv_iocu_view.c = 2 and excluded.c = 2 returning *; + b | c | a | two +---+---+--------+----- + 1 | 2 | xyxyxy | 2.0 +(1 row) + -- OK to access view columns that are not present in underlying base -- relation in the ON CONFLICT portion of the query insert into uv_iocu_view (a, b) values ('xyxyxy', 3) @@ -3731,6 +3753,25 @@ select * from uv_iocu_view; Rejected: (y,1,"(1,y)") | 1 | (1,"Rejected: (y,1,""(1,y)"")") (1 row) +explain (costs off) +insert into uv_iocu_view (aa,bb) values (1,'Rejected: (y,1,"(1,y)")') + on conflict (aa) do select where uv_iocu_view.* = excluded.* returning *; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------- + Insert on uv_iocu_tab + Conflict Resolution: SELECT + Conflict Arbiter Indexes: uv_iocu_tab_a_key + Conflict Filter: (ROW(uv_iocu_tab.b, uv_iocu_tab.a, (uv_iocu_tab.*)::text) = ROW(excluded.b, excluded.a, (excluded.*)::text)) + -> Result +(5 rows) + +insert into uv_iocu_view (aa,bb) values (1,'Rejected: (y,1,"(1,y)")') + on conflict (aa) do select where uv_iocu_view.* = excluded.* returning *; + bb | aa | cc +-------------------------+----+--------------------------------- + Rejected: (y,1,"(1,y)") | 1 | (1,"Rejected: (y,1,""(1,y)"")") +(1 row) + -- Test omitting a column of the base relation delete from uv_iocu_view; insert into uv_iocu_view (aa,bb) values (1,'x'); @@ -3751,6 +3792,13 @@ select * from uv_iocu_view; Rejected: ("table default",1,"(1,""table default"")") | 1 | (1,"Rejected: (""table default"",1,""(1,""""table 
default"""")"")") (1 row) +insert into uv_iocu_view (aa) values (1) + on conflict (aa) do select returning *; + bb | aa | cc +-------------------------------------------------------+----+--------------------------------------------------------------------- + Rejected: ("table default",1,"(1,""table default"")") | 1 | (1,"Rejected: (""table default"",1,""(1,""""table default"""")"")") +(1 row) + alter view uv_iocu_view alter column bb set default 'view default'; insert into uv_iocu_view (aa) values (1) on conflict (aa) do update set bb = 'Rejected: '||excluded.*; @@ -3760,6 +3808,13 @@ select * from uv_iocu_view; Rejected: ("view default",1,"(1,""view default"")") | 1 | (1,"Rejected: (""view default"",1,""(1,""""view default"""")"")") (1 row) +insert into uv_iocu_view (aa) values (1) + on conflict (aa) do select returning *; + bb | aa | cc +-----------------------------------------------------+----+------------------------------------------------------------------- + Rejected: ("view default",1,"(1,""view default"")") | 1 | (1,"Rejected: (""view default"",1,""(1,""""view default"""")"")") +(1 row) + -- Should fail to update non-updatable columns insert into uv_iocu_view (aa) values (1) on conflict (aa) do update set cc = 'XXX'; @@ -3767,7 +3822,7 @@ ERROR: cannot insert into column "cc" of view "uv_iocu_view" DETAIL: View columns that are not columns of their base relation are not updatable. drop view uv_iocu_view; drop table uv_iocu_tab; --- ON CONFLICT DO UPDATE permissions checks +-- ON CONFLICT DO SELECT/UPDATE permissions checks create user regress_view_user1; create user regress_view_user2; set session authorization regress_view_user1; @@ -3791,6 +3846,16 @@ insert into rw_view1 values ('zzz',2.0,1) insert into rw_view1 values ('zzz',2.0,1) on conflict (aa) do update set cc = 3.0; -- Not allowed ERROR: permission denied for view rw_view1 +insert into rw_view1 values ('yyy',2.0,1) + on conflict (aa) do select for update returning cc; -- Not allowed +ERROR: permission denied for view rw_view1 +insert into rw_view1 values ('yyy',2.0,1) + on conflict (aa) do select for update returning aa, bb; + aa | bb +----+-------- + 1 | yyyxxx +(1 row) + reset session authorization; select * from base_tbl; a | b | c @@ -3807,9 +3872,19 @@ create view rw_view2 as select b as bb, c as cc, a as aa from base_tbl; insert into rw_view2 (aa,bb) values (1,'xxx') on conflict (aa) do update set bb = excluded.bb; -- Not allowed ERROR: permission denied for table base_tbl +insert into rw_view2 (aa,bb) values (1,'xxx') + on conflict (aa) do select returning 1; -- Not allowed +ERROR: permission denied for table base_tbl create view rw_view3 as select b as bb, a as aa from base_tbl; insert into rw_view3 (aa,bb) values (1,'xxx') on conflict (aa) do update set bb = excluded.bb; -- OK +insert into rw_view3 (aa,bb) values (1,'xxx') + on conflict (aa) do select returning aa, bb; -- OK + aa | bb +----+----- + 1 | xxx +(1 row) + reset session authorization; select * from base_tbl; a | b | c @@ -3822,6 +3897,9 @@ create view rw_view4 as select aa, bb, cc FROM rw_view1; insert into rw_view4 (aa,bb) values (1,'yyy') on conflict (aa) do update set bb = excluded.bb; -- Not allowed ERROR: permission denied for view rw_view1 +insert into rw_view4 (aa,bb) values (1,'yyy') + on conflict (aa) do select returning 1; -- Not allowed +ERROR: permission denied for view rw_view1 create view rw_view5 as select aa, bb FROM rw_view1; insert into rw_view5 (aa,bb) values (1,'yyy') on conflict (aa) do update set bb = excluded.bb; -- OK diff 
--git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index 1e9989698b..b7f6efdd81 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -568,6 +568,9 @@ INSERT INTO circles VALUES('<(20,20), 10>', '<(0,0), 4>') -- fail, because DO UPDATE variant requires unique index INSERT INTO circles VALUES('<(20,20), 10>', '<(0,0), 4>') ON CONFLICT ON CONSTRAINT circles_c1_c2_excl DO UPDATE SET c2 = EXCLUDED.c2; +-- fail, because DO SELECT variant requires unique index +INSERT INTO circles VALUES('<(20,20), 10>', '<(0,0), 4>') + ON CONFLICT ON CONSTRAINT circles_c1_c2_excl DO SELECT RETURNING *; -- succeed because c1 doesn't overlap INSERT INTO circles VALUES('<(20,20), 1>', '<(0,0), 5>'); -- succeed because c2 doesn't overlap diff --git a/src/test/regress/sql/insert_conflict.sql b/src/test/regress/sql/insert_conflict.sql index 03b1f0e44b..a5a84d1d4b 100644 --- a/src/test/regress/sql/insert_conflict.sql +++ b/src/test/regress/sql/insert_conflict.sql @@ -93,6 +93,9 @@ explain (costs off) insert into insertconflicttest values (0, 'Bilberry') on con explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key) do update set fruit = excluded.fruit where excluded.fruit != 'Elderberry'; -- Does the same, but JSON format shows "Conflict Arbiter Index" as JSON array: explain (costs off, format json) insert into insertconflicttest values (0, 'Bilberry') on conflict (key) do update set fruit = excluded.fruit where insertconflicttest.fruit != 'Lime' returning *; +-- Should display lock strength, if specified +explain (costs off) insert into insertconflicttest values (1, 'Apple') on conflict (key) do select returning *; +explain (costs off) insert into insertconflicttest values (1, 'Apple') on conflict (key) do select for key share returning *; -- Fails (no unique index inference specification, required for do update variant): insert into insertconflicttest values (1, 'Apple') on conflict do update set fruit = excluded.fruit; @@ -130,6 +133,18 @@ insert into insertconflicttest AS ict values (6, 'Passionfruit') on conflict (ke -- Check helpful hint when qualifying set column with target table insert into insertconflicttest values (3, 'Kiwi') on conflict (key, fruit) do update set insertconflicttest.fruit = 'Mango'; +-- +-- DO SELECT tests +-- +delete from insertconflicttest where fruit = 'Apple'; +insert into insertconflicttest values (1, 'Apple') on conflict (key) do select; -- fails +insert into insertconflicttest as i values (1, 'Apple') on conflict (key) do select returning old, new, i; +insert into insertconflicttest as i values (1, 'Orange') on conflict (key) do select returning old, new, i; +insert into insertconflicttest as i values (1, 'Apple') on conflict (key) do select where i.fruit = 'Apple' returning *; +insert into insertconflicttest as i values (1, 'Apple') on conflict (key) do select where i.fruit = 'Orange' returning *; +insert into insertconflicttest as i values (1, 'Orange') on conflict (key) do select where excluded.fruit = 'Apple' returning *; +insert into insertconflicttest as i values (1, 'Orange') on conflict (key) do select where excluded.fruit = 'Orange' returning *; + drop index key_index; -- @@ -459,6 +474,30 @@ begin transaction isolation level serializable; insert into selfconflict values (6,1), (6,2) on conflict(f1) do update set f2 = 0; commit; +begin transaction isolation level read committed; +insert into selfconflict values (7,1), (7,2) on conflict(f1) do select returning *; 
+commit; + +begin transaction isolation level repeatable read; +insert into selfconflict values (8,1), (8,2) on conflict(f1) do select returning *; +commit; + +begin transaction isolation level serializable; +insert into selfconflict values (9,1), (9,2) on conflict(f1) do select returning *; +commit; + +begin transaction isolation level read committed; +insert into selfconflict values (10,1), (10,2) on conflict(f1) do select for update returning *; +commit; + +begin transaction isolation level repeatable read; +insert into selfconflict values (11,1), (11,2) on conflict(f1) do select for update returning *; +commit; + +begin transaction isolation level serializable; +insert into selfconflict values (12,1), (12,2) on conflict(f1) do select for update returning *; +commit; + select * from selfconflict; drop table selfconflict; @@ -473,13 +512,17 @@ insert into parted_conflict_test values (1, 'a') on conflict do nothing; -- index on a required, which does exist in parent insert into parted_conflict_test values (1, 'a') on conflict (a) do nothing; insert into parted_conflict_test values (1, 'a') on conflict (a) do update set b = excluded.b; +insert into parted_conflict_test values (1, 'a') on conflict (a) do select returning *; +insert into parted_conflict_test values (1, 'a') on conflict (a) do select for update returning *; -- targeting partition directly will work insert into parted_conflict_test_1 values (1, 'a') on conflict (a) do nothing; insert into parted_conflict_test_1 values (1, 'b') on conflict (a) do update set b = excluded.b; +insert into parted_conflict_test_1 values (1, 'b') on conflict (a) do select returning b; -- index on b required, which doesn't exist in parent -insert into parted_conflict_test values (2, 'b') on conflict (b) do update set a = excluded.a; +insert into parted_conflict_test values (2, 'b') on conflict (b) do update set a = excluded.a; -- fail +insert into parted_conflict_test values (2, 'b') on conflict (b) do select returning b; -- fail -- targeting partition directly will work insert into parted_conflict_test_1 values (2, 'b') on conflict (b) do update set a = excluded.a; @@ -487,13 +530,16 @@ insert into parted_conflict_test_1 values (2, 'b') on conflict (b) do update set -- should see (2, 'b') select * from parted_conflict_test order by a; --- now check that DO UPDATE works correctly for target partition with --- different attribute numbers +-- now check that DO UPDATE and DO SELECT work correctly for target partition +-- with different attribute numbers create table parted_conflict_test_2 (b char, a int unique); alter table parted_conflict_test attach partition parted_conflict_test_2 for values in (3); truncate parted_conflict_test; insert into parted_conflict_test values (3, 'a') on conflict (a) do update set b = excluded.b; insert into parted_conflict_test values (3, 'b') on conflict (a) do update set b = excluded.b; +insert into parted_conflict_test values (3, 'a') on conflict (a) do select returning b; +insert into parted_conflict_test values (3, 'a') on conflict (a) do select where excluded.b = 'a' returning parted_conflict_test; +insert into parted_conflict_test values (3, 'a') on conflict (a) do select where parted_conflict_test.b = 'b' returning b; -- should see (3, 'b') select * from parted_conflict_test order by a; @@ -504,6 +550,7 @@ create table parted_conflict_test_3 partition of parted_conflict_test for values truncate parted_conflict_test; insert into parted_conflict_test (a, b) values (4, 'a') on conflict (a) do update set b = excluded.b; 
insert into parted_conflict_test (a, b) values (4, 'b') on conflict (a) do update set b = excluded.b where parted_conflict_test.b = 'a'; +insert into parted_conflict_test (a, b) values (4, 'b') on conflict (a) do select returning b; -- should see (4, 'b') select * from parted_conflict_test order by a; @@ -514,6 +561,7 @@ create table parted_conflict_test_4_1 partition of parted_conflict_test_4 for va truncate parted_conflict_test; insert into parted_conflict_test (a, b) values (5, 'a') on conflict (a) do update set b = excluded.b; insert into parted_conflict_test (a, b) values (5, 'b') on conflict (a) do update set b = excluded.b where parted_conflict_test.b = 'a'; +insert into parted_conflict_test (a, b) values (5, 'b') on conflict (a) do select where parted_conflict_test.b = 'a' returning b; -- should see (5, 'b') select * from parted_conflict_test order by a; @@ -526,6 +574,26 @@ insert into parted_conflict_test (a, b) values (1, 'b'), (2, 'c'), (4, 'b') on c -- should see (1, 'b'), (2, 'a'), (4, 'b') select * from parted_conflict_test order by a; +-- test DO SELECT with multiple rows hitting different partitions +truncate parted_conflict_test; +insert into parted_conflict_test (a, b) values (1, 'a'), (2, 'b'), (4, 'c'); +insert into parted_conflict_test (a, b) values (1, 'x'), (2, 'y'), (4, 'z') + on conflict (a) do select returning *, tableoid::regclass; + +-- should see original values (1, 'a'), (2, 'b'), (4, 'c') +select * from parted_conflict_test order by a; + +-- test DO SELECT with WHERE filtering across partitions +insert into parted_conflict_test (a, b) values (1, 'n') on conflict (a) do select where parted_conflict_test.b = 'a' returning *; +insert into parted_conflict_test (a, b) values (2, 'n') on conflict (a) do select where parted_conflict_test.b = 'x' returning *; + +-- test DO SELECT with EXCLUDED in WHERE across partitions with different layouts +insert into parted_conflict_test (a, b) values (3, 't') on conflict (a) do select where excluded.b = 't' returning *; + +-- test DO SELECT FOR UPDATE across different partition layouts +insert into parted_conflict_test (a, b) values (1, 'l') on conflict (a) do select for update returning *; +insert into parted_conflict_test (a, b) values (3, 'l') on conflict (a) do select for update returning *; + drop table parted_conflict_test; -- test behavior of inserting a conflicting tuple into an intermediate diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 96eff1104d..66e06d91a4 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -565,6 +565,24 @@ INSERT INTO atest5(two) VALUES (6) ON CONFLICT (two) DO UPDATE set three = EXCLU INSERT INTO atest5(two) VALUES (6) ON CONFLICT (two) DO UPDATE set three = EXCLUDED.three; INSERT INTO atest5(two) VALUES (6) ON CONFLICT (two) DO UPDATE set one = 8; -- fails (due to UPDATE) INSERT INTO atest5(three) VALUES (4) ON CONFLICT (two) DO UPDATE set three = 10; -- fails (due to INSERT) +-- Check that column level privileges are enforced for ON CONFLICT ... WHERE +-- Ok. we may select one +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT WHERE atest5.one = 1 RETURNING atest5.two; +-- Error. No select rights on three +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT WHERE atest5.three = 1 RETURNING atest5.two; + +-- Check that ON CONFLICT ... 
SELECT FOR UPDATE/SHARE requires an updatable column +SET SESSION AUTHORIZATION regress_priv_user1; +REVOKE UPDATE (three) ON atest5 FROM regress_priv_user4; +SET SESSION AUTHORIZATION regress_priv_user4; + +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT FOR UPDATE RETURNING atest5.two; -- fails + +SET SESSION AUTHORIZATION regress_priv_user1; +GRANT UPDATE (three) ON atest5 TO regress_priv_user4; +SET SESSION AUTHORIZATION regress_priv_user4; + +INSERT INTO atest5(two) VALUES (2) ON CONFLICT (two) DO SELECT FOR UPDATE RETURNING atest5.two; -- ok -- Check that the columns in the inference require select privileges INSERT INTO atest5(four) VALUES (4); -- fail diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index fb6502d497..6b3566271d 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -142,6 +142,21 @@ INSERT INTO rls_test_tgt VALUES (3, 'tgt a') ON CONFLICT (a) DO UPDATE SET b = ' INSERT INTO rls_test_tgt VALUES (3, 'tgt c') ON CONFLICT (a) DO UPDATE SET b = 'tgt d' RETURNING *; ROLLBACK; +-- INSERT ... ON CONFLICT DO SELECT should apply INSERT CHECK and SELECT USING +-- policy clauses to values proposed for insert. In the event of a conflict it +-- should also apply SELECT USING policy clauses to the existing values. +BEGIN; +INSERT INTO rls_test_tgt VALUES (4, 'tgt a') ON CONFLICT (a) DO SELECT RETURNING *; +INSERT INTO rls_test_tgt VALUES (4, 'tgt b') ON CONFLICT (a) DO SELECT RETURNING *; +ROLLBACK; + +-- INSERT ... ON CONFLICT DO SELECT FOR UPDATE should also apply UPDATE USING +-- policy clauses to the existing values, in the event of a conflict. +BEGIN; +INSERT INTO rls_test_tgt VALUES (5, 'tgt a') ON CONFLICT (a) DO SELECT FOR UPDATE RETURNING *; +INSERT INTO rls_test_tgt VALUES (5, 'tgt b') ON CONFLICT (a) DO SELECT FOR UPDATE RETURNING *; +ROLLBACK; + -- MERGE should always apply SELECT USING policy clauses to both source and -- target rows MERGE INTO rls_test_tgt t USING rls_test_src s ON t.a = s.a @@ -953,11 +968,51 @@ INSERT INTO document VALUES (4, (SELECT cid from category WHERE cname = 'novel') INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'my first novel') ON CONFLICT (did) DO UPDATE SET dauthor = 'regress_rls_carol'; +-- +-- INSERT ... 
ON CONFLICT DO SELECT and Row-level security +-- +SET SESSION AUTHORIZATION regress_rls_alice; +DROP POLICY p3_with_all ON document; + +CREATE POLICY p1_select_novels ON document FOR SELECT + USING (cid = (SELECT cid from category WHERE cname = 'novel')); +CREATE POLICY p2_insert_own ON document FOR INSERT + WITH CHECK (dauthor = current_user); +CREATE POLICY p3_update_novels ON document FOR UPDATE + USING (cid = (SELECT cid from category WHERE cname = 'novel') AND dlevel = 1) + WITH CHECK (dauthor = current_user); + +SET SESSION AUTHORIZATION regress_rls_bob; + +-- DO SELECT requires SELECT rights, should succeed for novel +INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT RETURNING did, dauthor, dtitle; + +-- DO SELECT requires SELECT rights, should fail for non-novel +INSERT INTO document VALUES (33, (SELECT cid from category WHERE cname = 'science fiction'), 1, 'regress_rls_bob', 'another sci-fi') + ON CONFLICT (did) DO SELECT RETURNING did, dauthor, dtitle; + +-- DO SELECT with WHERE and EXCLUDED reference +INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT WHERE excluded.dlevel = 1 RETURNING did, dauthor, dtitle; + +-- DO SELECT FOR UPDATE requires both SELECT and UPDATE rights, should succeed for novel and dlevel = 1 +INSERT INTO document VALUES (1, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT FOR UPDATE RETURNING did, dauthor, dtitle; + +-- should fail UPDATE USING policy for novel with dlevel = 2 +INSERT INTO document VALUES (2, (SELECT cid from category WHERE cname = 'novel'), 1, 'regress_rls_bob', 'another novel') + ON CONFLICT (did) DO SELECT FOR UPDATE RETURNING did, dauthor, dtitle; + +SET SESSION AUTHORIZATION regress_rls_alice; +DROP POLICY p1_select_novels ON document; +DROP POLICY p2_insert_own ON document; +DROP POLICY p3_update_novels ON document; + -- -- MERGE -- RESET SESSION AUTHORIZATION; -DROP POLICY p3_with_all ON document; ALTER TABLE document ADD COLUMN dnotes text DEFAULT ''; -- all documents are readable diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 3f240bec7b..40f5c16e54 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1205,6 +1205,32 @@ SELECT * FROM hat_data WHERE hat_name IN ('h8', 'h9', 'h7') ORDER BY hat_name; DROP RULE hat_upsert ON hats; +-- DO SELECT with a WHERE clause +CREATE RULE hat_confsel AS ON INSERT TO hats + DO INSTEAD + INSERT INTO hat_data VALUES ( + NEW.hat_name, + NEW.hat_color) + ON CONFLICT (hat_name) + DO SELECT FOR UPDATE + WHERE excluded.hat_color <> 'forbidden' AND hat_data.* != excluded.* + RETURNING *; +SELECT definition FROM pg_rules WHERE tablename = 'hats' ORDER BY rulename; + +-- fails without RETURNING +INSERT INTO hats VALUES ('h7', 'blue'); + +-- works (returns conflicts) +EXPLAIN (costs off) +INSERT INTO hats VALUES ('h7', 'blue') RETURNING *; +INSERT INTO hats VALUES ('h7', 'blue') RETURNING *; + +-- conflicts excluded by WHERE clause +INSERT INTO hats VALUES ('h7', 'forbidden') RETURNING *; +INSERT INTO hats VALUES ('h7', 'black') RETURNING *; + +DROP RULE hat_confsel ON hats; + drop table hats; drop table hat_data; diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql index cc878455ac..ea39817ee3 100644 --- a/src/test/regress/sql/triggers.sql +++ 
b/src/test/regress/sql/triggers.sql @@ -1148,7 +1148,7 @@ drop function trigger_ddl_func(); -- -- Verify behavior of before and after triggers with INSERT...ON CONFLICT --- DO UPDATE +-- DO UPDATE and DO SELECT -- create table upsert (key int4 primary key, color text); @@ -1197,6 +1197,7 @@ insert into upsert values(5, 'purple') on conflict (key) do update set color = ' insert into upsert values(6, 'white') on conflict (key) do update set color = 'updated ' || upsert.color; insert into upsert values(7, 'pink') on conflict (key) do update set color = 'updated ' || upsert.color; insert into upsert values(8, 'yellow') on conflict (key) do update set color = 'updated ' || upsert.color; +insert into upsert values(8, 'blue') on conflict (key) do select for update where upsert.color = 'yellow trig modified' returning old.*, new.*, upsert.*; select * from upsert; diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index c071fffc11..1635adde2d 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -106,6 +106,12 @@ INSERT INTO rw_view15 (a) VALUES (3) ON CONFLICT (a) DO UPDATE set a = excluded. SELECT * FROM rw_view15; INSERT INTO rw_view15 (a) VALUES (3) ON CONFLICT (a) DO UPDATE set upper = 'blarg'; -- fails SELECT * FROM rw_view15; +INSERT INTO rw_view15 (a) VALUES (3) + ON CONFLICT (a) DO UPDATE SET a = excluded.a WHERE excluded.upper = 'UNSPECIFIED' + RETURNING old, new; +INSERT INTO rw_view15 (a) VALUES (3) + ON CONFLICT (a) DO SELECT WHERE excluded.upper = 'UNSPECIFIED' RETURNING old, new; + SELECT * FROM rw_view15; ALTER VIEW rw_view15 ALTER COLUMN upper SET DEFAULT 'NOT SET'; INSERT INTO rw_view15 (a) VALUES (4); -- should fail @@ -1850,7 +1856,7 @@ insert into wcowrtest_v2 values (2, 'no such row in sometable'); drop view wcowrtest_v, wcowrtest_v2; drop table wcowrtest, sometable; --- Check INSERT .. ON CONFLICT DO UPDATE works correctly when the view's +-- Check INSERT .. ON CONFLICT DO SELECT/UPDATE works correctly when the view's -- columns are named and ordered differently than the underlying table's. 
create table uv_iocu_tab (a text unique, b float); insert into uv_iocu_tab values ('xyxyxy', 0); @@ -1863,6 +1869,8 @@ select * from uv_iocu_tab; insert into uv_iocu_view (a, b) values ('xyxyxy', 1) on conflict (a) do update set b = excluded.b; select * from uv_iocu_tab; +insert into uv_iocu_view (a, b) values ('xyxyxy', 1) + on conflict (a) do select where uv_iocu_view.c = 2 and excluded.c = 2 returning *; -- OK to access view columns that are not present in underlying base -- relation in the ON CONFLICT portion of the query @@ -1899,6 +1907,11 @@ insert into uv_iocu_view (aa,bb) values (1,'y') and excluded.bb != '' and excluded.cc is not null; select * from uv_iocu_view; +explain (costs off) +insert into uv_iocu_view (aa,bb) values (1,'Rejected: (y,1,"(1,y)")') + on conflict (aa) do select where uv_iocu_view.* = excluded.* returning *; +insert into uv_iocu_view (aa,bb) values (1,'Rejected: (y,1,"(1,y)")') + on conflict (aa) do select where uv_iocu_view.* = excluded.* returning *; -- Test omitting a column of the base relation delete from uv_iocu_view; @@ -1911,11 +1924,15 @@ alter table uv_iocu_tab alter column b set default 'table default'; insert into uv_iocu_view (aa) values (1) on conflict (aa) do update set bb = 'Rejected: '||excluded.*; select * from uv_iocu_view; +insert into uv_iocu_view (aa) values (1) + on conflict (aa) do select returning *; alter view uv_iocu_view alter column bb set default 'view default'; insert into uv_iocu_view (aa) values (1) on conflict (aa) do update set bb = 'Rejected: '||excluded.*; select * from uv_iocu_view; +insert into uv_iocu_view (aa) values (1) + on conflict (aa) do select returning *; -- Should fail to update non-updatable columns insert into uv_iocu_view (aa) values (1) @@ -1924,7 +1941,7 @@ insert into uv_iocu_view (aa) values (1) drop view uv_iocu_view; drop table uv_iocu_tab; --- ON CONFLICT DO UPDATE permissions checks +-- ON CONFLICT DO SELECT/UPDATE permissions checks create user regress_view_user1; create user regress_view_user2; @@ -1948,6 +1965,10 @@ insert into rw_view1 values ('zzz',2.0,1) on conflict (aa) do update set bb = rw_view1.bb||'xxx'; -- OK insert into rw_view1 values ('zzz',2.0,1) on conflict (aa) do update set cc = 3.0; -- Not allowed +insert into rw_view1 values ('yyy',2.0,1) + on conflict (aa) do select for update returning cc; -- Not allowed +insert into rw_view1 values ('yyy',2.0,1) + on conflict (aa) do select for update returning aa, bb; reset session authorization; select * from base_tbl; @@ -1960,9 +1981,13 @@ set session authorization regress_view_user2; create view rw_view2 as select b as bb, c as cc, a as aa from base_tbl; insert into rw_view2 (aa,bb) values (1,'xxx') on conflict (aa) do update set bb = excluded.bb; -- Not allowed +insert into rw_view2 (aa,bb) values (1,'xxx') + on conflict (aa) do select returning 1; -- Not allowed create view rw_view3 as select b as bb, a as aa from base_tbl; insert into rw_view3 (aa,bb) values (1,'xxx') on conflict (aa) do update set bb = excluded.bb; -- OK +insert into rw_view3 (aa,bb) values (1,'xxx') + on conflict (aa) do select returning aa, bb; -- OK reset session authorization; select * from base_tbl; @@ -1970,6 +1995,8 @@ set session authorization regress_view_user2; create view rw_view4 as select aa, bb, cc FROM rw_view1; insert into rw_view4 (aa,bb) values (1,'yyy') on conflict (aa) do update set bb = excluded.bb; -- Not allowed +insert into rw_view4 (aa,bb) values (1,'yyy') + on conflict (aa) do select returning 1; -- Not allowed create view rw_view5 as select 
aa, bb FROM rw_view1; insert into rw_view5 (aa,bb) values (1,'yyy') on conflict (aa) do update set bb = excluded.bb; -- OK diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 39c76691c8..6e2d876a40 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1840,9 +1840,9 @@ OldToNewMappingData OnCommitAction OnCommitItem OnConflictAction +OnConflictActionState OnConflictClause OnConflictExpr -OnConflictSetState OpClassCacheEnt OpExpr OpFamilyMember From cb7b2e5e8efb3e5fb08052425cd00f067a56f877 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 12 Feb 2026 11:32:49 -0600 Subject: [PATCH 095/147] Remove some unnecessary optimizations in popcount code. Over the past few releases, we've added a huge amount of complexity to our popcount implementations. Commits fbe327e5b4, 79e232ca01, 8c6653516c, and 25dc485074 did some preliminary refactoring, but many opportunities remain. In particular, if we disclaim interest in micro-optimizing this code for 32-bit builds and in unnecessary alignment checks on x86-64, we can remove a decent chunk of code. I cannot find public discussion or benchmarks for the code this commit removes, but it seems unlikely that this change will noticeably impact performance on affected systems. Suggested-by: John Naylor Reviewed-by: John Naylor Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/include/port/pg_bitutils.h | 16 +------- src/port/pg_bitutils.c | 30 --------------- src/port/pg_popcount_x86.c | 67 ++++++---------------------------- 3 files changed, 14 insertions(+), 99 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 35761f509e..20c11b79c6 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -333,13 +333,7 @@ pg_popcount(const char *buf, int bytes) * We set the threshold to the point at which we'll first use special * instructions in the optimized version. */ -#if SIZEOF_VOID_P >= 8 - int threshold = 8; -#else - int threshold = 4; -#endif - - if (bytes < threshold) + if (bytes < 8) { uint64 popcnt = 0; @@ -364,13 +358,7 @@ pg_popcount_masked(const char *buf, int bytes, bits8 mask) * We set the threshold to the point at which we'll first use special * instructions in the optimized version. */ -#if SIZEOF_VOID_P >= 8 - int threshold = 8; -#else - int threshold = 4; -#endif - - if (bytes < threshold) + if (bytes < 8) { uint64 popcnt = 0; diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index ffda75825e..bec06c06fc 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -167,20 +167,6 @@ pg_popcount_portable(const char *buf, int bytes) bytes -= 8; } - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_portable(*words++); - bytes -= 4; - } - buf = (const char *) words; } #endif @@ -215,22 +201,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) bytes -= 8; } - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. 
*/ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_portable(*words++ & maskv); - bytes -= 4; - } - buf = (const char *) words; } #endif diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 245f0167d0..7aebf69898 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -376,40 +376,20 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount_sse42 * Returns the number of 1-bits in buf */ +pg_attribute_no_sanitize_alignment() static uint64 pg_popcount_sse42(const char *buf, int bytes) { uint64 popcnt = 0; + const uint64 *words = (const uint64 *) buf; -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++); - bytes -= 4; - } - - buf = (const char *) words; - } -#endif + buf = (const char *) words; /* Process any remaining bytes */ while (bytes--) @@ -422,44 +402,21 @@ pg_popcount_sse42(const char *buf, int bytes) * pg_popcount_masked_sse42 * Returns the number of 1-bits in buf after applying the mask to each byte */ +pg_attribute_no_sanitize_alignment() static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) { uint64 popcnt = 0; - -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned */ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + const uint64 *words = (const uint64 *) buf; - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++ & maskv); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++ & maskv); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++ & maskv); - bytes -= 4; - } - buf = (const char *) words; - } -#endif + buf = (const char *) words; /* Process any remaining bytes */ while (bytes--) From a4688988835f1f5e607040c8d89e52cbfba9369b Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 12 Feb 2026 11:32:49 -0600 Subject: [PATCH 096/147] Remove specialized word-length popcount implementations. The uses of these functions do not justify the level of micro-optimization we've done and may even hurt performance in some cases (e.g., due to using function pointers). This commit removes all architecture-specific implementations of pg_popcount{32,64} and converts the portable ones to inlined functions in pg_bitutils.h. These inlined versions should produce the same code as before (but inlined), so in theory this is a net gain for many machines. 
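As an illustration of the effect, consider a hypothetical caller (not part of this patch): before this commit, pg_popcount64() was an extern function, and on x86-64 an indirect call through a function pointer; with the static inline in pg_bitutils.h, the compiler can reduce the call to __builtin_popcountll() and typically a single instruction:

    /* Hypothetical caller, for illustration only. */
    static int
    count_set_flags(uint64 flags)
    {
        /* formerly an extern or function-pointer call; now inlined to the builtin */
        return pg_popcount64(flags);
    }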
A follow-up commit will replace the remaining loops over these word-length popcount functions with calls to pg_popcount(), further reducing the need for architecture-specific implementations. Suggested-by: John Naylor Reviewed-by: John Naylor Reviewed-by: Greg Burd Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/include/port/pg_bitutils.h | 75 +++++++++++++++++++++++----------- src/port/pg_bitutils.c | 65 +---------------------------- src/port/pg_popcount_aarch64.c | 20 +++------ src/port/pg_popcount_x86.c | 43 +------------------ 4 files changed, 59 insertions(+), 144 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 20c11b79c6..789663edd9 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -276,46 +276,73 @@ pg_ceil_log2_64(uint64 num) return pg_leftmost_one_pos64(num - 1) + 1; } -extern int pg_popcount32_portable(uint32 word); -extern int pg_popcount64_portable(uint64 word); extern uint64 pg_popcount_portable(const char *buf, int bytes); extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask); -#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) /* - * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check + * Attempt to use specialized CPU instructions, but perform a runtime check * first. */ -extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); -extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -#elif defined(USE_NEON) -/* Use the Neon version of pg_popcount{32,64} without function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); - -/* - * We can try to use an SVE-optimized pg_popcount() on some systems For that, - * we do use a function pointer. - */ -#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK -extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); -extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); #else +/* Use a portable implementation -- no need for a function pointer. */ extern uint64 pg_popcount_optimized(const char *buf, int bytes); extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); + #endif -#else -/* Use a portable implementation -- no need for a function pointer. 
*/ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); -extern uint64 pg_popcount_optimized(const char *buf, int bytes); -extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); +/* + * pg_popcount32 + * Return the number of 1 bits set in word + */ +static inline int +pg_popcount32(uint32 word) +{ +#ifdef HAVE__BUILTIN_POPCOUNT + return __builtin_popcount(word); +#else /* !HAVE__BUILTIN_POPCOUNT */ + int result = 0; + + while (word != 0) + { + result += pg_number_of_ones[word & 255]; + word >>= 8; + } + return result; +#endif /* HAVE__BUILTIN_POPCOUNT */ +} + +/* + * pg_popcount64 + * Return the number of 1 bits set in word + */ +static inline int +pg_popcount64(uint64 word) +{ +#ifdef HAVE__BUILTIN_POPCOUNT +#if SIZEOF_LONG == 8 + return __builtin_popcountl(word); +#elif SIZEOF_LONG_LONG == 8 + return __builtin_popcountll(word); +#else +#error "cannot find integer of the same size as uint64_t" #endif +#else /* !HAVE__BUILTIN_POPCOUNT */ + int result = 0; + + while (word != 0) + { + result += pg_number_of_ones[word & 255]; + word >>= 8; + } + + return result; +#endif /* HAVE__BUILTIN_POPCOUNT */ +} /* * Returns the number of 1-bits in buf. diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index bec06c06fc..49b130f130 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -96,56 +96,6 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -/* - * pg_popcount32_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount32_portable(uint32 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - -/* - * pg_popcount64_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount64_portable(uint64 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - /* * pg_popcount_portable * Returns the number of 1-bits in buf @@ -163,7 +113,7 @@ pg_popcount_portable(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++); + popcnt += pg_popcount64(*words++); bytes -= 8; } @@ -197,7 +147,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++ & maskv); + popcnt += pg_popcount64(*words++ & maskv); bytes -= 8; } @@ -220,17 +170,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) * actual external functions. The compiler should be able to inline the * portable versions here. 
*/ -int -pg_popcount32(uint32 word) -{ - return pg_popcount32_portable(word); -} - -int -pg_popcount64(uint64 word) -{ - return pg_popcount64_portable(word); -} /* * pg_popcount_optimized diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index ba57f2cd4b..f474ef4551 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -292,21 +292,11 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) #endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ /* - * pg_popcount32 + * pg_popcount64_neon * Return number of 1 bits in word */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount64((uint64) word); -} - -/* - * pg_popcount64 - * Return number of 1 bits in word - */ -int -pg_popcount64(uint64 word) +static inline int +pg_popcount64_neon(uint64 word) { /* * For some compilers, __builtin_popcountl() already emits Neon @@ -383,7 +373,7 @@ pg_popcount_neon(const char *buf, int bytes) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf)); + popcnt += pg_popcount64_neon(*((const uint64 *) buf)); buf += sizeof(uint64); } @@ -465,7 +455,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf) & mask64); + popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64); buf += sizeof(uint64); } diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 7aebf69898..6bce089432 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -36,8 +36,6 @@ * operation, but in practice this is close enough, and "sse42" seems easier to * follow than "popcnt" for these names. */ -static inline int pg_popcount32_sse42(uint32 word); -static inline int pg_popcount64_sse42(uint64 word); static uint64 pg_popcount_sse42(const char *buf, int bytes); static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask); @@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); * what the current CPU supports) and then will call the pointer to fulfill the * caller's request. */ -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; @@ -157,7 +151,7 @@ pg_popcount_avx512_available(void) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ /* - * These functions get called on the first call to pg_popcount32 etc. + * These functions get called on the first call to pg_popcount(), etc. * They detect whether we can use the asm implementations, and replace * the function pointers so that subsequent calls are routed directly to * the chosen implementation. 
@@ -167,15 +161,11 @@ choose_popcount_functions(void) { if (pg_popcount_sse42_available()) { - pg_popcount32 = pg_popcount32_sse42; - pg_popcount64 = pg_popcount64_sse42; pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; } else { - pg_popcount32 = pg_popcount32_portable; - pg_popcount64 = pg_popcount64_portable; pg_popcount_optimized = pg_popcount_portable; pg_popcount_masked_optimized = pg_popcount_masked_portable; } @@ -189,20 +179,6 @@ choose_popcount_functions(void) #endif } -static int -pg_popcount32_choose(uint32 word) -{ - choose_popcount_functions(); - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - choose_popcount_functions(); - return pg_popcount64(word); -} - static uint64 pg_popcount_choose(const char *buf, int bytes) { @@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32_sse42 - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount32_sse42(uint32 word) -{ -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif -} - /* * pg_popcount64_sse42 * Return the number of 1 bits set in word From d7a4291bb73e891243de7649ba92e7337a476434 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 12 Feb 2026 19:41:02 +0200 Subject: [PATCH 097/147] Fix comment neglected in commit ddc3250208 I renamed the field in commit ddc3250208, but missed this one reference. --- src/include/replication/slot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index 72f8be629f..4b4709f6e2 100644 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -215,7 +215,7 @@ typedef struct ReplicationSlot /* is somebody performing io on this slot? */ LWLock io_in_progress_lock; - /* Condition variable signaled when active_pid changes */ + /* Condition variable signaled when active_proc changes */ ConditionVariable active_cv; /* all the remaining data is only used for logical slots */ From d7edcec35c7c28edb3bf283dfe9c892b042ca158 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 12 Feb 2026 19:41:06 +0200 Subject: [PATCH 098/147] Make pg_numa_query_pages() work in frontend programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's currently only used in the server, but it was placed in src/port with the idea that it might be useful in client programs too. However, it will currently fail to link if used in a client program, because CHECK_FOR_INTERRUPTS() is not usable in client programs. Fix that by wrapping it in "#ifndef FRONTEND". Reviewed-by: Álvaro Herrera Discussion: https://www.postgresql.org/message-id/21cc7a48-99d9-4f69-9a3f-2c2de61ac8e5%40iki.fi Backpatch-through: 18 --- src/port/pg_numa.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c index d574a686b4..8954669273 100644 --- a/src/port/pg_numa.c +++ b/src/port/pg_numa.c @@ -87,7 +87,9 @@ pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) unsigned long count_chunk = Min(count - next, NUMA_QUERY_CHUNK_SIZE); +#ifndef FRONTEND CHECK_FOR_INTERRUPTS(); +#endif /* * Bail out if any of the chunks errors out (ret<0). 
We ignore (ret>0) From 775fc014156bdfa6938ef02dce3d85364b1bd220 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 13 Feb 2026 12:17:08 +0900 Subject: [PATCH 099/147] Improve error message for checksum failures in pgstat_database.c This log message was referring to conflicts, but it is about checksum failures. The log message improved in this commit should never show up, because pgstat_prepare_report_checksum_failure() should always be called before pgstat_report_checksum_failures_in_db(), with a stats entry already created in the pgstats shared hash table. The three code paths able to report database-level checksum failures already follow this requirement. Oversight in b96d3c389755. Author: Wang Peng <215722532@qq.com> Discussion: https://postgr.es/m/tencent_9B6CD6D9D34AE28CDEADEC6188DB3BA1FE07@qq.com Backpatch-through: 18 --- src/backend/utils/activity/pgstat_database.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index 6309909bcd..933dcb5cae 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ b/src/backend/utils/activity/pgstat_database.c @@ -200,7 +200,7 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) Assert(entry_ref); if (!entry_ref) { - elog(WARNING, "could not report %d conflicts for DB %u", + elog(WARNING, "could not report %d checksum failures for database %u", failurecount, dboid); return; } From 53c6bd0aa3de58baf828e60c6c8934d0a10a8501 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Fri, 13 Feb 2026 11:36:31 +0100 Subject: [PATCH 100/147] Restart BackgroundPsql's timer more nicely. Use BackgroundPsql's published API for automatically restarting its timer for each query, rather than manually reaching into it to achieve the same thing. 010_tab_completion.pl's logic for this predates the invention of BackgroundPsql (and 664d75753 missed the opportunity to make it cleaner). 030_pager.pl copied-and-pasted the code. Author: Daniel Gustafsson Reviewed-by: Heikki Linnakangas Reviewed-by: Andrew Dunstan Reviewed-by: Tom Lane Discussion: https://postgr.es/m/1100715.1712265845@sss.pgh.pa.us --- src/bin/psql/t/010_tab_completion.pl | 7 +++---- src/bin/psql/t/030_pager.pl | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/bin/psql/t/010_tab_completion.pl b/src/bin/psql/t/010_tab_completion.pl index 7104aba239..1d2e5f5b92 100644 --- a/src/bin/psql/t/010_tab_completion.pl +++ b/src/bin/psql/t/010_tab_completion.pl @@ -77,8 +77,10 @@ # for possible debugging purposes.
my $historyfile = "${PostgreSQL::Test::Utils::log_path}/010_psql_history.txt"; -# fire up an interactive psql session +# fire up an interactive psql session and configure it such that each query +# restarts the timer my $h = $node->interactive_psql('postgres', history_file => $historyfile); +$h->set_query_timer_restart(); # Simple test case: type something and see if psql responds as expected sub check_completion @@ -88,9 +90,6 @@ sub check_completion # report test failures from caller location local $Test::Builder::Level = $Test::Builder::Level + 1; - # restart per-command timer - $h->{timeout}->start($PostgreSQL::Test::Utils::timeout_default); - # send the data to be sent and wait for its result my $out = $h->query_until($pattern, $send); my $okay = ($out =~ $pattern && !$h->{timeout}->is_expired); diff --git a/src/bin/psql/t/030_pager.pl b/src/bin/psql/t/030_pager.pl index a35f2b2629..d3f964639d 100644 --- a/src/bin/psql/t/030_pager.pl +++ b/src/bin/psql/t/030_pager.pl @@ -70,8 +70,10 @@ 25 as y, 26 as z'); -# fire up an interactive psql session +# fire up an interactive psql session and configure it such that each query +# restarts the timer my $h = $node->interactive_psql('postgres'); +$h->set_query_timer_restart(); # set the pty's window size to known values # (requires undesirable chumminess with the innards of IPC::Run) @@ -88,9 +90,6 @@ sub do_command # report test failures from caller location local $Test::Builder::Level = $Test::Builder::Level + 1; - # restart per-command timer - $h->{timeout}->start($PostgreSQL::Test::Utils::timeout_default); - # send the data to be sent and wait for its result my $out = $h->query_until($pattern, $send); my $okay = ($out =~ $pattern && !$h->{timeout}->is_expired); From 6736dea14afbe239588dad1c947ceb6e50adbf72 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 13 Feb 2026 19:48:35 +0900 Subject: [PATCH 101/147] pg_dump: Use pg_malloc_object() and pg_malloc_array() The idea is to further encourage the use of these allocation routines across the tree, as these offer stronger type safety guarantees than pg_malloc() & co (type cast in the result, sizeof() embedded). This set of changes is dedicated to the pg_dump code. Similar work has been done in 31d3847a37be, as one example.
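To make the pattern concrete, a typical conversion from this patch looks like this; the wrappers embed both the result cast and the sizeof(), so the type name is written only once:

    /* before: the type appears three times, and a wrong sizeof() compiles silently */
    TocEntry   *newToc = (TocEntry *) pg_malloc0(sizeof(TocEntry));
    DumpId     *deps = (DumpId *) pg_malloc(opts->nDeps * sizeof(DumpId));

    /* after: pg_malloc0_object()/pg_malloc_array() supply the cast and sizeof() */
    TocEntry   *newToc = pg_malloc0_object(TocEntry);
    DumpId     *deps = pg_malloc_array(DumpId, opts->nDeps);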
Author: Peter Smith Reviewed-by: Aleksander Alekseev Discussion: https://postgr.es/m/CAHut+PvpGPDLhkHAoxw_g3jdrYxA1m16a8uagbgH3TGWSKtXNQ@mail.gmail.com --- src/bin/pg_dump/compress_gzip.c | 6 +- src/bin/pg_dump/compress_io.c | 4 +- src/bin/pg_dump/compress_lz4.c | 4 +- src/bin/pg_dump/compress_none.c | 2 +- src/bin/pg_dump/compress_zstd.c | 2 +- src/bin/pg_dump/connectdb.c | 8 +- src/bin/pg_dump/dumputils.c | 8 +- src/bin/pg_dump/parallel.c | 14 +-- src/bin/pg_dump/pg_backup_archiver.c | 40 +++--- src/bin/pg_dump/pg_backup_custom.c | 8 +- src/bin/pg_dump/pg_backup_directory.c | 8 +- src/bin/pg_dump/pg_dump.c | 174 +++++++++++++------------- src/bin/pg_dump/pg_dump_sort.c | 12 +- src/bin/pg_dump/pg_dumpall.c | 2 +- 14 files changed, 145 insertions(+), 147 deletions(-) diff --git a/src/bin/pg_dump/compress_gzip.c b/src/bin/pg_dump/compress_gzip.c index 41a3d059f9..c9ce8a53aa 100644 --- a/src/bin/pg_dump/compress_gzip.c +++ b/src/bin/pg_dump/compress_gzip.c @@ -57,8 +57,8 @@ DeflateCompressorInit(CompressorState *cs) GzipCompressorState *gzipcs; z_streamp zp; - gzipcs = (GzipCompressorState *) pg_malloc0(sizeof(GzipCompressorState)); - zp = gzipcs->zp = (z_streamp) pg_malloc(sizeof(z_stream)); + gzipcs = pg_malloc0_object(GzipCompressorState); + zp = gzipcs->zp = pg_malloc_object(z_stream); zp->zalloc = Z_NULL; zp->zfree = Z_NULL; zp->opaque = Z_NULL; @@ -178,7 +178,7 @@ ReadDataFromArchiveGzip(ArchiveHandle *AH, CompressorState *cs) char *buf; size_t buflen; - zp = (z_streamp) pg_malloc(sizeof(z_stream)); + zp = pg_malloc_object(z_stream); zp->zalloc = Z_NULL; zp->zfree = Z_NULL; zp->opaque = Z_NULL; diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c index af47ef8883..52652b0d97 100644 --- a/src/bin/pg_dump/compress_io.c +++ b/src/bin/pg_dump/compress_io.c @@ -125,7 +125,7 @@ AllocateCompressor(const pg_compress_specification compression_spec, { CompressorState *cs; - cs = (CompressorState *) pg_malloc0(sizeof(CompressorState)); + cs = pg_malloc0_object(CompressorState); cs->readF = readF; cs->writeF = writeF; @@ -195,7 +195,7 @@ InitCompressFileHandle(const pg_compress_specification compression_spec) { CompressFileHandle *CFH; - CFH = pg_malloc0(sizeof(CompressFileHandle)); + CFH = pg_malloc0_object(CompressFileHandle); if (compression_spec.algorithm == PG_COMPRESSION_NONE) InitCompressFileHandleNone(CFH, compression_spec); diff --git a/src/bin/pg_dump/compress_lz4.c b/src/bin/pg_dump/compress_lz4.c index 20a8741d3c..b72bad130a 100644 --- a/src/bin/pg_dump/compress_lz4.c +++ b/src/bin/pg_dump/compress_lz4.c @@ -305,7 +305,7 @@ InitCompressorLZ4(CompressorState *cs, const pg_compress_specification compressi if (cs->readF) return; - state = pg_malloc0(sizeof(*state)); + state = pg_malloc0_object(LZ4State); if (cs->compression_spec.level >= 0) state->prefs.compressionLevel = cs->compression_spec.level; @@ -754,7 +754,7 @@ InitCompressFileHandleLZ4(CompressFileHandle *CFH, CFH->get_error_func = LZ4Stream_get_error; CFH->compression_spec = compression_spec; - state = pg_malloc0(sizeof(*state)); + state = pg_malloc0_object(LZ4State); if (CFH->compression_spec.level >= 0) state->prefs.compressionLevel = CFH->compression_spec.level; diff --git a/src/bin/pg_dump/compress_none.c b/src/bin/pg_dump/compress_none.c index 9997519e35..d862d8ca6e 100644 --- a/src/bin/pg_dump/compress_none.c +++ b/src/bin/pg_dump/compress_none.c @@ -124,7 +124,7 @@ InitCompressorNone(CompressorState *cs, { NoneCompressorState *nonecs; - nonecs = (NoneCompressorState *) 
pg_malloc(sizeof(NoneCompressorState)); + nonecs = pg_malloc_object(NoneCompressorState); nonecs->buflen = DEFAULT_IO_BUFFER_SIZE; nonecs->buffer = pg_malloc(nonecs->buflen); nonecs->bufdata = 0; diff --git a/src/bin/pg_dump/compress_zstd.c b/src/bin/pg_dump/compress_zstd.c index 889691aa0c..cf2db2649a 100644 --- a/src/bin/pg_dump/compress_zstd.c +++ b/src/bin/pg_dump/compress_zstd.c @@ -219,7 +219,7 @@ InitCompressorZstd(CompressorState *cs, cs->compression_spec = compression_spec; - zstdcs = (ZstdCompressorState *) pg_malloc0(sizeof(*zstdcs)); + zstdcs = pg_malloc0_object(ZstdCompressorState); cs->private_data = zstdcs; /* We expect that exactly one of readF/writeF is specified */ diff --git a/src/bin/pg_dump/connectdb.c b/src/bin/pg_dump/connectdb.c index 388d29d0ae..f3ce8b1cfb 100644 --- a/src/bin/pg_dump/connectdb.c +++ b/src/bin/pg_dump/connectdb.c @@ -89,8 +89,8 @@ ConnectDatabase(const char *dbname, const char *connection_string, argcount++; } - keywords = pg_malloc0((argcount + 1) * sizeof(*keywords)); - values = pg_malloc0((argcount + 1) * sizeof(*values)); + keywords = pg_malloc0_array(const char *, (argcount + 1)); + values = pg_malloc0_array(const char *, (argcount + 1)); for (conn_opt = conn_opts; conn_opt->keyword != NULL; conn_opt++) { @@ -105,8 +105,8 @@ ConnectDatabase(const char *dbname, const char *connection_string, } else { - keywords = pg_malloc0((argcount + 1) * sizeof(*keywords)); - values = pg_malloc0((argcount + 1) * sizeof(*values)); + keywords = pg_malloc0_array(const char *, (argcount + 1)); + values = pg_malloc0_array(const char *, (argcount + 1)); } if (pghost) diff --git a/src/bin/pg_dump/dumputils.c b/src/bin/pg_dump/dumputils.c index acfa3f22cc..5bc77fed97 100644 --- a/src/bin/pg_dump/dumputils.c +++ b/src/bin/pg_dump/dumputils.c @@ -160,7 +160,7 @@ buildACLCommands(const char *name, const char *subname, const char *nspname, * Besides, a false mismatch will just cause the output to be a little * more verbose than it really needed to be. */ - grantitems = (char **) pg_malloc(naclitems * sizeof(char *)); + grantitems = pg_malloc_array(char *, naclitems); for (i = 0; i < naclitems; i++) { bool found = false; @@ -176,7 +176,7 @@ buildACLCommands(const char *name, const char *subname, const char *nspname, if (!found) grantitems[ngrantitems++] = aclitems[i]; } - revokeitems = (char **) pg_malloc(nbaseitems * sizeof(char *)); + revokeitems = pg_malloc_array(char *, nbaseitems); for (i = 0; i < nbaseitems; i++) { bool found = false; @@ -774,8 +774,8 @@ SplitGUCList(char *rawstring, char separator, * overestimate of the number of pointers we could need. Allow one for * list terminator. 
*/ - *namelist = nextptr = (char **) - pg_malloc((strlen(rawstring) / 2 + 2) * sizeof(char *)); + *namelist = nextptr = + pg_malloc_array(char *, (strlen(rawstring) / 2 + 2)); *nextptr = NULL; while (isspace((unsigned char) *nextp)) diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c index ddaf08faa3..56cb2c1f32 100644 --- a/src/bin/pg_dump/parallel.c +++ b/src/bin/pg_dump/parallel.c @@ -469,7 +469,7 @@ WaitForTerminatingWorkers(ParallelState *pstate) } #else /* WIN32 */ /* On Windows, we must use WaitForMultipleObjects() */ - HANDLE *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers); + HANDLE *lpHandles = pg_malloc_array(HANDLE, pstate->numWorkers); int nrun = 0; DWORD ret; uintptr_t hThread; @@ -903,7 +903,7 @@ ParallelBackupStart(ArchiveHandle *AH) Assert(AH->public.numWorkers > 0); - pstate = (ParallelState *) pg_malloc(sizeof(ParallelState)); + pstate = pg_malloc_object(ParallelState); pstate->numWorkers = AH->public.numWorkers; pstate->te = NULL; @@ -913,10 +913,10 @@ ParallelBackupStart(ArchiveHandle *AH) return pstate; /* Create status arrays, being sure to initialize all fields to 0 */ - pstate->te = (TocEntry **) - pg_malloc0(pstate->numWorkers * sizeof(TocEntry *)); - pstate->parallelSlot = (ParallelSlot *) - pg_malloc0(pstate->numWorkers * sizeof(ParallelSlot)); + pstate->te = + pg_malloc0_array(TocEntry *, pstate->numWorkers); + pstate->parallelSlot = + pg_malloc0_array(ParallelSlot, pstate->numWorkers); #ifdef WIN32 /* Make fmtId() and fmtQualifiedId() use thread-local storage */ @@ -969,7 +969,7 @@ ParallelBackupStart(ArchiveHandle *AH) #ifdef WIN32 /* Create transient structure to pass args to worker function */ - wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo)); + wi = pg_malloc_object(WorkerInfo); wi->AH = AH; wi->slot = slot; diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 35d3a07915..9007f7a0c4 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -134,7 +134,7 @@ static void StrictNamesCheck(RestoreOptions *ropt); DumpOptions * NewDumpOptions(void) { - DumpOptions *opts = (DumpOptions *) pg_malloc(sizeof(DumpOptions)); + DumpOptions *opts = pg_malloc_object(DumpOptions); InitDumpOptions(opts); return opts; @@ -1107,7 +1107,7 @@ NewRestoreOptions(void) { RestoreOptions *opts; - opts = (RestoreOptions *) pg_malloc0(sizeof(RestoreOptions)); + opts = pg_malloc0_object(RestoreOptions); /* set any fields that shouldn't default to zeroes */ opts->format = archUnknown; @@ -1244,7 +1244,7 @@ ArchiveEntry(Archive *AHX, CatalogId catalogId, DumpId dumpId, ArchiveHandle *AH = (ArchiveHandle *) AHX; TocEntry *newToc; - newToc = (TocEntry *) pg_malloc0(sizeof(TocEntry)); + newToc = pg_malloc0_object(TocEntry); AH->tocCount++; if (dumpId > AH->maxDumpId) @@ -1272,7 +1272,7 @@ ArchiveEntry(Archive *AHX, CatalogId catalogId, DumpId dumpId, if (opts->nDeps > 0) { - newToc->dependencies = (DumpId *) pg_malloc(opts->nDeps * sizeof(DumpId)); + newToc->dependencies = pg_malloc_array(DumpId, opts->nDeps); memcpy(newToc->dependencies, opts->deps, opts->nDeps * sizeof(DumpId)); newToc->nDeps = opts->nDeps; } @@ -1575,7 +1575,7 @@ SortTocFromFile(Archive *AHX) StringInfoData linebuf; /* Allocate space for the 'wanted' array, and init it */ - ropt->idWanted = (bool *) pg_malloc0(sizeof(bool) * AH->maxDumpId); + ropt->idWanted = pg_malloc0_array(bool, AH->maxDumpId); /* Setup the file */ fh = fopen(ropt->tocFile, PG_BINARY_R); @@ -1990,8 +1990,8 @@ buildTocEntryArrays(ArchiveHandle 
*AH) DumpId maxDumpId = AH->maxDumpId; TocEntry *te; - AH->tocsByDumpId = (TocEntry **) pg_malloc0((maxDumpId + 1) * sizeof(TocEntry *)); - AH->tableDataId = (DumpId *) pg_malloc0((maxDumpId + 1) * sizeof(DumpId)); + AH->tocsByDumpId = pg_malloc0_array(TocEntry *, (maxDumpId + 1)); + AH->tableDataId = pg_malloc0_array(DumpId, (maxDumpId + 1)); for (te = AH->toc->next; te != AH->toc; te = te->next) { @@ -2385,7 +2385,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, pg_log_debug("allocating AH for %s, format %d", FileSpec ? FileSpec : "(stdio)", fmt); - AH = (ArchiveHandle *) pg_malloc0(sizeof(ArchiveHandle)); + AH = pg_malloc0_object(ArchiveHandle); AH->version = K_VERS_SELF; @@ -2422,7 +2422,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, AH->currTablespace = NULL; /* ditto */ AH->currTableAm = NULL; /* ditto */ - AH->toc = (TocEntry *) pg_malloc0(sizeof(TocEntry)); + AH->toc = pg_malloc0_object(TocEntry); AH->toc->next = AH->toc; AH->toc->prev = AH->toc; @@ -2509,7 +2509,7 @@ WriteDataChunks(ArchiveHandle *AH, ParallelState *pstate) TocEntry **tes; int ntes; - tes = (TocEntry **) pg_malloc(AH->tocCount * sizeof(TocEntry *)); + tes = pg_malloc_array(TocEntry *, AH->tocCount); ntes = 0; for (te = AH->toc->next; te != AH->toc; te = te->next) { @@ -2720,7 +2720,7 @@ ReadToc(ArchiveHandle *AH) for (i = 0; i < AH->tocCount; i++) { - te = (TocEntry *) pg_malloc0(sizeof(TocEntry)); + te = pg_malloc0_object(TocEntry); te->dumpId = ReadInt(AH); if (te->dumpId > AH->maxDumpId) @@ -2817,7 +2817,7 @@ ReadToc(ArchiveHandle *AH) if (AH->version >= K_VERS_1_5) { depSize = 100; - deps = (DumpId *) pg_malloc(sizeof(DumpId) * depSize); + deps = pg_malloc_array(DumpId, depSize); depIdx = 0; for (;;) { @@ -2827,7 +2827,7 @@ ReadToc(ArchiveHandle *AH) if (depIdx >= depSize) { depSize *= 2; - deps = (DumpId *) pg_realloc(deps, sizeof(DumpId) * depSize); + deps = pg_realloc_array(deps, DumpId, depSize); } sscanf(tmp, "%d", &deps[depIdx]); free(tmp); @@ -2836,7 +2836,7 @@ ReadToc(ArchiveHandle *AH) if (depIdx > 0) /* We have a non-null entry */ { - deps = (DumpId *) pg_realloc(deps, sizeof(DumpId) * depIdx); + deps = pg_realloc_array(deps, DumpId, depIdx); te->dependencies = deps; te->nDeps = depIdx; } @@ -4882,7 +4882,7 @@ fix_dependencies(ArchiveHandle *AH) { if (strcmp(te2->desc, "BLOBS") == 0) { - te->dependencies = (DumpId *) pg_malloc(sizeof(DumpId)); + te->dependencies = pg_malloc_object(DumpId); te->dependencies[0] = te2->dumpId; te->nDeps++; te->depCount++; @@ -4925,7 +4925,7 @@ fix_dependencies(ArchiveHandle *AH) for (te = AH->toc->next; te != AH->toc; te = te->next) { if (te->nRevDeps > 0) - te->revDeps = (DumpId *) pg_malloc(te->nRevDeps * sizeof(DumpId)); + te->revDeps = pg_malloc_array(DumpId, te->nRevDeps); te->nRevDeps = 0; } @@ -5040,7 +5040,7 @@ identify_locking_dependencies(ArchiveHandle *AH, TocEntry *te) * difference between a dependency on a table and a dependency on its * data, so that closer analysis would be needed here. 
*/ - lockids = (DumpId *) pg_malloc(te->nDeps * sizeof(DumpId)); + lockids = pg_malloc_array(DumpId, te->nDeps); nlockids = 0; for (i = 0; i < te->nDeps; i++) { @@ -5058,7 +5058,7 @@ identify_locking_dependencies(ArchiveHandle *AH, TocEntry *te) return; } - te->lockDeps = pg_realloc(lockids, nlockids * sizeof(DumpId)); + te->lockDeps = pg_realloc_array(lockids, DumpId, nlockids); te->nLockDeps = nlockids; } @@ -5148,11 +5148,11 @@ CloneArchive(ArchiveHandle *AH) ArchiveHandle *clone; /* Make a "flat" copy */ - clone = (ArchiveHandle *) pg_malloc(sizeof(ArchiveHandle)); + clone = pg_malloc_object(ArchiveHandle); memcpy(clone, AH, sizeof(ArchiveHandle)); /* Likewise flat-copy the RestoreOptions, so we can alter them locally */ - clone->public.ropt = (RestoreOptions *) pg_malloc(sizeof(RestoreOptions)); + clone->public.ropt = pg_malloc_object(RestoreOptions); memcpy(clone->public.ropt, AH->public.ropt, sizeof(RestoreOptions)); /* Handle format-independent fields */ diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 2226520dff..5299062094 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -136,7 +136,7 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) AH->WorkerJobRestorePtr = _WorkerJobRestoreCustom; /* Set up a private area. */ - ctx = (lclContext *) pg_malloc0(sizeof(lclContext)); + ctx = pg_malloc0_object(lclContext); AH->formatData = ctx; /* @@ -199,7 +199,7 @@ _ArchiveEntry(ArchiveHandle *AH, TocEntry *te) { lclTocEntry *ctx; - ctx = (lclTocEntry *) pg_malloc0(sizeof(lclTocEntry)); + ctx = pg_malloc0_object(lclTocEntry); if (te->dataDumper) ctx->dataState = K_OFFSET_POS_NOT_SET; else @@ -240,7 +240,7 @@ _ReadExtraToc(ArchiveHandle *AH, TocEntry *te) if (ctx == NULL) { - ctx = (lclTocEntry *) pg_malloc0(sizeof(lclTocEntry)); + ctx = pg_malloc0_object(lclTocEntry); te->formatData = ctx; } @@ -893,7 +893,7 @@ _Clone(ArchiveHandle *AH) /* * Each thread must have private lclContext working state. 
*/ - AH->formatData = (lclContext *) pg_malloc(sizeof(lclContext)); + AH->formatData = pg_malloc_object(lclContext); memcpy(AH->formatData, ctx, sizeof(lclContext)); ctx = (lclContext *) AH->formatData; diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index cd4036ead8..d6a1428c67 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -140,7 +140,7 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) AH->WorkerJobDumpPtr = _WorkerJobDumpDirectory; /* Set up our private context */ - ctx = (lclContext *) pg_malloc0(sizeof(lclContext)); + ctx = pg_malloc0_object(lclContext); AH->formatData = ctx; ctx->dataFH = NULL; @@ -200,7 +200,7 @@ _ArchiveEntry(ArchiveHandle *AH, TocEntry *te) lclTocEntry *tctx; char fn[MAXPGPATH]; - tctx = (lclTocEntry *) pg_malloc0(sizeof(lclTocEntry)); + tctx = pg_malloc0_object(lclTocEntry); if (strcmp(te->desc, "BLOBS") == 0) { snprintf(fn, MAXPGPATH, "blobs_%d.toc", te->dumpId); @@ -252,7 +252,7 @@ _ReadExtraToc(ArchiveHandle *AH, TocEntry *te) if (tctx == NULL) { - tctx = (lclTocEntry *) pg_malloc0(sizeof(lclTocEntry)); + tctx = pg_malloc0_object(lclTocEntry); te->formatData = tctx; } @@ -769,7 +769,7 @@ _Clone(ArchiveHandle *AH) { lclContext *ctx = (lclContext *) AH->formatData; - AH->formatData = (lclContext *) pg_malloc(sizeof(lclContext)); + AH->formatData = pg_malloc_object(lclContext); memcpy(AH->formatData, ctx, sizeof(lclContext)); ctx = (lclContext *) AH->formatData; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 2c3754d020..b4b7c234e2 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -1541,7 +1541,7 @@ setup_connection(Archive *AH, const char *dumpencoding, * Initialize prepared-query state to "nothing prepared". We do this here * so that a parallel dump worker will have its own state. */ - AH->is_prepared = (bool *) pg_malloc0(NUM_PREP_QUERIES * sizeof(bool)); + AH->is_prepared = pg_malloc0_array(bool, NUM_PREP_QUERIES); /* * Start transaction-snapshot mode transaction to dump consistent data. @@ -2569,7 +2569,7 @@ dumpTableData_insert(Archive *fout, const void *dcontext) * actual column value --- but we can save a few cycles by fetching nulls * rather than the uninteresting-to-us value. */ - attgenerated = (char *) pg_malloc(tbinfo->numatts * sizeof(char)); + attgenerated = pg_malloc_array(char, tbinfo->numatts); appendPQExpBufferStr(q, "DECLARE _pg_dump_cursor CURSOR FOR SELECT "); nfields = 0; for (i = 0; i < tbinfo->numatts; i++) @@ -3070,7 +3070,7 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo) return; /* OK, let's dump it */ - tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo)); + tdinfo = pg_malloc_object(TableDataInfo); if (tbinfo->relkind == RELKIND_MATVIEW) tdinfo->dobj.objType = DO_REFRESH_MATVIEW; @@ -4087,14 +4087,14 @@ getLOs(Archive *fout) * Create a "BLOBS" data item for the group, too. This is just a * placeholder for sorting; it carries no data now. 
*/ - lodata = (DumpableObject *) pg_malloc(sizeof(DumpableObject)); + lodata = pg_malloc_object(DumpableObject); lodata->objType = DO_LARGE_OBJECT_DATA; lodata->catId = nilCatalogId; AssignDumpId(lodata); lodata->name = pg_strdup(namebuf); lodata->components |= DUMP_COMPONENT_DATA; /* Set up explicit dependency from data to metadata */ - lodata->dependencies = (DumpId *) pg_malloc(sizeof(DumpId)); + lodata->dependencies = pg_malloc_object(DumpId); lodata->dependencies[0] = loinfo->dobj.dumpId; lodata->nDeps = lodata->allocDeps = 1; } @@ -4310,7 +4310,7 @@ getPolicies(Archive *fout, TableInfo tblinfo[], int numTables) * Note: use tableoid 0 so that this object won't be mistaken for * something that pg_depend entries apply to. */ - polinfo = pg_malloc(sizeof(PolicyInfo)); + polinfo = pg_malloc_object(PolicyInfo); polinfo->dobj.objType = DO_POLICY; polinfo->dobj.catId.tableoid = 0; polinfo->dobj.catId.oid = tbinfo->dobj.catId.oid; @@ -4366,7 +4366,7 @@ getPolicies(Archive *fout, TableInfo tblinfo[], int numTables) i_polqual = PQfnumber(res, "polqual"); i_polwithcheck = PQfnumber(res, "polwithcheck"); - polinfo = pg_malloc(ntups * sizeof(PolicyInfo)); + polinfo = pg_malloc_array(PolicyInfo, ntups); for (j = 0; j < ntups; j++) { @@ -4608,7 +4608,7 @@ getPublications(Archive *fout) i_pubviaroot = PQfnumber(res, "pubviaroot"); i_pubgencols = PQfnumber(res, "pubgencols"); - pubinfo = pg_malloc(ntups * sizeof(PublicationInfo)); + pubinfo = pg_malloc_array(PublicationInfo, ntups); for (i = 0; i < ntups; i++) { @@ -4787,7 +4787,7 @@ getPublicationNamespaces(Archive *fout) i_pnnspid = PQfnumber(res, "pnnspid"); /* this allocation may be more than we need */ - pubsinfo = pg_malloc(ntups * sizeof(PublicationSchemaInfo)); + pubsinfo = pg_malloc_array(PublicationSchemaInfo, ntups); j = 0; for (i = 0; i < ntups; i++) @@ -4886,7 +4886,7 @@ getPublicationTables(Archive *fout, TableInfo tblinfo[], int numTables) i_prattrs = PQfnumber(res, "prattrs"); /* this allocation may be more than we need */ - pubrinfo = pg_malloc(ntups * sizeof(PublicationRelInfo)); + pubrinfo = pg_malloc_array(PublicationRelInfo, ntups); j = 0; for (i = 0; i < ntups; i++) @@ -5263,7 +5263,7 @@ getSubscriptions(Archive *fout) i_suborigin = PQfnumber(res, "suborigin"); i_suboriginremotelsn = PQfnumber(res, "suboriginremotelsn"); - subinfo = pg_malloc(ntups * sizeof(SubscriptionInfo)); + subinfo = pg_malloc_array(SubscriptionInfo, ntups); for (i = 0; i < ntups; i++) { @@ -5357,7 +5357,7 @@ getSubscriptionRelations(Archive *fout) i_srsubstate = PQfnumber(res, "srsubstate"); i_srsublsn = PQfnumber(res, "srsublsn"); - subrinfo = pg_malloc(ntups * sizeof(SubRelInfo)); + subrinfo = pg_malloc_array(SubRelInfo, ntups); for (int i = 0; i < ntups; i++) { Oid cur_srsubid = atooid(PQgetvalue(res, i, i_srsubid)); @@ -5837,8 +5837,8 @@ collectBinaryUpgradeClassOids(Archive *fout) res = ExecuteSqlQuery(fout, query, PGRES_TUPLES_OK); nbinaryUpgradeClassOids = PQntuples(res); - binaryUpgradeClassOids = (BinaryUpgradeClassOidItem *) - pg_malloc(nbinaryUpgradeClassOids * sizeof(BinaryUpgradeClassOidItem)); + binaryUpgradeClassOids = + pg_malloc_array(BinaryUpgradeClassOidItem, nbinaryUpgradeClassOids); for (int i = 0; i < nbinaryUpgradeClassOids; i++) { @@ -6019,7 +6019,7 @@ getNamespaces(Archive *fout) ntups = PQntuples(res); - nsinfo = (NamespaceInfo *) pg_malloc(ntups * sizeof(NamespaceInfo)); + nsinfo = pg_malloc_array(NamespaceInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6151,7 +6151,7 @@ 
getExtensions(Archive *fout, int *numExtensions) if (ntups == 0) goto cleanup; - extinfo = (ExtensionInfo *) pg_malloc(ntups * sizeof(ExtensionInfo)); + extinfo = pg_malloc_array(ExtensionInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6250,7 +6250,7 @@ getTypes(Archive *fout) ntups = PQntuples(res); - tyinfo = (TypeInfo *) pg_malloc(ntups * sizeof(TypeInfo)); + tyinfo = pg_malloc_array(TypeInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6336,7 +6336,7 @@ getTypes(Archive *fout) (tyinfo[i].typtype == TYPTYPE_BASE || tyinfo[i].typtype == TYPTYPE_RANGE)) { - stinfo = (ShellTypeInfo *) pg_malloc(sizeof(ShellTypeInfo)); + stinfo = pg_malloc_object(ShellTypeInfo); stinfo->dobj.objType = DO_SHELL_TYPE; stinfo->dobj.catId = nilCatalogId; AssignDumpId(&stinfo->dobj); @@ -6399,7 +6399,7 @@ getOperators(Archive *fout) ntups = PQntuples(res); - oprinfo = (OprInfo *) pg_malloc(ntups * sizeof(OprInfo)); + oprinfo = pg_malloc_array(OprInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6471,7 +6471,7 @@ getCollations(Archive *fout) ntups = PQntuples(res); - collinfo = (CollInfo *) pg_malloc(ntups * sizeof(CollInfo)); + collinfo = pg_malloc_array(CollInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6535,7 +6535,7 @@ getConversions(Archive *fout) ntups = PQntuples(res); - convinfo = (ConvInfo *) pg_malloc(ntups * sizeof(ConvInfo)); + convinfo = pg_malloc_array(ConvInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6608,7 +6608,7 @@ getAccessMethods(Archive *fout) ntups = PQntuples(res); - aminfo = (AccessMethodInfo *) pg_malloc(ntups * sizeof(AccessMethodInfo)); + aminfo = pg_malloc_array(AccessMethodInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6670,7 +6670,7 @@ getOpclasses(Archive *fout) ntups = PQntuples(res); - opcinfo = (OpclassInfo *) pg_malloc(ntups * sizeof(OpclassInfo)); + opcinfo = pg_malloc_array(OpclassInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6735,7 +6735,7 @@ getOpfamilies(Archive *fout) ntups = PQntuples(res); - opfinfo = (OpfamilyInfo *) pg_malloc(ntups * sizeof(OpfamilyInfo)); + opfinfo = pg_malloc_array(OpfamilyInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6853,7 +6853,7 @@ getAggregates(Archive *fout) ntups = PQntuples(res); - agginfo = (AggInfo *) pg_malloc(ntups * sizeof(AggInfo)); + agginfo = pg_malloc_array(AggInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -6886,7 +6886,7 @@ getAggregates(Archive *fout) agginfo[i].aggfn.argtypes = NULL; else { - agginfo[i].aggfn.argtypes = (Oid *) pg_malloc(agginfo[i].aggfn.nargs * sizeof(Oid)); + agginfo[i].aggfn.argtypes = pg_malloc_array(Oid, agginfo[i].aggfn.nargs); parseOidArray(PQgetvalue(res, i, i_proargtypes), agginfo[i].aggfn.argtypes, agginfo[i].aggfn.nargs); @@ -7044,7 +7044,7 @@ getFuncs(Archive *fout) ntups = PQntuples(res); - finfo = (FuncInfo *) pg_malloc0(ntups * sizeof(FuncInfo)); + finfo = pg_malloc0_array(FuncInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -7079,7 +7079,7 @@ getFuncs(Archive *fout) finfo[i].argtypes = NULL; else { - finfo[i].argtypes = (Oid *) pg_malloc(finfo[i].nargs * sizeof(Oid)); + finfo[i].argtypes = pg_malloc_array(Oid, finfo[i].nargs); parseOidArray(PQgetvalue(res, i, i_proargtypes), 
finfo[i].argtypes, finfo[i].nargs); } @@ -7121,14 +7121,14 @@ getRelationStatistics(Archive *fout, DumpableObject *rel, int32 relpages, (relkind == RELKIND_MATVIEW || relkind == RELKIND_FOREIGN_TABLE)) { - RelStatsInfo *info = pg_malloc0(sizeof(RelStatsInfo)); + RelStatsInfo *info = pg_malloc0_object(RelStatsInfo); DumpableObject *dobj = &info->dobj; dobj->objType = DO_REL_STATS; dobj->catId.tableoid = 0; dobj->catId.oid = 0; AssignDumpId(dobj); - dobj->dependencies = (DumpId *) pg_malloc(sizeof(DumpId)); + dobj->dependencies = pg_malloc_object(DumpId); dobj->dependencies[0] = rel->dumpId; dobj->nDeps = 1; dobj->allocDeps = 1; @@ -7413,7 +7413,7 @@ getTables(Archive *fout, int *numTables) * only one, because we don't yet know which tables might be inheritance * ancestors of the target table. */ - tblinfo = (TableInfo *) pg_malloc0(ntups * sizeof(TableInfo)); + tblinfo = pg_malloc0_array(TableInfo, ntups); i_reltableoid = PQfnumber(res, "tableoid"); i_reloid = PQfnumber(res, "oid"); @@ -7745,7 +7745,7 @@ getInherits(Archive *fout, int *numInherits) *numInherits = ntups; - inhinfo = (InhInfo *) pg_malloc(ntups * sizeof(InhInfo)); + inhinfo = pg_malloc_array(InhInfo, ntups); i_inhrelid = PQfnumber(res, "inhrelid"); i_inhparent = PQfnumber(res, "inhparent"); @@ -8057,7 +8057,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_indstatcols = PQfnumber(res, "indstatcols"); i_indstatvals = PQfnumber(res, "indstatvals"); - indxinfo = (IndxInfo *) pg_malloc(ntups * sizeof(IndxInfo)); + indxinfo = pg_malloc_array(IndxInfo, ntups); /* * Outer loop iterates once per table, not once per row. Incrementing of @@ -8123,7 +8123,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) indxinfo[j].indreloptions = pg_strdup(PQgetvalue(res, j, i_indreloptions)); indxinfo[j].indstatcols = pg_strdup(PQgetvalue(res, j, i_indstatcols)); indxinfo[j].indstatvals = pg_strdup(PQgetvalue(res, j, i_indstatvals)); - indxinfo[j].indkeys = (Oid *) pg_malloc(indxinfo[j].indnattrs * sizeof(Oid)); + indxinfo[j].indkeys = pg_malloc_array(Oid, indxinfo[j].indnattrs); parseOidArray(PQgetvalue(res, j, i_indkey), indxinfo[j].indkeys, indxinfo[j].indnattrs); indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't'); @@ -8161,7 +8161,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) */ ConstraintInfo *constrinfo; - constrinfo = (ConstraintInfo *) pg_malloc(sizeof(ConstraintInfo)); + constrinfo = pg_malloc_object(ConstraintInfo); constrinfo->dobj.objType = DO_CONSTRAINT; constrinfo->dobj.catId.tableoid = atooid(PQgetvalue(res, j, i_contableoid)); constrinfo->dobj.catId.oid = atooid(PQgetvalue(res, j, i_conoid)); @@ -8252,7 +8252,7 @@ getExtendedStatistics(Archive *fout) i_stxrelid = PQfnumber(res, "stxrelid"); i_stattarget = PQfnumber(res, "stxstattarget"); - statsextinfo = (StatsExtInfo *) pg_malloc(ntups * sizeof(StatsExtInfo)); + statsextinfo = pg_malloc_array(StatsExtInfo, ntups); for (i = 0; i < ntups; i++) { @@ -8363,7 +8363,7 @@ getConstraints(Archive *fout, TableInfo tblinfo[], int numTables) i_conindid = PQfnumber(res, "conindid"); i_condef = PQfnumber(res, "condef"); - constrinfo = (ConstraintInfo *) pg_malloc(ntups * sizeof(ConstraintInfo)); + constrinfo = pg_malloc_array(ConstraintInfo, ntups); curtblindx = -1; for (int j = 0; j < ntups; j++) @@ -8524,7 +8524,7 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) i_convalidated = PQfnumber(res, "convalidated"); i_contype = PQfnumber(res, "contype"); - constrinfo = (ConstraintInfo *) pg_malloc(ntups * 
sizeof(ConstraintInfo)); + constrinfo = pg_malloc_array(ConstraintInfo, ntups); tyinfo->domChecks = constrinfo; /* 'i' tracks result rows; 'j' counts CHECK constraints */ @@ -8612,7 +8612,7 @@ getRules(Archive *fout) ntups = PQntuples(res); - ruleinfo = (RuleInfo *) pg_malloc(ntups * sizeof(RuleInfo)); + ruleinfo = pg_malloc_array(RuleInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -8818,7 +8818,7 @@ getTriggers(Archive *fout, TableInfo tblinfo[], int numTables) i_tgispartition = PQfnumber(res, "tgispartition"); i_tgdef = PQfnumber(res, "tgdef"); - tginfo = (TriggerInfo *) pg_malloc(ntups * sizeof(TriggerInfo)); + tginfo = pg_malloc_array(TriggerInfo, ntups); /* * Outer loop iterates once per table, not once per row. Incrementing of @@ -8915,7 +8915,7 @@ getEventTriggers(Archive *fout) ntups = PQntuples(res); - evtinfo = (EventTriggerInfo *) pg_malloc(ntups * sizeof(EventTriggerInfo)); + evtinfo = pg_malloc_array(EventTriggerInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -8989,7 +8989,7 @@ getProcLangs(Archive *fout) ntups = PQntuples(res); - planginfo = (ProcLangInfo *) pg_malloc(ntups * sizeof(ProcLangInfo)); + planginfo = pg_malloc_array(ProcLangInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -9081,7 +9081,7 @@ getCasts(Archive *fout) ntups = PQntuples(res); - castinfo = (CastInfo *) pg_malloc(ntups * sizeof(CastInfo)); + castinfo = pg_malloc_array(CastInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -9180,7 +9180,7 @@ getTransforms(Archive *fout) ntups = PQntuples(res); - transforminfo = (TransformInfo *) pg_malloc(ntups * sizeof(TransformInfo)); + transforminfo = pg_malloc_array(TransformInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -9518,28 +9518,28 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) /* Save data for this table */ tbinfo->numatts = numatts; - tbinfo->attnames = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->atttypnames = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->attstattarget = (int *) pg_malloc(numatts * sizeof(int)); - tbinfo->attstorage = (char *) pg_malloc(numatts * sizeof(char)); - tbinfo->typstorage = (char *) pg_malloc(numatts * sizeof(char)); - tbinfo->attidentity = (char *) pg_malloc(numatts * sizeof(char)); - tbinfo->attgenerated = (char *) pg_malloc(numatts * sizeof(char)); - tbinfo->attisdropped = (bool *) pg_malloc(numatts * sizeof(bool)); - tbinfo->attlen = (int *) pg_malloc(numatts * sizeof(int)); - tbinfo->attalign = (char *) pg_malloc(numatts * sizeof(char)); - tbinfo->attislocal = (bool *) pg_malloc(numatts * sizeof(bool)); - tbinfo->attoptions = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->attcollation = (Oid *) pg_malloc(numatts * sizeof(Oid)); - tbinfo->attcompression = (char *) pg_malloc(numatts * sizeof(char)); - tbinfo->attfdwoptions = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->attmissingval = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->notnull_constrs = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->notnull_comment = (char **) pg_malloc(numatts * sizeof(char *)); - tbinfo->notnull_invalid = (bool *) pg_malloc(numatts * sizeof(bool)); - tbinfo->notnull_noinh = (bool *) pg_malloc(numatts * sizeof(bool)); - tbinfo->notnull_islocal = (bool *) pg_malloc(numatts * sizeof(bool)); - tbinfo->attrdefs = (AttrDefInfo **) pg_malloc(numatts * sizeof(AttrDefInfo 
*)); + tbinfo->attnames = pg_malloc_array(char *, numatts); + tbinfo->atttypnames = pg_malloc_array(char *, numatts); + tbinfo->attstattarget = pg_malloc_array(int, numatts); + tbinfo->attstorage = pg_malloc_array(char, numatts); + tbinfo->typstorage = pg_malloc_array(char, numatts); + tbinfo->attidentity = pg_malloc_array(char, numatts); + tbinfo->attgenerated = pg_malloc_array(char, numatts); + tbinfo->attisdropped = pg_malloc_array(bool, numatts); + tbinfo->attlen = pg_malloc_array(int, numatts); + tbinfo->attalign = pg_malloc_array(char, numatts); + tbinfo->attislocal = pg_malloc_array(bool, numatts); + tbinfo->attoptions = pg_malloc_array(char *, numatts); + tbinfo->attcollation = pg_malloc_array(Oid, numatts); + tbinfo->attcompression = pg_malloc_array(char, numatts); + tbinfo->attfdwoptions = pg_malloc_array(char *, numatts); + tbinfo->attmissingval = pg_malloc_array(char *, numatts); + tbinfo->notnull_constrs = pg_malloc_array(char *, numatts); + tbinfo->notnull_comment = pg_malloc_array(char *, numatts); + tbinfo->notnull_invalid = pg_malloc_array(bool, numatts); + tbinfo->notnull_noinh = pg_malloc_array(bool, numatts); + tbinfo->notnull_islocal = pg_malloc_array(bool, numatts); + tbinfo->attrdefs = pg_malloc_array(AttrDefInfo *, numatts); hasdefaults = false; for (int j = 0; j < numatts; j++, r++) @@ -9624,7 +9624,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) res = ExecuteSqlQuery(fout, q->data, PGRES_TUPLES_OK); numDefaults = PQntuples(res); - attrdefs = (AttrDefInfo *) pg_malloc(numDefaults * sizeof(AttrDefInfo)); + attrdefs = pg_malloc_array(AttrDefInfo, numDefaults); curtblindx = -1; for (int j = 0; j < numDefaults; j++) @@ -9760,7 +9760,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) res = ExecuteSqlQuery(fout, q->data, PGRES_TUPLES_OK); numConstrs = PQntuples(res); - constrs = (ConstraintInfo *) pg_malloc(numConstrs * sizeof(ConstraintInfo)); + constrs = pg_malloc_array(ConstraintInfo, numConstrs); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -9859,7 +9859,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) res = ExecuteSqlQuery(fout, q->data, PGRES_TUPLES_OK); numConstrs = PQntuples(res); - constrs = (ConstraintInfo *) pg_malloc(numConstrs * sizeof(ConstraintInfo)); + constrs = pg_malloc_array(ConstraintInfo, numConstrs); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10187,7 +10187,7 @@ getTSParsers(Archive *fout) ntups = PQntuples(res); - prsinfo = (TSParserInfo *) pg_malloc(ntups * sizeof(TSParserInfo)); + prsinfo = pg_malloc_array(TSParserInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10254,7 +10254,7 @@ getTSDictionaries(Archive *fout) ntups = PQntuples(res); - dictinfo = (TSDictInfo *) pg_malloc(ntups * sizeof(TSDictInfo)); + dictinfo = pg_malloc_array(TSDictInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10318,7 +10318,7 @@ getTSTemplates(Archive *fout) ntups = PQntuples(res); - tmplinfo = (TSTemplateInfo *) pg_malloc(ntups * sizeof(TSTemplateInfo)); + tmplinfo = pg_malloc_array(TSTemplateInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10377,7 +10377,7 @@ getTSConfigurations(Archive *fout) ntups = PQntuples(res); - cfginfo = (TSConfigInfo *) pg_malloc(ntups * sizeof(TSConfigInfo)); + cfginfo = pg_malloc_array(TSConfigInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10449,7 +10449,7 
@@ getForeignDataWrappers(Archive *fout) ntups = PQntuples(res); - fdwinfo = (FdwInfo *) pg_malloc(ntups * sizeof(FdwInfo)); + fdwinfo = pg_malloc_array(FdwInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10532,7 +10532,7 @@ getForeignServers(Archive *fout) ntups = PQntuples(res); - srvinfo = (ForeignServerInfo *) pg_malloc(ntups * sizeof(ForeignServerInfo)); + srvinfo = pg_malloc_array(ForeignServerInfo, ntups); i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); @@ -10630,7 +10630,7 @@ getDefaultACLs(Archive *fout) ntups = PQntuples(res); - daclinfo = (DefaultACLInfo *) pg_malloc(ntups * sizeof(DefaultACLInfo)); + daclinfo = pg_malloc_array(DefaultACLInfo, ntups); i_oid = PQfnumber(res, "oid"); i_tableoid = PQfnumber(res, "tableoid"); @@ -10729,7 +10729,7 @@ collectRoleNames(Archive *fout) nrolenames = PQntuples(res); - rolenames = (RoleNameItem *) pg_malloc(nrolenames * sizeof(RoleNameItem)); + rolenames = pg_malloc_array(RoleNameItem, nrolenames); for (i = 0; i < nrolenames; i++) { @@ -11606,7 +11606,7 @@ collectComments(Archive *fout) ntups = PQntuples(res); - comments = (CommentItem *) pg_malloc(ntups * sizeof(CommentItem)); + comments = pg_malloc_array(CommentItem, ntups); ncomments = 0; dobj = NULL; @@ -13683,7 +13683,7 @@ dumpFunc(Archive *fout, const FuncInfo *finfo) if (*protrftypes) { - Oid *typeids = pg_malloc(FUNC_MAX_ARGS * sizeof(Oid)); + Oid *typeids = pg_malloc_array(Oid, FUNC_MAX_ARGS); int i; appendPQExpBufferStr(q, " TRANSFORM "); @@ -16810,7 +16810,7 @@ collectSecLabels(Archive *fout) ntups = PQntuples(res); - seclabels = (SecLabelItem *) pg_malloc(ntups * sizeof(SecLabelItem)); + seclabels = pg_malloc_array(SecLabelItem, ntups); nseclabels = 0; dobj = NULL; @@ -19178,7 +19178,7 @@ collectSequences(Archive *fout) res = ExecuteSqlQuery(fout, query, PGRES_TUPLES_OK); nsequences = PQntuples(res); - sequences = (SequenceItem *) pg_malloc(nsequences * sizeof(SequenceItem)); + sequences = pg_malloc_array(SequenceItem, nsequences); for (int i = 0; i < nsequences; i++) { @@ -19256,7 +19256,7 @@ dumpSequence(Archive *fout, const TableInfo *tbinfo) PQntuples(res)), tbinfo->dobj.name, PQntuples(res)); - seq = pg_malloc0(sizeof(SequenceItem)); + seq = pg_malloc0_object(SequenceItem); seq->seqtype = parse_sequence_type(PQgetvalue(res, 0, 0)); seq->startv = strtoi64(PQgetvalue(res, 0, 1), NULL, 10); seq->incby = strtoi64(PQgetvalue(res, 0, 2), NULL, 10); @@ -20360,7 +20360,7 @@ createBoundaryObjects(void) { DumpableObject *dobjs; - dobjs = (DumpableObject *) pg_malloc(2 * sizeof(DumpableObject)); + dobjs = pg_malloc_array(DumpableObject, 2); dobjs[0].objType = DO_PRE_DATA_BOUNDARY; dobjs[0].catId = nilCatalogId; @@ -20534,7 +20534,7 @@ BuildArchiveDependencies(Archive *fout) continue; /* Set up work array */ allocDeps = 64; - dependencies = (DumpId *) pg_malloc(allocDeps * sizeof(DumpId)); + dependencies = pg_malloc_array(DumpId, allocDeps); nDeps = 0; /* Recursively find all dumpable dependencies */ findDumpableDependencies(AH, dobj, @@ -20542,8 +20542,7 @@ BuildArchiveDependencies(Archive *fout) /* And save 'em ... 
*/ if (nDeps > 0) { - dependencies = (DumpId *) pg_realloc(dependencies, - nDeps * sizeof(DumpId)); + dependencies = pg_realloc_array(dependencies, DumpId, nDeps); te->dependencies = dependencies; te->nDeps = nDeps; } @@ -20577,8 +20576,7 @@ findDumpableDependencies(ArchiveHandle *AH, const DumpableObject *dobj, if (*nDeps >= *allocDeps) { *allocDeps *= 2; - *dependencies = (DumpId *) pg_realloc(*dependencies, - *allocDeps * sizeof(DumpId)); + *dependencies = pg_realloc_array(*dependencies, DumpId, *allocDeps); } (*dependencies)[*nDeps] = depid; (*nDeps)++; diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 24bed6681d..03e5c1c111 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -572,7 +572,7 @@ sortDumpableObjects(DumpableObject **objs, int numObjs, preDataBoundId = preBoundaryId; postDataBoundId = postBoundaryId; - ordering = (DumpableObject **) pg_malloc(numObjs * sizeof(DumpableObject *)); + ordering = pg_malloc_array(DumpableObject *, numObjs); while (!TopoSort(objs, numObjs, ordering, &nOrdering)) findDependencyLoops(ordering, nOrdering, numObjs); @@ -651,8 +651,8 @@ TopoSort(DumpableObject **objs, * We also make a map showing the input-order index of the item with * dumpId j. */ - beforeConstraints = (int *) pg_malloc0((maxDumpId + 1) * sizeof(int)); - idMap = (int *) pg_malloc((maxDumpId + 1) * sizeof(int)); + beforeConstraints = pg_malloc0_array(int, (maxDumpId + 1)); + idMap = pg_malloc_array(int, (maxDumpId + 1)); for (i = 0; i < numObjs; i++) { obj = objs[i]; @@ -787,9 +787,9 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs) bool fixedloop; int i; - processed = (bool *) pg_malloc0((getMaxDumpId() + 1) * sizeof(bool)); - searchFailed = (DumpId *) pg_malloc0((getMaxDumpId() + 1) * sizeof(DumpId)); - workspace = (DumpableObject **) pg_malloc(totObjs * sizeof(DumpableObject *)); + processed = pg_malloc0_array(bool, (getMaxDumpId() + 1)); + searchFailed = pg_malloc0_array(DumpId, (getMaxDumpId() + 1)); + workspace = pg_malloc_array(DumpableObject *, totObjs); fixedloop = false; for (i = 0; i < nObjs; i++) diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 30fecd0c25..98389d2034 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -1140,7 +1140,7 @@ dumpRoleMembership(PGconn *conn) } remaining = end - start; - done = pg_malloc0(remaining * sizeof(bool)); + done = pg_malloc0_array(bool, remaining); ht = rolename_create(remaining, NULL); /* From 4ec0e75afd488d41482ae10f6b5df9fa9a586dee Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Fri, 13 Feb 2026 11:50:14 +0100 Subject: [PATCH 102/147] meson: Add target for generating docs images This adds an 'images' target to the meson build system in order to be able to regenerate the images used in the docs. 
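With the target in place, the images can be regenerated from a configured build tree with, for example (assuming a build directory named "build"; any build directory works the same way):

    meson compile -C build images

Since alias_target() registers 'images' as a top-level target, running "ninja images" from inside the build directory is equivalent.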
Author: Nazir Bilal Yavuz Reviewed-by: Daniel Gustafsson Reported-by: Daniel Gustafsson Discussion: https://postgr.es/m/CAN55FZ0c0Tcjx9=e-YibWGHa1-xmdV63p=THH4YYznz+pYcfig@mail.gmail.com --- doc/src/sgml/images/meson.build | 61 +++++++++++++++++++++++++++++++++ doc/src/sgml/meson.build | 2 ++ meson.build | 2 ++ 3 files changed, 65 insertions(+) create mode 100644 doc/src/sgml/images/meson.build

diff --git a/doc/src/sgml/images/meson.build b/doc/src/sgml/images/meson.build
new file mode 100644
index 0000000000..8e601e877a
--- /dev/null
+++ b/doc/src/sgml/images/meson.build
@@ -0,0 +1,61 @@
+# doc/src/sgml/images/meson.build
+#
+# see README in this directory about image handling
+
+if not xsltproc_bin.found() or not dot.found() or not ditaa.found()
+  subdir_done()
+endif
+
+image_targets = []
+
+fixup_svg_xsl = files('fixup-svg.xsl')
+
+all_files = [
+  'genetic-algorithm.gv',
+  'gin.gv',
+  'pagelayout.txt',
+  'temporal-entities.txt',
+  'temporal-references.txt',
+]
+
+foreach file : all_files
+
+  str_split = file.split('.')
+  actual_file_name = str_split[0]
+  extension = str_split[1]
+  cur_file = files(file)
+  tmp_name = '@0@.svg.tmp'.format(file)
+  output_name = '@0@.svg'.format(actual_file_name)
+
+  command = []
+  if extension == 'gv'
+    command = [dot, '-T', 'svg', '-o', '@OUTPUT@', '@INPUT@']
+  elif extension == 'txt'
+    command = [ditaa, '-E', '-S', '--svg', '@INPUT@', '@OUTPUT@']
+  else
+    error('Unknown extension: ".@0@" while generating images'.format(extension))
+  endif
+
+  svg_tmp = custom_target(tmp_name,
+    input: cur_file,
+    output: tmp_name,
+    command: command,
+  )
+
+  current_svg = custom_target(output_name,
+    input: svg_tmp,
+    output: output_name,
+    command: [xsltproc_bin,
+              '--nonet',
+              # Use --novalid to avoid loading SVG DTD if a file specifies it, since
+              # it might not be available locally, and we don't need it.
+              '--novalid',
+              '-o', '@OUTPUT@',
+              fixup_svg_xsl,
+              '@INPUT@']
+  )
+
+  image_targets += current_svg
+endforeach
+
+alias_target('images', image_targets)
diff --git a/doc/src/sgml/meson.build b/doc/src/sgml/meson.build
index d8f40a0b16..a1ae5c54ed 100644
--- a/doc/src/sgml/meson.build
+++ b/doc/src/sgml/meson.build
@@ -1,5 +1,7 @@
 # Copyright (c) 2022-2026, PostgreSQL Global Development Group

+subdir('images')
+
 docs = []
 installdocs = []
 alldocs = []
diff --git a/meson.build b/meson.build
index 96b3869df8..f6d5842d85 100644
--- a/meson.build
+++ b/meson.build
@@ -355,6 +355,8 @@ cp = find_program('cp', required: false, native: true)
 xmllint_bin = find_program(get_option('XMLLINT'), native: true, required: false)
 xsltproc_bin = find_program(get_option('XSLTPROC'), native: true, required: false)
 nm = find_program('nm', required: false, native: true)
+ditaa = find_program('ditaa', native: true, required: false)
+dot = find_program('dot', native: true, required: false)

 bison_flags = []
 if bison.found()

From 4469fe176118607f1a1dbcbbe5de57f9156e293f Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson
Date: Fri, 13 Feb 2026 11:50:17 +0100
Subject: [PATCH 103/147] doc: Update docs images README with required ditaa version

The URL for Ditaa linked to the old Sourceforge version, which is too old for what we need; the fork over on GitHub is the correct version to use for re-generating the SVG files for the docs. The required Ditaa version is 0.11.0, as that is when SVG support was added.
Running the version found on Sourceforge produces the error below:

    $ ditaa -E -S --svg in.txt out.txt
    Unrecognized option: --svg
    usage: ditaa  [OUTFILE] [-A] [-b ] [-d] [-E] [-e ] [-h] [--help] [-o] [-r] [-S] [-s ] [-T] [-t ] [-v] [-W]

While there, also mention that meson rules exist for building images.

Author: Nazir Bilal Yavuz Reviewed-by: Daniel Gustafsson Reviewed-by: Paul A Jungwirth Discussion: https://postgr.es/m/CAN55FZ2O-23xERF2NYcvv9DM_1c9T16y6mi3vyP=O1iuXS0ASA@mail.gmail.com --- doc/src/sgml/images/README | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/images/README b/doc/src/sgml/images/README
index 07c4580255..93b75485c4 100644
--- a/doc/src/sgml/images/README
+++ b/doc/src/sgml/images/README
@@ -13,14 +13,14 @@ involve diffable source files. These tools are acceptable:

 - Graphviz (https://graphviz.org/)
-- Ditaa (http://ditaa.sourceforge.net/)
+- Ditaa v0.11.0 or later (https://github.com/stathissideris/ditaa)

 We use SVG as the format for integrating the image into the ultimate
 output formats of the documentation, that is, HTML, PDF, and others.
 Therefore, any tool used needs to be able to produce SVG.

-This directory contains makefile rules to build SVG from common input
-formats, using some common styling.
+This directory contains makefile and meson rules to build SVG from common
+input formats, using some common styling.

 fixup-svg.xsl applies some postprocessing to the SVG files produced
 by those external tools to address assorted issues. See comments in

From aa082bed0b6433b58815683dde425bce57ed931c Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson
Date: Fri, 13 Feb 2026 12:12:11 +0100
Subject: [PATCH 104/147] doc: Mention PASSING support for jsonpath variables

Commit dfd79e2d added a TODO comment to update this paragraph when support for PASSING was added. Commit 6185c9737cf added PASSING but missed resolving this TODO. Fix by expanding the paragraph with a reference to PASSING.

Author: Aditya Gollamudi Reviewed-by: Daniel Gustafsson Discussion: https://postgr.es/m/20260117051406.sx6pss4ryirn2x4v@pgs --- doc/src/sgml/json.sgml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/json.sgml b/doc/src/sgml/json.sgml
index 206eadb8f7..8a2aad5935 100644
--- a/doc/src/sgml/json.sgml
+++ b/doc/src/sgml/json.sgml
@@ -882,9 +882,10 @@ UPDATE table_name SET jsonb_field[1]['a'] = '1';
 $varname
 A named variable. Its value can be set by the parameter
- vars of several JSON processing functions;
- see for details.
-
+ vars of several JSON processing functions
+ (see ), or by
+ using the SQL/JSON PASSING clause as described
+ in .

From ef3c3cf6d021ff9884c513afd850a9fe85cad736 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Sat, 14 Feb 2026 13:50:06 +0700
Subject: [PATCH 105/147] Perform radix sort on SortTuples with pass-by-value Datums
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Radix sort can be much faster than quicksort, but for our purposes it is limited to sequences of unsigned bytes. To make tuples with other types amenable to this technique, several features of tuple comparison must be accounted for, i.e. the sort key must be "normalized":

1. Signedness -- It's possible to modify a signed integer such that it can be compared as unsigned. For example, a signed char has range -128 to 127. If we cast that to unsigned char and add 128, the range of values becomes 0 to 255 while preserving order.

2. Direction -- SQL allows specification of ASC or DESC.
The descending case is easily handled by taking the complement of the unsigned representation.

3. NULL values -- NULLS FIRST and NULLS LAST must work correctly.

This commit only handles the case where datum1 is a pass-by-value Datum (possibly abbreviated) that compares like an ordinary integer. (Abbreviations of values of type "numeric" are a convenient counterexample.) First, tuples are partitioned by nullness in the correct NULL ordering. Then the NOT NULL tuples are sorted with radix sort on datum1. For tiebreaks on subsequent sortkeys (including the first sort key if abbreviated), we divert to the usual qsort.

ORDER BY queries on pre-warmed buffers are up to 2x faster on high cardinality inputs with radix sort than the sort specializations added by commit 697492434, so get rid of them. It's sufficient to fall back to qsort_tuple() for small arrays. Moderately low cardinality inputs show more modest improvements. Our qsort is strongly optimized for very low cardinality inputs, but radix sort is usually equal or very close in those cases.

The changes to the regression tests are caused by under-specified sort orders, e.g. "SELECT a, b from mytable order by a;". For unstable sorts, such as our qsort and this in-place radix sort, there is no guarantee of the order of "b" within each group of "a".

The implementation is taken from ska_byte_sort() (Boost licensed), which is similar to American flag sort (an in-place radix sort) with modifications to make it better suited for modern pipelined CPUs.

The technique of normalization described above can also be extended to the case of multiple keys. That is left for future work (thanks to Peter Geoghegan for the suggestion to look into this area).

Reviewed-by: Chengpeng Yan Reviewed-by: zengman Reviewed-by: ChangAo Chen Reviewed-by: Álvaro Herrera Reviewed-by: Chao Li (earlier version) Discussion: https://postgr.es/m/CANWCAZYzx7a7E9AY16Jt_U3+GVKDADfgApZ-42SYNiig8dTnFA@mail.gmail.com --- src/backend/utils/sort/tuplesort.c | 561 ++++++++++++++++++------ src/include/utils/sortsupport.h | 101 ----- src/include/utils/tuplesort.h | 1 + src/test/regress/expected/tuplesort.out | 6 +- src/tools/pgindent/typedefs.list | 1 + 5 files changed, 430 insertions(+), 240 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 1edcad89c8..1fc440ea6c 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -7,8 +7,8 @@
 * applied to different kinds of sortable objects. Implementation of
 * the particular sorting variants is given in tuplesortvariants.c.
 * This module works efficiently for both small and large amounts
- * of data. Small amounts are sorted in-memory using qsort(). Large
- * amounts are sorted using temporary files and a standard external sort
+ * of data. Small amounts are sorted in-memory. Large amounts are
+ * sorted using temporary files and a standard external sort
 * algorithm.
 *
 * See Knuth, volume 3, for more than you want to know about external
@@ -26,16 +26,16 @@
 * Historically, we divided the input into sorted runs using replacement
 * selection, in the form of a priority tree implemented as a heap
 * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort
- * for run generation.
+ * or radix sort for run generation.
 *
 * The approximate amount of memory allowed for any one sort operation
 * is specified in kilobytes by the caller (most pass work_mem).
Initially, * we absorb tuples and simply store them in an unsorted array as long as * we haven't exceeded workMem. If we reach the end of the input without - * exceeding workMem, we sort the array using qsort() and subsequently return + * exceeding workMem, we sort the array in memory and subsequently return * tuples just by scanning the tuple array sequentially. If we do exceed * workMem, we begin to emit tuples into sorted runs in temporary tapes. - * When tuples are dumped in batch after quicksorting, we begin a new run + * When tuples are dumped in batch after in-memory sorting, we begin a new run * with a new output tape. If we reach the max number of tapes, we write * subsequent runs on the existing tapes in a round-robin fashion. We will * need multiple merge passes to finish the merge in that case. After the @@ -476,121 +476,15 @@ static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); static void tuplesort_free(Tuplesortstate *state); static void tuplesort_updatemax(Tuplesortstate *state); -/* - * Specialized comparators that we can inline into specialized sorts. The goal - * is to try to sort two tuples without having to follow the pointers to the - * comparator or the tuple. - * - * XXX: For now, there is no specialization for cases where datum1 is - * authoritative and we don't even need to fall back to a callback at all (that - * would be true for types like int4/int8/timestamp/date, but not true for - * abbreviations of text or multi-key sorts. There could be! Is it worth it? - */ - -/* Used if first key's comparator is ssup_datum_unsigned_cmp */ -static pg_attribute_always_inline int -qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) -{ - int compare; - - compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - &state->base.sortKeys[0]); - if (compare != 0) - return compare; - - /* - * No need to waste effort calling the tiebreak function when there are no - * other keys to sort on. - */ - if (state->base.onlyKey != NULL) - return 0; - - return state->base.comparetup_tiebreak(a, b, state); -} - -/* Used if first key's comparator is ssup_datum_signed_cmp */ -static pg_attribute_always_inline int -qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) -{ - int compare; - - compare = ApplySignedSortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - &state->base.sortKeys[0]); - - if (compare != 0) - return compare; - - /* - * No need to waste effort calling the tiebreak function when there are no - * other keys to sort on. - */ - if (state->base.onlyKey != NULL) - return 0; - - return state->base.comparetup_tiebreak(a, b, state); -} - -/* Used if first key's comparator is ssup_datum_int32_cmp */ -static pg_attribute_always_inline int -qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) -{ - int compare; - - compare = ApplyInt32SortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - &state->base.sortKeys[0]); - - if (compare != 0) - return compare; - - /* - * No need to waste effort calling the tiebreak function when there are no - * other keys to sort on. - */ - if (state->base.onlyKey != NULL) - return 0; - - return state->base.comparetup_tiebreak(a, b, state); -} /* * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts * any variant of SortTuples, using the appropriate comparetup function. 
* qsort_ssup() is specialized for the case where the comparetup function * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts - * and Datum sorts. qsort_tuple_{unsigned,signed,int32} are specialized for - * common comparison functions on pass-by-value leading datums. + * and Datum sorts. */ -#define ST_SORT qsort_tuple_unsigned -#define ST_ELEMENT_TYPE SortTuple -#define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state) -#define ST_COMPARE_ARG_TYPE Tuplesortstate -#define ST_CHECK_FOR_INTERRUPTS -#define ST_SCOPE static -#define ST_DEFINE -#include "lib/sort_template.h" - -#define ST_SORT qsort_tuple_signed -#define ST_ELEMENT_TYPE SortTuple -#define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state) -#define ST_COMPARE_ARG_TYPE Tuplesortstate -#define ST_CHECK_FOR_INTERRUPTS -#define ST_SCOPE static -#define ST_DEFINE -#include "lib/sort_template.h" - -#define ST_SORT qsort_tuple_int32 -#define ST_ELEMENT_TYPE SortTuple -#define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state) -#define ST_COMPARE_ARG_TYPE Tuplesortstate -#define ST_CHECK_FOR_INTERRUPTS -#define ST_SCOPE static -#define ST_DEFINE -#include "lib/sort_template.h" - #define ST_SORT qsort_tuple #define ST_ELEMENT_TYPE SortTuple #define ST_COMPARE_RUNTIME_POINTER @@ -612,6 +506,23 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) #define ST_DEFINE #include "lib/sort_template.h" +/* state for radix sort */ +typedef struct RadixSortInfo +{ + union + { + size_t count; + size_t offset; + }; + size_t next_offset; +} RadixSortInfo; + +/* + * Threshold below which qsort_tuple() is generally faster than a radix sort. + */ +#define QSORT_THRESHOLD 40 + + /* * tuplesort_begin_xxx * @@ -1363,7 +1274,7 @@ tuplesort_performsort(Tuplesortstate *state) */ if (SERIAL(state)) { - /* Just qsort 'em and we're done */ + /* Sort in memory and we're done */ tuplesort_sort_memtuples(state); state->status = TSS_SORTEDINMEM; } @@ -2337,7 +2248,7 @@ dumptuples(Tuplesortstate *state, bool alltuples) /* * Sort all tuples accumulated within the allowed amount of memory for - * this run using quicksort + * this run. */ tuplesort_sort_memtuples(state); @@ -2652,10 +2563,396 @@ sort_bounded_heap(Tuplesortstate *state) state->boundUsed = true; } + +/* radix sort routines */ + +/* + * Retrieve byte from datum, indexed by 'level': 0 for MSB, 7 for LSB + */ +static inline uint8 +current_byte(Datum key, int level) +{ + int shift = (sizeof(Datum) - 1 - level) * BITS_PER_BYTE; + + return (key >> shift) & 0xFF; +} + /* - * Sort all memtuples using specialized qsort() routines. + * Normalize datum such that unsigned comparison is order-preserving, + * taking ASC/DESC into account as well. + */ +static inline Datum +normalize_datum(Datum orig, SortSupport ssup) +{ + Datum norm_datum1; + + if (ssup->comparator == ssup_datum_signed_cmp) + { + norm_datum1 = orig + ((uint64) PG_INT64_MAX) + 1; + } + else if (ssup->comparator == ssup_datum_int32_cmp) + { + /* + * First truncate to uint32. Technically, we don't need to do this, + * but it forces the upper half of the datum to be zero regardless of + * sign. 
+ */ + uint32 u32 = DatumGetUInt32(orig) + ((uint32) PG_INT32_MAX) + 1; + + norm_datum1 = UInt32GetDatum(u32); + } + else + { + Assert(ssup->comparator == ssup_datum_unsigned_cmp); + norm_datum1 = orig; + } + + if (ssup->ssup_reverse) + norm_datum1 = ~norm_datum1; + + return norm_datum1; +} + +/* + * radix_sort_recursive + * + * Radix sort by (pass-by-value) datum1, diverting to qsort_tuple() + * for tiebreaks. + * + * This is a modification of + * ska_byte_sort() from https://github.com/skarupke/ska_sort + * The original copyright notice follows: + * + * Copyright Malte Skarupke 2016. + * Distributed under the Boost Software License, Version 1.0. + * + * Boost Software License - Version 1.0 - August 17th, 2003 + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, + * execute, and transmit the Software, and to prepare derivative works of the + * Software, and to permit third-parties to whom the Software is furnished to + * do so, all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, + * must be included in all copies of the Software, in whole or in part, and + * all derivative works of the Software, unless such copies or derivative + * works are solely in the form of machine-executable object code generated by + * a source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +static void +radix_sort_recursive(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *state) +{ + RadixSortInfo partitions[256] = {0}; + uint8 remaining_partitions[256]; + size_t total = 0; + int num_partitions = 0; + int num_remaining; + SortSupport ssup = &state->base.sortKeys[0]; + size_t start_offset = 0; + SortTuple *partition_begin = begin; + + /* count number of occurrences of each byte */ + for (SortTuple *st = begin; st < begin + n_elems; st++) + { + uint8 this_partition; + + /* extract the byte for this level from the normalized datum */ + this_partition = current_byte(normalize_datum(st->datum1, ssup), + level); + + /* save it for the permutation step */ + st->curbyte = this_partition; + + partitions[this_partition].count++; + + CHECK_FOR_INTERRUPTS(); + } + + /* compute partition offsets */ + for (int i = 0; i < 256; i++) + { + size_t count = partitions[i].count; + + if (count != 0) + { + partitions[i].offset = total; + total += count; + remaining_partitions[num_partitions] = i; + num_partitions++; + } + partitions[i].next_offset = total; + } + + /* + * Swap tuples to correct partition. + * + * In traditional American flag sort, a swap sends the current element to + * the correct partition, but the array pointer only advances if the + * partner of the swap happens to be an element that belongs in the + * current partition. 
That only requires one pass through the array, but + * the disadvantage is we don't know if the pointer can advance until the + * swap completes. Here lies the most interesting innovation from the + * upstream ska_byte_sort: After initiating the swap, we immediately + * proceed to the next element. This makes better use of CPU pipelining, + * but also means that we will often need multiple iterations of this + * loop. ska_byte_sort() maintains a separate list of which partitions + * haven't finished, which is updated every loop iteration. Here we simply + * check each partition during every iteration. + * + * If we started with a single partition, there is nothing to do. If a + * previous loop iteration results in only one partition that hasn't been + * counted as sorted, we know it's actually sorted and can exit the loop. + */ + num_remaining = num_partitions; + while (num_remaining > 1) + { + /* start the count over */ + num_remaining = num_partitions; + + for (int i = 0; i < num_partitions; i++) + { + uint8 idx = remaining_partitions[i]; + + for (SortTuple *st = begin + partitions[idx].offset; + st < begin + partitions[idx].next_offset; + st++) + { + size_t offset = partitions[st->curbyte].offset++; + SortTuple tmp; + + /* swap current tuple with destination position */ + Assert(offset < n_elems); + tmp = *st; + *st = begin[offset]; + begin[offset] = tmp; + + CHECK_FOR_INTERRUPTS(); + }; + + /* Is this partition sorted? */ + if (partitions[idx].offset == partitions[idx].next_offset) + num_remaining--; + } + } + + /* recurse */ + for (uint8 *rp = remaining_partitions; + rp < remaining_partitions + num_partitions; + rp++) + { + size_t end_offset = partitions[*rp].next_offset; + SortTuple *partition_end = begin + end_offset; + size_t num_elements = end_offset - start_offset; + + if (num_elements > 1) + { + if (level < sizeof(Datum) - 1) + { + if (num_elements < QSORT_THRESHOLD) + { + qsort_tuple(partition_begin, + num_elements, + state->base.comparetup, + state); + } + else + { + radix_sort_recursive(partition_begin, + num_elements, + level + 1, + state); + } + } + else if (state->base.onlyKey == NULL) + { + /* + * We've finished radix sort on all bytes of the pass-by-value + * datum (possibly abbreviated), now sort using the tiebreak + * comparator. + */ + qsort_tuple(partition_begin, + num_elements, + state->base.comparetup_tiebreak, + state); + } + } + + start_offset = end_offset; + partition_begin = partition_end; + } +} + +/* + * Entry point for radix_sort_recursive * - * Quicksort is used for small in-memory sorts, and external sort runs. + * Partition tuples by isnull1, then sort both partitions, using + * radix sort on the NOT NULL partition if it's large enough. + */ +static void +radix_sort_tuple(SortTuple *data, size_t n, Tuplesortstate *state) +{ + bool nulls_first = state->base.sortKeys[0].ssup_nulls_first; + SortTuple *null_start; + SortTuple *not_null_start; + size_t d1 = 0, + d2, + null_count, + not_null_count; + + /* + * Find the first NOT NULL if NULLS FIRST, or first NULL if NULLS LAST. + * This also serves as a quick check for the common case where all tuples + * are NOT NULL in the first sort key. 
+ */ + while (d1 < n && data[d1].isnull1 == nulls_first) + { + d1++; + CHECK_FOR_INTERRUPTS(); + } + + /* + * If we have more than one tuple left after the quick check, partition + * the remainder using branchless cyclic permutation, based on + * https://orlp.net/blog/branchless-lomuto-partitioning/ + */ + Assert(n > 0); + if (d1 < n - 1) + { + size_t i = d1, + j = d1; + SortTuple tmp = data[d1]; /* create gap at front */ + + while (j < n - 1) + { + /* gap is at j, move i's element to gap */ + data[j] = data[i]; + /* advance j to the first unknown element */ + j += 1; + /* move the first unknown element back to i */ + data[i] = data[j]; + /* advance i if this element belongs in the left partition */ + i += (data[i].isnull1 == nulls_first); + + CHECK_FOR_INTERRUPTS(); + } + + /* place gap between left and right partitions */ + data[j] = data[i]; + /* restore the saved element */ + data[i] = tmp; + /* assign it to the correct partition */ + i += (data[i].isnull1 == nulls_first); + + /* d1 is now the number of elements in the left partition */ + d1 = i; + } + + d2 = n - d1; + + /* set pointers and counts for each partition */ + if (nulls_first) + { + null_start = data; + null_count = d1; + not_null_start = data + d1; + not_null_count = d2; + } + else + { + not_null_start = data; + not_null_count = d1; + null_start = data + d1; + null_count = d2; + } + + for (SortTuple *st = null_start; + st < null_start + null_count; + st++) + Assert(st->isnull1 == true); + for (SortTuple *st = not_null_start; + st < not_null_start + not_null_count; + st++) + Assert(st->isnull1 == false); + + /* + * Sort the NULL partition using tiebreak comparator, if necessary. + */ + if (state->base.onlyKey == NULL && null_count > 1) + { + qsort_tuple(null_start, + null_count, + state->base.comparetup_tiebreak, + state); + } + + /* + * Sort the NOT NULL partition, using radix sort if large enough, + * otherwise fall back to quicksort. + */ + if (not_null_count < QSORT_THRESHOLD) + { + qsort_tuple(not_null_start, + not_null_count, + state->base.comparetup, + state); + } + else + { + bool presorted = true; + + for (SortTuple *st = not_null_start + 1; + st < not_null_start + not_null_count; + st++) + { + if (COMPARETUP(state, st - 1, st) > 0) + { + presorted = false; + break; + } + + CHECK_FOR_INTERRUPTS(); + } + + if (presorted) + return; + else + { + radix_sort_recursive(not_null_start, + not_null_count, + 0, + state); + } + } +} + +/* Verify in-memory sort using standard comparator. */ +static void +verify_memtuples_sorted(Tuplesortstate *state) +{ +#ifdef USE_ASSERT_CHECKING + for (SortTuple *st = state->memtuples + 1; + st < state->memtuples + state->memtupcount; + st++) + Assert(COMPARETUP(state, st - 1, st) <= 0); +#endif +} + +/* + * Sort all memtuples using specialized routines. + * + * Quicksort or radix sort is used for small in-memory sorts, + * and external sort runs. */ static void tuplesort_sort_memtuples(Tuplesortstate *state) @@ -2665,30 +2962,22 @@ tuplesort_sort_memtuples(Tuplesortstate *state) if (state->memtupcount > 1) { /* - * Do we have the leading column's value or abbreviation in datum1, - * and is there a specialization for its comparator? + * Do we have the leading column's value or abbreviation in datum1? 
*/ if (state->base.haveDatum1 && state->base.sortKeys) { - if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp) - { - qsort_tuple_unsigned(state->memtuples, - state->memtupcount, - state); - return; - } - else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp) - { - qsort_tuple_signed(state->memtuples, - state->memtupcount, - state); - return; - } - else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp) + SortSupport ssup = &state->base.sortKeys[0]; + + /* Does it compare as an integer? */ + if (state->memtupcount >= QSORT_THRESHOLD && + (ssup->comparator == ssup_datum_unsigned_cmp || + ssup->comparator == ssup_datum_signed_cmp || + ssup->comparator == ssup_datum_int32_cmp)) { - qsort_tuple_int32(state->memtuples, - state->memtupcount, - state); + radix_sort_tuple(state->memtuples, + state->memtupcount, + state); + verify_memtuples_sorted(state); return; } } diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h index 0083756bbd..a8f8f9f026 100644 --- a/src/include/utils/sortsupport.h +++ b/src/include/utils/sortsupport.h @@ -229,107 +229,6 @@ ApplySortComparator(Datum datum1, bool isNull1, return compare; } -static inline int -ApplyUnsignedSortComparator(Datum datum1, bool isNull1, - Datum datum2, bool isNull2, - SortSupport ssup) -{ - int compare; - - if (isNull1) - { - if (isNull2) - compare = 0; /* NULL "=" NULL */ - else if (ssup->ssup_nulls_first) - compare = -1; /* NULL "<" NOT_NULL */ - else - compare = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull2) - { - if (ssup->ssup_nulls_first) - compare = 1; /* NOT_NULL ">" NULL */ - else - compare = -1; /* NOT_NULL "<" NULL */ - } - else - { - compare = datum1 < datum2 ? -1 : datum1 > datum2 ? 1 : 0; - if (ssup->ssup_reverse) - INVERT_COMPARE_RESULT(compare); - } - - return compare; -} - -static inline int -ApplySignedSortComparator(Datum datum1, bool isNull1, - Datum datum2, bool isNull2, - SortSupport ssup) -{ - int compare; - - if (isNull1) - { - if (isNull2) - compare = 0; /* NULL "=" NULL */ - else if (ssup->ssup_nulls_first) - compare = -1; /* NULL "<" NOT_NULL */ - else - compare = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull2) - { - if (ssup->ssup_nulls_first) - compare = 1; /* NOT_NULL ">" NULL */ - else - compare = -1; /* NOT_NULL "<" NULL */ - } - else - { - compare = DatumGetInt64(datum1) < DatumGetInt64(datum2) ? -1 : - DatumGetInt64(datum1) > DatumGetInt64(datum2) ? 1 : 0; - if (ssup->ssup_reverse) - INVERT_COMPARE_RESULT(compare); - } - - return compare; -} - -static inline int -ApplyInt32SortComparator(Datum datum1, bool isNull1, - Datum datum2, bool isNull2, - SortSupport ssup) -{ - int compare; - - if (isNull1) - { - if (isNull2) - compare = 0; /* NULL "=" NULL */ - else if (ssup->ssup_nulls_first) - compare = -1; /* NULL "<" NOT_NULL */ - else - compare = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull2) - { - if (ssup->ssup_nulls_first) - compare = 1; /* NOT_NULL ">" NULL */ - else - compare = -1; /* NOT_NULL "<" NULL */ - } - else - { - compare = DatumGetInt32(datum1) < DatumGetInt32(datum2) ? -1 : - DatumGetInt32(datum1) > DatumGetInt32(datum2) ? 1 : 0; - if (ssup->ssup_reverse) - INVERT_COMPARE_RESULT(compare); - } - - return compare; -} - /* * Apply a sort comparator function and return a 3-way comparison using full, * authoritative comparator. 
This takes care of handling reverse-sort and diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 5fe229e211..da68f45acf 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -116,6 +116,7 @@ typedef struct void *tuple; /* the tuple itself */ Datum datum1; /* value of first key column */ bool isnull1; /* is first key column NULL? */ + uint8 curbyte; /* chunk of datum1 for current radix sort pass */ int srctape; /* source tape number */ } SortTuple; diff --git a/src/test/regress/expected/tuplesort.out b/src/test/regress/expected/tuplesort.out index 6dd97e7427..fc1321bf44 100644 --- a/src/test/regress/expected/tuplesort.out +++ b/src/test/regress/expected/tuplesort.out @@ -304,9 +304,9 @@ FROM abbrev_abort_uuids ORDER BY ctid DESC LIMIT 5; id | abort_increasing | abort_decreasing | noabort_increasing | noabort_decreasing -------+--------------------------------------+--------------------------------------+--------------------------------------+-------------------------------------- - 0 | | | | 20002 | | | | 20003 | | | | + 0 | | | | 10009 | 00000000-0000-0000-0000-000000010008 | 00000000-0000-0000-0000-000000009992 | 00010008-0000-0000-0000-000000010008 | 00009992-0000-0000-0000-000000009992 10008 | 00000000-0000-0000-0000-000000010007 | 00000000-0000-0000-0000-000000009993 | 00010007-0000-0000-0000-000000010007 | 00009993-0000-0000-0000-000000009993 (5 rows) @@ -335,9 +335,9 @@ FROM abbrev_abort_uuids ORDER BY ctid DESC LIMIT 5; id | abort_increasing | abort_decreasing | noabort_increasing | noabort_decreasing -------+--------------------------------------+--------------------------------------+--------------------------------------+-------------------------------------- - 0 | | | | - 20003 | | | | 20002 | | | | + 20003 | | | | + 0 | | | | 9993 | 00000000-0000-0000-0000-000000009992 | 00000000-0000-0000-0000-000000010008 | 00009992-0000-0000-0000-000000009992 | 00010008-0000-0000-0000-000000010008 9994 | 00000000-0000-0000-0000-000000009993 | 00000000-0000-0000-0000-000000010007 | 00009993-0000-0000-0000-000000009993 | 00010007-0000-0000-0000-000000010007 (5 rows) diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 6e2d876a40..241945734e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4064,6 +4064,7 @@ qsort_comparator query_pathkeys_callback radius_attribute radius_packet +RadixSortInfo rangeTableEntry_used_context rank_context rbt_allocfunc From 4644f8b23bb8cd5cf3454bcd69bc28a5fd5623a9 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 14 Feb 2026 12:16:16 -0800 Subject: [PATCH 106/147] pg_mblen_range, pg_mblen_with_len: Valgrind after encoding ereport. The prior order caused spurious Valgrind errors. They're spurious because the ereport(ERROR) non-local exit discards the pointer in question. pg_mblen_cstr() ordered the checks correctly, but these other two did not. Back-patch to v14, like commit 1e7fe06c10c0a8da9dd6261a6be8d405dc17c728. 
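In outline, using the identifiers from the diff below, the corrected ordering performs the length check first; since the error path never returns, no annotated read of the buffer happens for an incomplete character:

    length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);

    if (unlikely(length > limit))
        report_invalid_encoding_db(mbstr, length, limit);   /* ereport(ERROR), no return */

    VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);           /* reached only for complete chars */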
Reviewed-by: Thomas Munro Discussion: https://postgr.es/m/20260214053821.fa.noahmisch@microsoft.com Backpatch-through: 14 --- src/backend/utils/mb/mbutils.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index a5a734839a..f3f94d4654 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -1086,15 +1086,16 @@ pg_mblen_range(const char *mbstr, const char *end) int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); Assert(end > mbstr); + + if (unlikely(mbstr + length > end)) + report_invalid_encoding_db(mbstr, length, end - mbstr); + #ifdef VALGRIND_EXPENSIVE VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr); #else VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); #endif - if (unlikely(mbstr + length > end)) - report_invalid_encoding_db(mbstr, length, end - mbstr); - return length; } @@ -1109,15 +1110,16 @@ pg_mblen_with_len(const char *mbstr, int limit) int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); Assert(limit >= 1); + + if (unlikely(length > limit)) + report_invalid_encoding_db(mbstr, length, limit); + #ifdef VALGRIND_EXPENSIVE VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit); #else VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); #endif - if (unlikely(length > limit)) - report_invalid_encoding_db(mbstr, length, limit); - return length; } From 9f4fd119b2cbb9a41ec0c19a8d6ec9b59b92c125 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 14 Feb 2026 12:16:16 -0800 Subject: [PATCH 107/147] Fix SUBSTRING() for toasted multibyte characters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1e7fe06c10c0a8da9dd6261a6be8d405dc17c728 changed pg_mbstrlen_with_len() to ereport(ERROR) if the input ends in an incomplete character. Most callers want that. text_substring() does not. It detoasts the most bytes it could possibly need to get the requested number of characters. For example, to extract up to 2 chars from UTF8, it needs to detoast 8 bytes. In a string of 3-byte UTF8 chars, 8 bytes spans 2 complete chars and 1 partial char. Fix this by replacing this pg_mbstrlen_with_len() call with a string traversal that differs by stopping upon finding as many chars as the substring could need. This also makes SUBSTRING() stop raising an encoding error if the incomplete char is past the end of the substring. This is consistent with the general philosophy of the above commit, which was to raise errors on a just-in-time basis. Before the above commit, SUBSTRING() never raised an encoding error. SUBSTRING() has long been detoasting enough for one more char than needed, because it did not distinguish exclusive and inclusive end position. For avoidance of doubt, stop detoasting extra. Back-patch to v14, like the above commit. For applications using SUBSTRING() on non-ASCII column values, consider applying this to your copy of any of the February 12, 2026 releases. 
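Concretely, in the scenario exercised by the new regression tests below, with a toasted column holding 3-byte UTF8 characters:

    CREATE TABLE toast_3b_utf8 (c text);
    INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000));
    SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8;

the SELECT detoasts an 8-byte slice that ends in a partial character; previously that drew a spurious "invalid byte sequence" error, and with this fix it returns the first character as expected.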
Reported-by: SATŌ Kentarō Reviewed-by: Thomas Munro Bug: #19406 Discussion: https://postgr.es/m/19406-9867fddddd724fca@postgresql.org Backpatch-through: 14 --- src/backend/utils/adt/varlena.c | 62 +++++++++++++++++++++----- src/test/regress/expected/encoding.out | 44 +++++++++++++++++- src/test/regress/sql/encoding.sql | 19 +++++++- 3 files changed, 111 insertions(+), 14 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index dbecd7160d..d8ea7f45bc 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -133,6 +133,7 @@ static text *text_substring(Datum str, int32 start, int32 length, bool length_not_specified); +static int pg_mbcharcliplen_chars(const char *mbstr, int len, int limit); static text *text_overlay(text *t1, text *t2, int sp, int sl); static int text_position(text *t1, text *t2, Oid collid); static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state); @@ -586,7 +587,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 S = start; /* start position */ int32 S1; /* adjusted start position */ int32 L1; /* adjusted substring length */ - int32 E; /* end position */ + int32 E; /* end position, exclusive */ /* * SQL99 says S can be zero or negative (which we don't document), but we @@ -684,11 +685,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) else { /* - * A zero or negative value for the end position can happen if the - * start was negative or one. SQL99 says to return a zero-length - * string. + * Ending at position 1, exclusive, obviously yields an empty + * string. A zero or negative value can happen if the start was + * negative or one. SQL99 says to return a zero-length string. */ - if (E < 1) + if (E <= 1) return cstring_to_text(""); /* @@ -698,11 +699,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) L1 = E - S1; /* - * Total slice size in bytes can't be any longer than the start - * position plus substring length times the encoding max length. - * If that overflows, we can just use -1. + * Total slice size in bytes can't be any longer than the + * inclusive end position times the encoding max length. If that + * overflows, we can just use -1. */ - if (pg_mul_s32_overflow(E, eml, &slice_size)) + if (pg_mul_s32_overflow(E - 1, eml, &slice_size)) slice_size = -1; } @@ -725,9 +726,17 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) return cstring_to_text(""); } - /* Now we can get the actual length of the slice in MB characters */ - slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - slice_len); + /* + * Now we can get the actual length of the slice in MB characters, + * stopping at the end of the substring. Continuing beyond the + * substring end could find an incomplete character attributable + * solely to DatumGetTextPSlice() chopping in the middle of a + * character, and it would be superfluous work at best. + */ + slice_strlen = + (slice_size == -1 ? + pg_mbstrlen_with_len(VARDATA_ANY(slice), slice_len) : + pg_mbcharcliplen_chars(VARDATA_ANY(slice), slice_len, E - 1)); /* * Check that the start position wasn't > slice_strlen. If so, SQL99 @@ -782,6 +791,35 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) return NULL; } +/* + * pg_mbcharcliplen_chars - + * Mirror pg_mbcharcliplen(), except return value unit is chars, not bytes. 
+ * + * This mirrors all the dubious historical behavior, so it's static to + * discourage proliferation. The assertions are specific to the one caller. + */ +static int +pg_mbcharcliplen_chars(const char *mbstr, int len, int limit) +{ + int nch = 0; + int l; + + Assert(len > 0); + Assert(limit > 0); + Assert(pg_database_encoding_max_length() > 1); + + while (len > 0 && *mbstr) + { + l = pg_mblen_with_len(mbstr, len); + nch++; + if (nch == limit) + break; + len -= l; + mbstr += l; + } + return nch; +} + /* * textoverlay * Replace specified substring of first string with second diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out index ea1f38cff4..cac1cb7478 100644 --- a/src/test/regress/expected/encoding.out +++ b/src/test/regress/expected/encoding.out @@ -63,7 +63,13 @@ SELECT reverse(good) FROM regress_encoding; -- invalid short mb character = error SELECT length(truncated) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 -SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT substring(truncated, 1, 3) FROM regress_encoding; + substring +----------- + caf +(1 row) + +SELECT substring(truncated, 1, 4) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 SELECT reverse(truncated) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 @@ -375,7 +381,43 @@ NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK t (1 row) +-- substring fetches a slice of a toasted value; unused tail of that slice is +-- an incomplete char (bug #19406) +CREATE TABLE toast_3b_utf8 (c text); +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; + substring +----------- + … +(1 row) + +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; + substring +----------- + +(1 row) + +-- diagnose incomplete char iff within the substring +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; + substring +----------- + … +(1 row) + +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +ERROR: invalid byte sequence for encoding "UTF8": 0xe2 0x80 +-- substring needing last byte of its slice_size +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; + substring +----------- + 🚀 +(1 row) + DROP TABLE encoding_tests; +DROP TABLE toast_4b_utf8; DROP FUNCTION test_encoding; DROP FUNCTION test_text_to_wchars; DROP FUNCTION test_mblen_func; diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql index b9543c0cb3..782f90f0d6 100644 --- a/src/test/regress/sql/encoding.sql +++ b/src/test/regress/sql/encoding.sql @@ -40,7 +40,8 @@ SELECT reverse(good) FROM regress_encoding; -- invalid short mb character = error SELECT length(truncated) FROM regress_encoding; -SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT substring(truncated, 1, 3) FROM regress_encoding; +SELECT substring(truncated, 1, 4) FROM regress_encoding; SELECT reverse(truncated) FROM regress_encoding; -- invalid short mb character = silently dropped SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; @@ -212,7 +213,23 @@ INSERT INTO encoding_tests VALUES SELECT COUNT(test_encoding(encoding, description, input)) > 0 FROM encoding_tests; +-- substring fetches a slice of a toasted value; unused tail of that slice is +-- an incomplete char 
(bug #19406) +CREATE TABLE toast_3b_utf8 (c text); +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +-- diagnose incomplete char iff within the substring +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +-- substring needing last byte of its slice_size +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; + DROP TABLE encoding_tests; +DROP TABLE toast_4b_utf8; DROP FUNCTION test_encoding; DROP FUNCTION test_text_to_wchars; DROP FUNCTION test_mblen_func; From 459576303dfb47d5b9626331df1f8e8767840ede Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 16 Feb 2026 12:18:18 +0900 Subject: [PATCH 108/147] pgcrypto: Tweak error message for incorrect session key length The error message added in 379695d3cc70 referred to the public key being too long. This is confusing as it is in fact the session key included in a PGP message which is too long. This is harmless, but let's be precise about what is wrong. Per offline report. Reported-by: Zsolt Parragi Backpatch-through: 14 --- contrib/pgcrypto/expected/pgp-pubkey-session.out | 2 +- contrib/pgcrypto/px.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/pgcrypto/expected/pgp-pubkey-session.out b/contrib/pgcrypto/expected/pgp-pubkey-session.out index f724d98eb2..e57cb8fab9 100644 --- a/contrib/pgcrypto/expected/pgp-pubkey-session.out +++ b/contrib/pgcrypto/expected/pgp-pubkey-session.out @@ -44,4 +44,4 @@ ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); -ERROR: Public key too big +ERROR: Session key too big diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c index d9bf1aae81..f08bc498ac 100644 --- a/contrib/pgcrypto/px.c +++ b/contrib/pgcrypto/px.c @@ -65,7 +65,7 @@ static const struct error_desc px_err_list[] = { {PXE_PGP_UNEXPECTED_PKT, "Unexpected packet in key data"}, {PXE_PGP_MATH_FAILED, "Math operation failed"}, {PXE_PGP_SHORT_ELGAMAL_KEY, "Elgamal keys must be at least 1024 bits long"}, - {PXE_PGP_KEY_TOO_BIG, "Public key too big"}, + {PXE_PGP_KEY_TOO_BIG, "Session key too big"}, {PXE_PGP_UNKNOWN_PUBALGO, "Unknown public-key encryption algorithm"}, {PXE_PGP_WRONG_KEY, "Wrong key"}, {PXE_PGP_MULTIPLE_KEYS, From 351265a6c7fd15737e4b68cada778728fc325a8b Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Mon, 16 Feb 2026 13:57:38 +0900 Subject: [PATCH 109/147] Remove recovery.signal at recovery end when both signal files are present. When both standby.signal and recovery.signal are present, standby.signal takes precedence and the server runs in standby mode. Previously, in this case, recovery.signal was not removed at the end of standby mode (i.e., on promotion) or at the end of archive recovery, while standby.signal was removed. As a result, a leftover recovery.signal could cause a subsequent restart to enter archive recovery unexpectedly, potentially preventing the server from starting. This behavior was surprising and confusing to users. 
This commit fixes the issue by updating the recovery code to remove recovery.signal alongside standby.signal when both files are present and recovery completes. Because this code path is particularly sensitive and changes in recovery behavior can be risky for stable branches, this change is applied only to the master branch. Reported-by: Nikolay Samokhvalov Author: Fujii Masao Reviewed-by: Michael Paquier Reviewed-by: David Steele Discussion: https://postgr.es/m/CAM527d8PVAQFLt_ndTXE19F-XpDZui861882L0rLY3YihQB8qA@mail.gmail.com --- src/backend/access/transam/xlogrecovery.c | 10 ++++++---- src/test/recovery/t/002_archiving.pl | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 4fc37a031d..c0c2744d45 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1068,9 +1068,6 @@ readRecoverySignalFile(void) * Check for recovery signal files and if found, fsync them since they * represent server state information. We don't sweat too much about the * possibility of fsync failure, however. - * - * If present, standby signal file takes precedence. If neither is present - * then we won't enter archive recovery. */ if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) { @@ -1085,7 +1082,8 @@ readRecoverySignalFile(void) } standby_signal_file_found = true; } - else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) + + if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) { int fd; @@ -1099,6 +1097,10 @@ readRecoverySignalFile(void) recovery_signal_file_found = true; } + /* + * If both signal files are present, standby signal file takes precedence. + * If neither is present then we won't enter archive recovery. + */ StandbyModeRequested = false; ArchiveRecoveryRequested = false; if (standby_signal_file_found) diff --git a/src/test/recovery/t/002_archiving.pl b/src/test/recovery/t/002_archiving.pl index 883ba75b31..aa40f58e6d 100644 --- a/src/test/recovery/t/002_archiving.pl +++ b/src/test/recovery/t/002_archiving.pl @@ -115,6 +115,17 @@ recovery_end_command = 'echo recovery_end_failed > missing_dir/xyz.file' )); +# Create recovery.signal and confirm that both signal files exist. +# This is necessary to test how recovery behaves when both files are present, +# i.e., standby.signal should take precedence and both files should be +# removed at the end of recovery. +$node_standby2->set_recovery_mode(); +my $node_standby2_data = $node_standby2->data_dir; +ok(-f "$node_standby2_data/recovery.signal", + "recovery.signal is present at the beginning of recovery"); +ok(-f "$node_standby2_data/standby.signal", + "standby.signal is present at the beginning of recovery"); + $node_standby2->start; # Save the log location, to see the failure of recovery_end_command. @@ -126,7 +137,6 @@ # Check the logs of the standby to see that the commands have failed. my $log_contents = slurp_file($node_standby2->logfile, $log_location); -my $node_standby2_data = $node_standby2->data_dir; like( $log_contents, @@ -141,4 +151,10 @@ qr/WARNING:.*recovery_end_command/s, "recovery_end_command failure detected in logs after promotion"); +# Check that no signal files are present after promotion. 
+ok( !-f "$node_standby2_data/recovery.signal", + "recovery.signal was left behind after promotion"); +ok( !-f "$node_standby2_data/standby.signal", + "standby.signal was left behind after promotion"); + done_testing(); From d50c86e743755e7ea91e5980f09f8575e0cb338b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 16 Feb 2026 09:13:10 +0100 Subject: [PATCH 110/147] Change remaining StaticAssertStmt() to StaticAssertDecl() This completes the work started by commit 75f49221c22. In basebackup.c, changing the StaticAssertStmt to StaticAssertDecl results in having the same StaticAssertDecl() in 2 functions. So, it makes more sense to move it to file scope instead. Also, as it depends on some computations based on 2 tar blocks, define TAR_NUM_TERMINATION_BLOCKS. In deadlock.c, change the StaticAssertStmt to StaticAssertDecl and keep it in the function scope. Add new braces to avoid warning from -Wdeclaration-after-statement. In aset.c, change the StaticAssertStmt to StaticAssertDecl and move it to file scope. Finally, update the comments in c.h a bit. Author: Bertrand Drouvot Co-authored-by: Peter Eisentraut Discussion: https://www.postgresql.org/message-id/aYH6ii46AvGVCB84%40ip-10-97-1-34.eu-west-3.compute.internal --- src/backend/backup/basebackup.c | 17 +++++++++-------- src/backend/storage/lmgr/deadlock.c | 12 +++++++----- src/backend/utils/mmgr/aset.c | 10 ++++------ src/include/c.h | 24 +++++++++++++++++------- 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 463c0756b5..2d74c64833 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -78,6 +78,11 @@ typedef struct pg_checksum_type manifest_checksum_type; } basebackup_options; +#define TAR_NUM_TERMINATION_BLOCKS 2 + +StaticAssertDecl(TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE <= BLCKSZ, + "BLCKSZ too small for " CppAsString2(TAR_NUM_TERMINATION_BLOCKS) " tar termination blocks"); + static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, struct backup_manifest_info *manifest, IncrementalBackupInfo *ib); @@ -382,10 +387,8 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, else { /* Properly terminate the tarfile. */ - StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ, - "BLCKSZ too small for 2 tar blocks"); - memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); - bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE); + memset(sink->bbs_buffer, 0, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE); + bbsink_archive_contents(sink, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE); /* OK, that's the end of the archive. */ bbsink_end_archive(sink); @@ -635,10 +638,8 @@ perform_base_backup(basebackup_options *opt, bbsink *sink, } /* Properly terminate the tar file. */ - StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ, - "BLCKSZ too small for 2 tar blocks"); - memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); - bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE); + memset(sink->bbs_buffer, 0, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE); + bbsink_archive_contents(sink, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE); /* OK, that's the end of the archive. 
*/ bbsink_end_archive(sink); diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c index 0a8dd5eb7c..c0c4ed57d9 100644 --- a/src/backend/storage/lmgr/deadlock.c +++ b/src/backend/storage/lmgr/deadlock.c @@ -191,11 +191,13 @@ InitDeadLockChecking(void) * last MaxBackends entries in possibleConstraints[] are reserved as * output workspace for FindLockCycle. */ - StaticAssertStmt(MAX_BACKENDS_BITS <= (32 - 3), - "MAX_BACKENDS_BITS too big for * 4"); - maxPossibleConstraints = MaxBackends * 4; - possibleConstraints = - (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE)); + { + StaticAssertDecl(MAX_BACKENDS_BITS <= (32 - 3), + "MAX_BACKENDS_BITS too big for * 4"); + maxPossibleConstraints = MaxBackends * 4; + possibleConstraints = + (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE)); + } MemoryContextSwitchTo(oldcxt); } diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index ae7d1647ae..161c2e2d3d 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -87,6 +87,10 @@ #define ALLOC_CHUNK_FRACTION 4 /* We allow chunks to be at most 1/4 of maxBlockSize (less overhead) */ +/* ALLOC_CHUNK_LIMIT must be equal to ALLOCSET_SEPARATE_THRESHOLD */ +StaticAssertDecl(ALLOC_CHUNK_LIMIT == ALLOCSET_SEPARATE_THRESHOLD, + "ALLOC_CHUNK_LIMIT != ALLOCSET_SEPARATE_THRESHOLD"); + /*-------------------- * The first block allocated for an allocset has size initBlockSize. * Each time we have to allocate another block, we double the block size @@ -501,12 +505,6 @@ AllocSetContextCreateInternal(MemoryContext parent, * requests that are all the maximum chunk size we will waste at most * 1/8th of the allocated space. * - * Also, allocChunkLimit must not exceed ALLOCSET_SEPARATE_THRESHOLD. - */ - StaticAssertStmt(ALLOC_CHUNK_LIMIT == ALLOCSET_SEPARATE_THRESHOLD, - "ALLOC_CHUNK_LIMIT != ALLOCSET_SEPARATE_THRESHOLD"); - - /* * Determine the maximum size that a chunk can be before we allocate an * entire AllocBlock dedicated for that chunk. We set the absolute limit * of that size as ALLOC_CHUNK_LIMIT but we reduce it further so that we diff --git a/src/include/c.h b/src/include/c.h index 3fc09ec1e4..a249674f02 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -924,25 +924,35 @@ pg_noreturn extern void ExceptionalCondition(const char *conditionName, * * If the "condition" (a compile-time-constant expression) evaluates to false, * throw a compile error using the "errmessage" (a string literal). - * + */ + +/* * We require C11 and C++11, so static_assert() is expected to be there. * StaticAssertDecl() was previously used for portability, but it's now just a * plain wrapper and doesn't need to be used in new code. static_assert() is * a "declaration", and so it must be placed where for example a variable * declaration would be valid. As long as we compile with * -Wno-declaration-after-statement, that also means it cannot be placed after - * statements in a function. Macros StaticAssertStmt() and StaticAssertExpr() - * make it safe to use as a statement or in an expression, respectively. + * statements in a function. + */ +#define StaticAssertDecl(condition, errmessage) \ + static_assert(condition, errmessage) + +/* + * StaticAssertStmt() was previously used to make static assertions work as a + * statement, but its use is now deprecated. + */ +#define StaticAssertStmt(condition, errmessage) \ + do { static_assert(condition, errmessage); } while(0) + +/* + * StaticAssertExpr() is for use in an expression. 
 *
 * For compilers without GCC statement expressions, we fall back on a kluge
 * that assumes the compiler will complain about a negative width for a struct
 * bit-field.  This will not include a helpful error message, but it beats not
 * getting an error at all.
 */
-#define StaticAssertDecl(condition, errmessage) \
-	static_assert(condition, errmessage)
-#define StaticAssertStmt(condition, errmessage) \
-	do { static_assert(condition, errmessage); } while(0)
 #ifdef HAVE_STATEMENT_EXPRESSIONS
 #define StaticAssertExpr(condition, errmessage) \
 	((void) ({ static_assert(condition, errmessage); true; }))

From 07e90c6913586a5c46e55e162771aaa5a33811ba Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson
Date: Mon, 16 Feb 2026 15:10:16 +0100
Subject: [PATCH 111/147] Avoid using the X25519 curve in ssl tests

The X25519 curve is disallowed when OpenSSL is configured for FIPS
mode, which makes the testsuite fail.  Since X25519 isn't required for
the tests, we can remove it to allow FIPS-enabled configurations to run
the tests.

Author: Daniel Gustafsson
Reported-by: Tom Lane
Discussion: https://postgr.es/m/3521653.1770666093@sss.pgh.pa.us
---
 src/test/ssl/t/SSL/Server.pm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/ssl/t/SSL/Server.pm b/src/test/ssl/t/SSL/Server.pm
index a023fa6bde..4c101a2650 100644
--- a/src/test/ssl/t/SSL/Server.pm
+++ b/src/test/ssl/t/SSL/Server.pm
@@ -324,7 +324,7 @@ sub switch_server_cert
 	$node->append_conf('sslconfig.conf', $backend->set_server_cert(\%params));
 	# use lists of ECDH curves and cipher suites for syntax testing
 	$node->append_conf('sslconfig.conf',
-		'ssl_groups=X25519:prime256v1:secp521r1');
+		'ssl_groups=prime256v1:secp521r1');
 	$node->append_conf('sslconfig.conf',
 		'ssl_tls13_ciphers=TLS_AES_256_GCM_SHA384:TLS_AES_128_GCM_SHA256');

From db93988ab0e78396f2ed9e96c826ff988d12b9f2 Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson
Date: Mon, 16 Feb 2026 15:11:29 +0100
Subject: [PATCH 112/147] doc: Add note to ssl_groups config on X25519 and
 FIPS

The X25519 curve is not allowed when OpenSSL is configured for FIPS
mode, so add a note to the documentation that the default setting must
be altered for such setups.

Author: Daniel Gustafsson
Reported-by: Tom Lane
Discussion: https://postgr.es/m/3521653.1770666093@sss.pgh.pa.us
---
 doc/src/sgml/config.sgml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6bc2690ce0..faf0bdb62a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1563,6 +1563,15 @@ include_dir 'conf.d'
         The default is X25519:prime256v1.
+
+
+        X25519 is not allowed when
+        OpenSSL is configured for FIPS mode, and
+        must be removed from the server configuration in such
+        setups.
+
+
         OpenSSL names for the most common curves
         are:

From 6be5b76d66cadbea715defa109963939ea866922 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Mon, 16 Feb 2026 15:20:15 -0500
Subject: [PATCH 113/147] Ensure that all three build methods install the same
 set of files.

syscache_info.h was installed into $installdir/include/server/catalog
if you use a non-VPATH autoconf build, but not if you use a VPATH build
or meson.  That happened because the makefiles blindly install
src/include/catalog/*.h, and in a non-VPATH build the generated header
files would be swept up in that.
While it's hard to conjure a reason to need syscache_info.h outside of backend build, it's also hard to get the makefiles to skip syscache_info.h, so let's go the other way and install it in the other two cases too. Another problem, new in v19, was that meson builds install a copy of src/include/catalog/README, while autoconf builds do not. The issue here is that that file is new and wasn't added to meson.build's exclusion list. While it's clearly a bug if different build methods don't install the same set of files, I doubt anyone would thank us for changing the behavior in released branches. Hence, fix in master only. Author: Tom Lane Reviewed-by: Andres Freund Discussion: https://postgr.es/m/946828.1771185367@sss.pgh.pa.us --- src/include/catalog/Makefile | 3 ++- src/include/catalog/meson.build | 2 +- src/include/meson.build | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/include/catalog/Makefile b/src/include/catalog/Makefile index c90022f7c5..24b527230d 100644 --- a/src/include/catalog/Makefile +++ b/src/include/catalog/Makefile @@ -149,6 +149,7 @@ install: all installdirs ifeq ($(vpath_build),yes) $(INSTALL_DATA) schemapg.h '$(DESTDIR)$(includedir_server)'/catalog/schemapg.h $(INSTALL_DATA) syscache_ids.h '$(DESTDIR)$(includedir_server)'/catalog/syscache_ids.h + $(INSTALL_DATA) syscache_info.h '$(DESTDIR)$(includedir_server)'/catalog/syscache_info.h $(INSTALL_DATA) system_fk_info.h '$(DESTDIR)$(includedir_server)'/catalog/system_fk_info.h for file in $(GENERATED_HEADERS); do \ $(INSTALL_DATA) $$file '$(DESTDIR)$(includedir_server)'/catalog/$$file || exit; \ @@ -160,7 +161,7 @@ installdirs: uninstall: rm -f $(addprefix '$(DESTDIR)$(datadir)'/, postgres.bki system_constraints.sql) - rm -f $(addprefix '$(DESTDIR)$(includedir_server)'/catalog/, schemapg.h syscache_ids.h system_fk_info.h $(GENERATED_HEADERS)) + rm -f $(addprefix '$(DESTDIR)$(includedir_server)'/catalog/, schemapg.h syscache_ids.h syscache_info.h system_fk_info.h $(GENERATED_HEADERS)) clean: rm -f bki-stamp $(GENBKI_OUTPUT_FILES) diff --git a/src/include/catalog/meson.build b/src/include/catalog/meson.build index b63cd58406..433bcc908a 100644 --- a/src/include/catalog/meson.build +++ b/src/include/catalog/meson.build @@ -115,7 +115,7 @@ output_install = [ dir_data, dir_include_server / 'catalog', dir_include_server / 'catalog', - false, + dir_include_server / 'catalog', dir_include_server / 'catalog', ] diff --git a/src/include/meson.build b/src/include/meson.build index b940c5cd3d..7d734d92da 100644 --- a/src/include/meson.build +++ b/src/include/meson.build @@ -173,6 +173,7 @@ install_subdir('catalog', exclude_files: [ '.gitignore', 'Makefile', + 'README', 'duplicate_oids', 'meson.build', 'reformat_dat_file.pl', From b33f753612846b20ed54e296a52a6a52a6c793bc Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 16 Feb 2026 15:13:06 -0600 Subject: [PATCH 114/147] pg_upgrade: Use COPY for LO metadata for upgrades from < v12. Before v12, pg_largeobject_metadata was defined WITH OIDS, so unlike newer versions, the "oid" column was a hidden system column that pg_dump's getTableAttrs() will not pick up. Thus, for commit 161a3e8b68, we did not bother trying to use COPY for pg_largeobject_metadata for upgrades from older versions. This commit removes that restriction by adjusting the query in getTableAttrs() to pick up the "oid" system column and by teaching dumpTableData_copy() to use COPY (SELECT ...) for this catalog, since system columns cannot be used in COPY's column list. 
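As a rough illustration of the difference (a hedged sketch only: the
query pg_dump actually builds carries more columns and goes through
fmtCopyColumnList(); buildLOMetadataCopy() is an invented name):

    /*
     * Sketch, assuming libpq's pqexpbuffer.h.  COPY's column list cannot
     * name a system column, so pre-v12 servers need the SELECT form.
     */
    #include "pqexpbuffer.h"

    static void
    buildLOMetadataCopy(PQExpBuffer q, bool server_is_pre_v12)
    {
        if (server_is_pre_v12)
            appendPQExpBufferStr(q,
                                 "COPY (SELECT oid, lomowner, lomacl "
                                 "FROM pg_largeobject_metadata) TO stdout;");
        else
            appendPQExpBufferStr(q,
                                 "COPY pg_largeobject_metadata "
                                 "(oid, lomowner, lomacl) TO stdout;");
    }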
Reviewed-by: Andres Freund Discussion: https://postgr.es/m/aYzuAz_ITUpd9ZvH%40nathan --- src/bin/pg_dump/pg_backup_archiver.c | 7 +-- src/bin/pg_dump/pg_dump.c | 77 ++++++++++++++-------------- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 9007f7a0c4..7afcc0859c 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -2991,12 +2991,9 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) /* * For binary upgrade mode, dump pg_largeobject_metadata and the * associated pg_shdepend rows. This is faster to restore than the - * equivalent set of large object commands. We can only do this for - * upgrades from v12 and newer; in older versions, pg_largeobject_metadata - * was created WITH OIDS, so the OID column is hidden and won't be dumped. + * equivalent set of large object commands. */ - if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 && - strcmp(te->desc, "TABLE DATA") == 0 && + if (ropt->binary_upgrade && strcmp(te->desc, "TABLE DATA") == 0 && (te->catalogId.oid == LargeObjectMetadataRelationId || te->catalogId.oid == SharedDependRelationId)) return REQ_DATA; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index b4b7c234e2..4959830433 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -1118,11 +1118,9 @@ main(int argc, char **argv) * For binary upgrade mode, dump the pg_shdepend rows for large objects * and maybe even pg_largeobject_metadata (see comment below for details). * This is faster to restore than the equivalent set of large object - * commands. We can only do this for upgrades from v12 and newer; in - * older versions, pg_largeobject_metadata was created WITH OIDS, so the - * OID column is hidden and won't be dumped. + * commands. */ - if (dopt.binary_upgrade && fout->remoteVersion >= 120000) + if (dopt.binary_upgrade) { TableInfo *shdepend; @@ -2406,11 +2404,14 @@ dumpTableData_copy(Archive *fout, const void *dcontext) column_list = fmtCopyColumnList(tbinfo, clistBuf); /* - * Use COPY (SELECT ...) TO when dumping a foreign table's data, and when - * a filter condition was specified. For other cases a simple COPY - * suffices. + * Use COPY (SELECT ...) TO when dumping a foreign table's data, when a + * filter condition was specified, and when in binary upgrade mode and + * dumping an old pg_largeobject_metadata defined WITH OIDS. For other + * cases a simple COPY suffices. */ - if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE) + if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE || + (fout->dopt->binary_upgrade && fout->remoteVersion < 120000 && + tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId)) { /* Temporary allows to access to foreign tables to dump data */ if (tbinfo->relkind == RELKIND_FOREIGN_TABLE) @@ -3967,14 +3968,13 @@ getLOs(Archive *fout) "FROM pg_largeobject_metadata "); /* - * For binary upgrades from v12 or newer, we transfer - * pg_largeobject_metadata via COPY or by copying/linking its files from - * the old cluster. On such upgrades, we only need to consider large - * objects that have comments or security labels, since we still restore - * those objects via COMMENT/SECURITY LABEL commands. + * For binary upgrades, we transfer pg_largeobject_metadata via COPY or by + * copying/linking its files from the old cluster. 
On such upgrades, we + * only need to consider large objects that have comments or security + * labels, since we still restore those objects via COMMENT/SECURITY LABEL + * commands. */ - if (dopt->binary_upgrade && - fout->remoteVersion >= 120000) + if (dopt->binary_upgrade) appendPQExpBufferStr(loQry, "WHERE oid IN " "(SELECT objoid FROM pg_description " @@ -4063,25 +4063,13 @@ getLOs(Archive *fout) loinfo->dobj.components |= DUMP_COMPONENT_ACL; /* - * In binary-upgrade mode for LOs, we do *not* dump out the LO data, - * as it will be copied by pg_upgrade, which simply copies the - * pg_largeobject table. - * - * The story for LO metadata is more complicated. For upgrades from - * versions older than v12, we use ordinary SQL commands to restore - * both the content of pg_largeobject_metadata and any associated - * pg_shdepend rows. For upgrades from newer versions, we transfer - * this information via COPY or by copying/linking the files from the - * old cluster. For such upgrades, we do not need to dump the data, - * ACLs, or definitions of large objects. + * In binary upgrade mode, pg_largeobject and pg_largeobject_metadata + * are transferred via COPY or by copying/linking the files from the + * old cluster. Thus, we do not need to dump LO data, definitions, or + * ACLs. */ if (dopt->binary_upgrade) - { - if (fout->remoteVersion >= 120000) - loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION); - else - loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA; - } + loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION); /* * Create a "BLOBS" data item for the group, too. This is just a @@ -9296,12 +9284,10 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) * pg_shdepend so that the columns names are collected for the * corresponding COPY commands. Restoring the data for those catalogs * is faster than restoring the equivalent set of large object - * commands. We can only do this for upgrades from v12 and newer; in - * older versions, pg_largeobject_metadata was created WITH OIDS, so - * the OID column is hidden and won't be dumped. + * commands. 
 	 */
 		if (!tbinfo->interesting &&
-			!(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+			!(fout->dopt->binary_upgrade &&
 			  (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
 			   tbinfo->dobj.catId.oid == SharedDependRelationId)))
 			continue;
@@ -9442,7 +9428,18 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
 							 "(pt.classoid = co.tableoid AND pt.objoid = co.oid)\n");
 
 	appendPQExpBufferStr(q,
-						 "WHERE a.attnum > 0::pg_catalog.int2\n"
+						 "WHERE a.attnum > 0::pg_catalog.int2\n");
+
+	/*
+	 * For binary upgrades from versions older than v12, also fetch the
+	 * hidden "oid" system column of pg_largeobject_metadata so that its
+	 * name is collected for the corresponding COPY command.
+	 */
+	if (fout->dopt->binary_upgrade && fout->remoteVersion < 120000)
+		appendPQExpBufferStr(q,
+							 "OR (a.attnum = -2::pg_catalog.int2 AND src.tbloid = "
+							 CppAsString2(LargeObjectMetadataRelationId) ")\n");
+
+	appendPQExpBufferStr(q,
 						 "ORDER BY a.attrelid, a.attnum");
 
 	res = ExecuteSqlQuery(fout, q->data, PGRES_TUPLES_OK);
@@ -9510,7 +9507,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
 		/* cross-check that we only got requested tables */
 		if (tbinfo->relkind == RELKIND_SEQUENCE ||
 			(!tbinfo->interesting &&
-			 !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+			 !(fout->dopt->binary_upgrade &&
 			   (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
 				tbinfo->dobj.catId.oid == SharedDependRelationId))))
 			pg_fatal("unexpected column data for table \"%s\"",
@@ -9544,7 +9541,9 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
 
 		for (int j = 0; j < numatts; j++, r++)
 		{
-			if (j + 1 != atoi(PQgetvalue(res, r, i_attnum)))
+			if (j + 1 != atoi(PQgetvalue(res, r, i_attnum)) &&
+				!(fout->dopt->binary_upgrade && fout->remoteVersion < 120000 &&
+				  tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId))
 				pg_fatal("invalid column numbering in table \"%s\"",
 						 tbinfo->dobj.name);
 			tbinfo->attnames[j] = pg_strdup(PQgetvalue(res, r, i_attname));

From a6f823e77835a075265bc6accce0a17370ed6db5 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Tue, 17 Feb 2026 08:41:26 +0900
Subject: [PATCH 115/147] hstore: Fix NULL pointer dereference with receive
 function

The receive function of hstore was not able to correctly handle
duplicate keys when a later duplicate links to a NULL value: a pfree()
could be attempted on a NULL pointer, crashing due to a pointer
dereference.  This problem would happen for a COPY BINARY, when
stacking values like this:

aa => 5
aa => null

The second key/value pair is discarded and pfree() calls are attempted
on its key and its value, leading to a NULL pointer dereference for the
value part, as the value is NULL.  The first key/value pair takes
priority when a duplicate is found.

Per offline report.

Reported-by: "Anemone"
Reported-by: "A1ex"
Backpatch-through: 14
---
 contrib/hstore/hstore_io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 9cdfcb5daa..9b72efb867 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -385,7 +385,8 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen)
 			if (ptr->needfree)
 			{
 				pfree(ptr->key);
-				pfree(ptr->val);
+				if (ptr->val != NULL)
+					pfree(ptr->val);
 			}
 		}
 		else

From 8cef93d8a5886b57099fcd32e92f12bc5df7bbd1 Mon Sep 17 00:00:00 2001
From: Noah Misch
Date: Mon, 16 Feb 2026 18:04:58 -0800
Subject: [PATCH 116/147] Suppress new "may be used uninitialized" warning.

Various buildfarm members, having compilers like gcc 8.5 and 6.3, fail
to deduce that text_substring() variable "E" is initialized if
slice_size != -1.  This suppression approach quiets gcc 8.5; I did not
reproduce the warning elsewhere.
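To illustrate the pattern with a stand-alone sketch (demo code under
assumed names, not the varlena.c function itself): assign the variable
on every branch, even ones where it is never consumed, so that older
compilers can prove it is initialized.

    #include <stdbool.h>

    /* Hypothetical demo of the suppression idiom. */
    static int
    demo_substring_end(int start, int length, bool length_given)
    {
        int     end;
        int     slice_size;

        if (!length_given)
            end = slice_size = -1;  /* end unused; quiets old compilers */
        else if (length < 0)
            end = slice_size = -1;  /* error path; quiets old compilers */
        else
        {
            slice_size = length;
            end = start + length;
        }

        return (slice_size == -1) ? start : end;
    }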
Back-patch to v14, like commit 9f4fd119b2cbb9a41ec0c19a8d6ec9b59b92c125.

Reported-by: Tom Lane
Discussion: https://postgr.es/m/1157953.1771266105@sss.pgh.pa.us
Backpatch-through: 14
---
 src/backend/utils/adt/varlena.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index d8ea7f45bc..7caf700fd6 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -665,14 +665,14 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 
 		if (length_not_specified)	/* special case - get length to end of
 									 * string */
-			slice_size = L1 = -1;
+			E = slice_size = L1 = -1;
 		else if (length < 0)
 		{
 			/* SQL99 says to throw an error for E < S, i.e., negative length */
 			ereport(ERROR,
 					(errcode(ERRCODE_SUBSTRING_ERROR),
 					 errmsg("negative substring length not allowed")));
-			slice_size = L1 = -1;	/* silence stupider compilers */
+			E = slice_size = L1 = -1;	/* silence stupider compilers */
 		}
 		else if (pg_add_s32_overflow(S, length, &E))
 		{

From bd626ef093b2e5f279285a344c8ac53d67cdfd76 Mon Sep 17 00:00:00 2001
From: Thomas Munro
Date: Tue, 17 Feb 2026 13:53:32 +1300
Subject: [PATCH 117/147] Fix test_valid_server_encoding helper function.

Commit c67bef3f325 introduced this test helper function for use by
src/test/regress/sql/encoding.sql, but its logic was incorrect.  It
confused an encoding ID with a boolean, so it gave the wrong results
for some inputs, and it also forgot the usual return macro.  The
mistake didn't affect values actually used in the test, so there is no
change in behavior.

Also drop it and another missed function at the end of the test, for
consistency.

Backpatch-through: 14
Author: Zsolt Parragi
---
 src/test/regress/expected/encoding.out | 2 ++
 src/test/regress/regress.c             | 2 +-
 src/test/regress/sql/encoding.sql      | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out
index cac1cb7478..b3655527b0 100644
--- a/src/test/regress/expected/encoding.out
+++ b/src/test/regress/expected/encoding.out
@@ -419,7 +419,9 @@ SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8;
 DROP TABLE encoding_tests;
 DROP TABLE toast_4b_utf8;
 DROP FUNCTION test_encoding;
+DROP FUNCTION test_wchars_to_text;
 DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_valid_server_encoding;
 DROP FUNCTION test_mblen_func;
 DROP FUNCTION test_bytea_to_text;
 DROP FUNCTION test_text_to_bytea;
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 96cf30ac92..a02f41c972 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1251,7 +1251,7 @@ PG_FUNCTION_INFO_V1(test_valid_server_encoding);
 Datum
 test_valid_server_encoding(PG_FUNCTION_ARGS)
 {
-	return pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0)));
+	PG_RETURN_BOOL(pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0))) >= 0);
 }
 
 /* Provide SQL access to IsBinaryCoercible() */
diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql
index 782f90f0d6..d591818c3e 100644
--- a/src/test/regress/sql/encoding.sql
+++ b/src/test/regress/sql/encoding.sql
@@ -231,7 +231,9 @@ SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8;
 DROP TABLE encoding_tests;
 DROP TABLE toast_4b_utf8;
 DROP FUNCTION test_encoding;
+DROP FUNCTION test_wchars_to_text;
 DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_valid_server_encoding;
 DROP FUNCTION test_mblen_func;
 DROP FUNCTION test_bytea_to_text;
 DROP FUNCTION test_text_to_bytea;

From 3d28ecb5ac76ae29ca0837dc81e7dacc2ab215eb Mon Sep 17 00:00:00 2001
From: Peter Eisentraut
Date: Tue, 17 Feb 2026 10:06:32 +0100
Subject: [PATCH 118/147] Test List macros in C++ extensions

All of these macros already work in C++ with Clang and GCC (the only
compilers we're currently testing C++ extension support for).  This
adds a regression test for them in our test C++ extension, so we can
safely change their implementation without accidentally breaking C++.

Some of the List macros didn't work in C++ in the past (see commit
d5ca15ee5), and this would have caught that.

Author: Jelte Fennema-Nio
Discussion: https://www.postgresql.org/message-id/flat/CAGECzQR21OnnKiZO_1rLWO0-16kg1JBxnVq-wymYW0-_1cUNtg@mail.gmail.com
---
 .../test_cplusplusext/test_cplusplusext.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/test/modules/test_cplusplusext/test_cplusplusext.cpp b/src/test/modules/test_cplusplusext/test_cplusplusext.cpp
index 435937c00d..f1a2ab7f2b 100644
--- a/src/test/modules/test_cplusplusext/test_cplusplusext.cpp
+++ b/src/test/modules/test_cplusplusext/test_cplusplusext.cpp
@@ -17,6 +17,8 @@ extern "C"
 {
 #include "postgres.h"
 #include "fmgr.h"
+#include "nodes/pg_list.h"
+#include "nodes/primnodes.h"
 
 PG_MODULE_MAGIC;
 
@@ -32,6 +34,21 @@ test_cplusplus_add(PG_FUNCTION_ARGS)
 {
 	int32		a = PG_GETARG_INT32(0);
 	int32		b = PG_GETARG_INT32(1);
+	RangeTblRef *node = makeNode(RangeTblRef);
+	List	   *list = list_make1(node);
+
+	foreach_ptr(RangeTblRef, rtr, list)
+	{
+		(void) rtr;
+	}
+
+	foreach_node(RangeTblRef, rtr, list)
+	{
+		(void) rtr;
+	}
+
+	list_free(list);
+	pfree(node);
 
 	PG_RETURN_INT32(a + b);
 }

From 451650eaacd5e482380f2b0e506fc81f5aa92fc7 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut
Date: Tue, 17 Feb 2026 10:06:39 +0100
Subject: [PATCH 119/147] Test most StaticAssert macros in C++ extensions

Most of the StaticAssert macros already worked in C++ with Clang and
GCC (the only compilers we're currently testing C++ extension support
for).  This adds a regression test for them in our test C++ extension,
so we can safely change their implementation without accidentally
breaking C++.

The only StaticAssert macros that don't work yet are
StaticAssertVariableIsOfType and StaticAssertVariableIsOfTypeMacro.
These will be addressed in a follow-on commit.

Author: Jelte Fennema-Nio
Discussion: https://www.postgresql.org/message-id/flat/CAGECzQR21OnnKiZO_1rLWO0-16kg1JBxnVq-wymYW0-_1cUNtg@mail.gmail.com
---
 src/test/modules/test_cplusplusext/test_cplusplusext.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/test/modules/test_cplusplusext/test_cplusplusext.cpp b/src/test/modules/test_cplusplusext/test_cplusplusext.cpp
index f1a2ab7f2b..8c2eabcca4 100644
--- a/src/test/modules/test_cplusplusext/test_cplusplusext.cpp
+++ b/src/test/modules/test_cplusplusext/test_cplusplusext.cpp
@@ -25,6 +25,8 @@ PG_MODULE_MAGIC;
 PG_FUNCTION_INFO_V1(test_cplusplus_add);
 }
 
+StaticAssertDecl(sizeof(int32) == 4, "int32 should be 4 bytes");
+
 /*
  * Simple function that returns the sum of two integers.  This verifies that
  * C++ extension modules can be loaded and called correctly at runtime.
@@ -47,6 +49,9 @@ test_cplusplus_add(PG_FUNCTION_ARGS) (void) rtr; } + StaticAssertStmt(sizeof(int32) == 4, "int32 should be 4 bytes"); + (void) StaticAssertExpr(sizeof(int64) == 8, "int64 should be 8 bytes"); + list_free(list); pfree(node); From a92b809f9da100ed5cef9d6b6ce2edd576449521 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Feb 2026 17:18:31 +0200 Subject: [PATCH 120/147] Ignore SIGINT in walwriter and walsummarizer Previously, SIGINT was treated the same as SIGTERM in walwriter and walsummarizer. That decision goes back to when the walwriter process was introduced (commit ad4295728e04), and was later copied to walsummarizer. It was a pretty arbitrary decision back then, and we haven't adopted that convention in all the other processes that have been introduced later. Summary of how other processes respond to SIGINT: - Autovacuum launcher: Cancel the current iteration of launching - bgworker: Ignore (unless connected to a database) - checkpointer: Request shutdown checkpoint - bgwriter: Ignore - pgarch: Ignore - startup process: Ignore - walreceiver: Ignore - IO worker: die() IO workers are a notable exception in that they exit on SIGINT, and there's a documented reason for that: IO workers ignore SIGTERM, so SIGINT provides a way to manually kill them. (They do respond to SIGUSR2, though, like all the other processes that we don't want to exit immediately on SIGTERM on operating system shutdown.) To make this a little more consistent, ignore SIGINT in walwriter and walsummarizer. They have no "query" to cancel, and they react to SIGTERM just fine. Reviewed-by: Andres Freund Discussion: https://www.postgresql.org/message-id/818bafaf-1e77-4c78-8037-d7120878d87c@iki.fi --- src/backend/postmaster/walsummarizer.c | 5 +---- src/backend/postmaster/walwriter.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 2d8f57099f..742137edad 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -241,12 +241,9 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) /* * Properly accept or ignore signals the postmaster might send us - * - * We have no particular use for SIGINT at the moment, but seems - * reasonable to treat like SIGTERM. */ pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGINT, SignalHandlerForShutdownRequest); + pqsignal(SIGINT, SIG_IGN); /* no query to cancel */ pqsignal(SIGTERM, SignalHandlerForShutdownRequest); /* SIGQUIT handler was already set up by InitPostmasterChild */ pqsignal(SIGALRM, SIG_IGN); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 23e79a3234..7c0e2809c1 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -98,12 +98,9 @@ WalWriterMain(const void *startup_data, size_t startup_data_len) /* * Properly accept or ignore signals the postmaster might send us - * - * We have no particular use for SIGINT at the moment, but seems - * reasonable to treat like SIGTERM. 
 	 */
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGINT, SignalHandlerForShutdownRequest);
+	pqsignal(SIGINT, SIG_IGN);	/* no query to cancel */
 	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
 	/* SIGQUIT handler was already set up by InitPostmasterChild */
 	pqsignal(SIGALRM, SIG_IGN);

From 661237056b3ad92af40bc674459152d5ea0a58bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?=
Date: Tue, 17 Feb 2026 16:38:24 +0100
Subject: [PATCH 121/147] Fix memory leak in new GUC check_hook

Commit 38e0190ced71 forgot to pfree() an allocation in one of several
spots in check_log_min_messages(), even though it is freed in the other
places of that function.  Add the missing pfree().  Per Coverity.

While at it, avoid open-coding guc_strdup().  The new coding does a
strlen() that wasn't there before, but I doubt it's measurable.
---
 src/backend/utils/error/elog.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index 59315e94e3..cb1c9d85ff 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -2363,11 +2363,12 @@ check_log_min_messages(char **newval, void **extra, GucSource source)
 			appendStringInfo(&buf, ", %s", elem);
 	}
 
-	result = (char *) guc_malloc(LOG, buf.len + 1);
+	result = guc_strdup(LOG, buf.data);
 	if (!result)
+	{
+		pfree(buf.data);
 		return false;
-	memcpy(result, buf.data, buf.len);
-	result[buf.len] = '\0';
+	}
 
 	guc_free(*newval);
 	*newval = result;

From b7271aa1d71acda712a372213633fdb55c1465c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?=
Date: Tue, 17 Feb 2026 17:59:45 +0100
Subject: [PATCH 122/147] Use a bitmask for ExecInsertIndexTuples options
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... instead of passing a bunch of separate booleans.  Also, rearrange
the argument list in a hopefully more sensible order.
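A minimal sketch of the resulting calling convention (the EIIT_* values
are the ones added to executor.h; demo_build_flags() is invented for
illustration):

    #include <stdbool.h>

    typedef unsigned int bits32;    /* as typedef'd in c.h */

    #define EIIT_IS_UPDATE          (1<<0)
    #define EIIT_NO_DUPE_ERROR      (1<<1)
    #define EIIT_ONLY_SUMMARIZING   (1<<2)

    static bits32
    demo_build_flags(bool is_update, bool speculative, bool only_summarizing)
    {
        bits32      flags = 0;

        if (is_update)
            flags |= EIIT_IS_UPDATE;
        if (speculative)
            flags |= EIIT_NO_DUPE_ERROR;
        if (only_summarizing)
            flags |= EIIT_ONLY_SUMMARIZING;

        /* callees test single bits: (flags & EIIT_ONLY_SUMMARIZING) etc. */
        return flags;
    }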
Discussion: https://postgr.es/m/202602111846.xpvuccb3inbx@alvherre.pgsql Reviewed-by: Andres Freund Reviewed-by: Fabrízio de Royes Mello (older version) --- src/backend/commands/copyfrom.c | 14 ++++------- src/backend/executor/execIndexing.c | 35 +++++++++++++------------- src/backend/executor/execReplication.c | 30 ++++++++++++++++------ src/backend/executor/nodeModifyTable.c | 28 +++++++++++---------- src/include/executor/executor.h | 15 ++++++----- 5 files changed, 68 insertions(+), 54 deletions(-) diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index 25ee20b23d..2b7556b287 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -572,8 +572,8 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo, cstate->cur_lineno = buffer->linenos[i]; recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - buffer->slots[i], estate, false, - false, NULL, NIL, false); + estate, 0, buffer->slots[i], + NIL, NULL); ExecARInsertTriggers(estate, resultRelInfo, slots[i], recheckIndexes, cstate->transition_capture); @@ -1429,13 +1429,9 @@ CopyFrom(CopyFromState cstate) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - myslot, - estate, - false, - false, - NULL, - NIL, - false); + estate, 0, + myslot, NIL, + NULL); } /* AFTER ROW INSERT Triggers */ diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index f0ba7eac87..9d071e495c 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -276,18 +276,18 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * into all the relations indexing the result relation * when a heap tuple is inserted into the result relation. * - * When 'update' is true and 'onlySummarizing' is false, + * When EIIT_IS_UPDATE is set and EIIT_ONLY_SUMMARIZING isn't, * executor is performing an UPDATE that could not use an * optimization like heapam's HOT (in more general terms a * call to table_tuple_update() took place and set * 'update_indexes' to TU_All). Receiving this hint makes * us consider if we should pass down the 'indexUnchanged' * hint in turn. That's something that we figure out for - * each index_insert() call iff 'update' is true. - * (When 'update' is false we already know not to pass the + * each index_insert() call iff EIIT_IS_UPDATE is set. + * (When that flag is not set we already know not to pass the * hint to any index.) * - * If onlySummarizing is set, an equivalent optimization to + * If EIIT_ONLY_SUMMARIZING is set, an equivalent optimization to * HOT has been applied and any updated columns are indexed * only by summarizing indexes (or in more general terms a * call to table_tuple_update() took place and set @@ -298,23 +298,21 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * Unique and exclusion constraints are enforced at the same * time. This returns a list of index OIDs for any unique or * exclusion constraints that are deferred and that had - * potential (unconfirmed) conflicts. (if noDupErr == true, + * potential (unconfirmed) conflicts. (if EIIT_NO_DUPE_ERROR, * the same is done for non-deferred constraints, but report * if conflict was speculative or deferred conflict to caller) * - * If 'arbiterIndexes' is nonempty, noDupErr applies only to - * those indexes. NIL means noDupErr applies to all indexes. + * If 'arbiterIndexes' is nonempty, EIIT_NO_DUPE_ERROR applies only to + * those indexes. NIL means EIIT_NO_DUPE_ERROR applies to all indexes. 
* ---------------------------------------------------------------- */ List * ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate, - bool update, - bool noDupErr, - bool *specConflict, + bits32 flags, + TupleTableSlot *slot, List *arbiterIndexes, - bool onlySummarizing) + bool *specConflict) { ItemPointer tupleid = &slot->tts_tid; List *result = NIL; @@ -374,7 +372,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, * Skip processing of non-summarizing indexes if we only update * summarizing indexes */ - if (onlySummarizing && !indexInfo->ii_Summarizing) + if ((flags & EIIT_ONLY_SUMMARIZING) && !indexInfo->ii_Summarizing) continue; /* Check for partial index */ @@ -409,7 +407,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, isnull); /* Check whether to apply noDupErr to this index */ - applyNoDupErr = noDupErr && + applyNoDupErr = (flags & EIIT_NO_DUPE_ERROR) && (arbiterIndexes == NIL || list_member_oid(arbiterIndexes, indexRelation->rd_index->indexrelid)); @@ -441,10 +439,11 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, * index. If we're being called as part of an UPDATE statement, * consider if the 'indexUnchanged' = true hint should be passed. */ - indexUnchanged = update && index_unchanged_by_update(resultRelInfo, - estate, - indexInfo, - indexRelation); + indexUnchanged = ((flags & EIIT_IS_UPDATE) && + index_unchanged_by_update(resultRelInfo, + estate, + indexInfo, + indexRelation)); satisfiesConstraint = index_insert(indexRelation, /* index relation */ diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 743b1ee2b2..2497ee7edc 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -846,11 +846,18 @@ ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo, conflictindexes = resultRelInfo->ri_onConflictArbiterIndexes; if (resultRelInfo->ri_NumIndices > 0) + { + bits32 flags; + + if (conflictindexes != NIL) + flags = EIIT_NO_DUPE_ERROR; + else + flags = 0; recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, false, - conflictindexes ? true : false, - &conflict, - conflictindexes, false); + estate, flags, + slot, conflictindexes, + &conflict); + } /* * Checks the conflict indexes to fetch the conflicting local row and @@ -943,11 +950,18 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, conflictindexes = resultRelInfo->ri_onConflictArbiterIndexes; if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) + { + bits32 flags = EIIT_IS_UPDATE; + + if (conflictindexes != NIL) + flags |= EIIT_NO_DUPE_ERROR; + if (update_indexes == TU_Summarizing) + flags |= EIIT_ONLY_SUMMARIZING; recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, true, - conflictindexes ? 
true : false,
-											   &conflict, conflictindexes,
-											   (update_indexes == TU_Summarizing));
+											   estate, flags,
+											   slot, conflictindexes,
+											   &conflict);
+	}
 
 	/*
 	 * Refer to the comments above the call to CheckAndReportConflict() in
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 6802fc13e9..793c76d4f8 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -1226,10 +1226,9 @@ ExecInsert(ModifyTableContext *context,
 
 			/* insert index entries for tuple */
 			recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
-												   slot, estate, false, true,
-												   &specConflict,
-												   arbiterIndexes,
-												   false);
+												   estate, EIIT_NO_DUPE_ERROR,
+												   slot, arbiterIndexes,
+												   &specConflict);
 
 			/* adjust the tuple's state accordingly */
 			table_tuple_complete_speculative(resultRelationDesc, slot,
@@ -1266,10 +1265,9 @@ ExecInsert(ModifyTableContext *context,
 
 			/* insert index entries for tuple */
 			if (resultRelInfo->ri_NumIndices > 0)
-				recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
-													   slot, estate, false,
-													   false, NULL, NIL,
-													   false);
+				recheckIndexes = ExecInsertIndexTuples(resultRelInfo, estate,
+													   0, slot, NIL,
+													   NULL);
 		}
 	}
 
@@ -2356,11 +2354,15 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt,
 	/* insert index entries for tuple if necessary */
 	if (resultRelInfo->ri_NumIndices > 0 &&
 		(updateCxt->updateIndexes != TU_None))
-		recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
-											   slot, context->estate,
-											   true, false,
-											   NULL, NIL,
-											   (updateCxt->updateIndexes == TU_Summarizing));
+	{
+		bits32		flags = EIIT_IS_UPDATE;
+
+		if (updateCxt->updateIndexes == TU_Summarizing)
+			flags |= EIIT_ONLY_SUMMARIZING;
+		recheckIndexes = ExecInsertIndexTuples(resultRelInfo, context->estate,
+											   flags, slot, NIL,
+											   NULL);
+	}
 
 	/* AFTER ROW UPDATE Triggers */
 	ExecARUpdateTriggers(context->estate, resultRelInfo,
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 55a7d930d2..d46ba59895 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -739,12 +739,15 @@ extern Bitmapset *ExecGetAllUpdatedCols(ResultRelInfo *relinfo, EState *estate);
 */
 extern void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative);
 extern void ExecCloseIndices(ResultRelInfo *resultRelInfo);
-extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo,
-								   TupleTableSlot *slot, EState *estate,
-								   bool update,
-								   bool noDupErr,
-								   bool *specConflict, List *arbiterIndexes,
-								   bool onlySummarizing);
+
+/* flags for ExecInsertIndexTuples */
+#define EIIT_IS_UPDATE			(1<<0)
+#define EIIT_NO_DUPE_ERROR		(1<<1)
+#define EIIT_ONLY_SUMMARIZING	(1<<2)
+extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, EState *estate,
+								   bits32 options, TupleTableSlot *slot,
+								   List *arbiterIndexes,
+								   bool *specConflict);
 extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo,
 									  TupleTableSlot *slot, EState *estate,
 									  ItemPointer conflictTid,

From f7df12a66cc90ed12d24edba31eeedfb546ef14c Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Wed, 18 Feb 2026 08:47:58 +0900
Subject: [PATCH 123/147] Fix off-by-one issue with cache ID in
 objectaddress.c

get_catalog_object_by_oid_extended() has been doing a syscache lookup
when given a cache ID strictly higher than 0, which is wrong because
the first valid value of SysCacheIdentifier is 0.  This issue had no
consequences, as the first value assigned in the enum SysCacheIdentifier
is AGGFNOID, which is not used in the object type properties listed in
objectaddress.c.
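To illustrate the boundary condition with a stand-alone sketch (demo
names, not the objectaddress.c code):

    #include <stdbool.h>

    enum DemoCacheId
    {
        DEMO_FIRST_CACHE = 0,       /* first valid ID, like AGGFNOID */
        DEMO_SECOND_CACHE,
    };

    static bool
    demo_has_syscache(int cacheid)
    {
        /* -1 means "no syscache"; 0 is valid, so test >= 0, not > 0 */
        return cacheid >= 0;
    }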
Even if an ID of 0 was hypothetically given, the code would still work,
just with a less efficient heap-or-index scan.

Discussion: https://postgr.es/m/aZTr_R6JGmqokUBb@paquier.xyz
---
 src/backend/catalog/objectaddress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c
index 02af64b82c..198caf641a 100644
--- a/src/backend/catalog/objectaddress.c
+++ b/src/backend/catalog/objectaddress.c
@@ -2808,7 +2808,7 @@ get_catalog_object_by_oid_extended(Relation catalog,
 	Oid			classId = RelationGetRelid(catalog);
 	int			oidCacheId = get_object_catcache_oid(classId);
 
-	if (oidCacheId > 0)
+	if (oidCacheId >= 0)
 	{
 		if (locktup)
 			tuple = SearchSysCacheLockedCopy1(oidCacheId,

From c06b5b99bbb0d0e5ddeea9661ec7678e3cf53b4c Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Wed, 18 Feb 2026 09:25:52 +0900
Subject: [PATCH 124/147] Add concept of invalid value to SysCacheIdentifier

This commit tweaks the generation of the syscache IDs for the enum
SysCacheIdentifier to now include an invalid value, with -1 assigned as
its value.

The concept of an invalid syscache ID exists when handling lookups of
an ObjectAddress, based on their set of properties in
ObjectPropertyType.  -1 is used for the case where an object type has
no option for a syscache lookup.

This has independently been found useful while discussing a switch of
SysCacheIdentifier to a typedef, as we already have places that want to
know about the concept of an invalid value when dealing with
ObjectAddresses.

Reviewed-by: Andreas Karlsson
Discussion: https://postgr.es/m/aZQRnmp9nVjtxAHS@paquier.xyz
---
 src/backend/catalog/aclchk.c        |  2 +-
 src/backend/catalog/genbki.pl       | 11 ++++-
 src/backend/catalog/objectaddress.c | 62 ++++++++++++++---------------
 3 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c
index a431fc0926..56060ca544 100644
--- a/src/backend/catalog/aclchk.c
+++ b/src/backend/catalog/aclchk.c
@@ -4101,7 +4101,7 @@ object_ownercheck(Oid classid, Oid objectid, Oid roleid)
 		classid = LargeObjectMetadataRelationId;
 
 	cacheid = get_object_catcache_oid(classid);
-	if (cacheid != -1)
+	if (cacheid != SYSCACHEID_INVALID)
 	{
 		/* we can get the object's tuple from the syscache */
 		HeapTuple	tuple;
diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl
index b2c1b1c573..975cc53435 100644
--- a/src/backend/catalog/genbki.pl
+++ b/src/backend/catalog/genbki.pl
@@ -797,7 +797,7 @@ print_boilerplate($syscache_ids_fh, "syscache_ids.h", "SysCache identifiers");
 print $syscache_ids_fh "enum SysCacheIdentifier
 {
-";
+\tSYSCACHEID_INVALID = -1,\n";
 
 print_boilerplate($syscache_info_fh, "syscache_info.h",
 	"SysCache definitions");
@@ -812,7 +812,14 @@ my $last_syscache;
 
 foreach my $syscache (sort keys %syscaches)
 {
-	print $syscache_ids_fh "\t$syscache,\n";
+	if (not defined $last_syscache)
+	{
+		print $syscache_ids_fh "\t$syscache = 0,\n";
+	}
+	else
+	{
+		print $syscache_ids_fh "\t$syscache,\n";
+	}
 	$last_syscache = $syscache;
 
 	print $syscache_info_fh "\t[$syscache] = {\n";
diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c
index 198caf641a..33be619090 100644
--- a/src/backend/catalog/objectaddress.c
+++ b/src/backend/catalog/objectaddress.c
@@ -135,8 +135,8 @@ static const ObjectPropertyType ObjectProperty[] =
 		"access method operator",
 		AccessMethodOperatorRelationId,
 		AccessMethodOperatorOidIndexId,
-		-1,
-		-1,
+		SYSCACHEID_INVALID,
+		SYSCACHEID_INVALID,
Anum_pg_amop_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -149,8 +149,8 @@ static const ObjectPropertyType ObjectProperty[] = "access method procedure", AccessMethodProcedureRelationId, AccessMethodProcedureOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_amproc_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -163,8 +163,8 @@ static const ObjectPropertyType ObjectProperty[] = "cast", CastRelationId, CastOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_cast_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -178,7 +178,7 @@ static const ObjectPropertyType ObjectProperty[] = CollationRelationId, CollationOidIndexId, COLLOID, - -1, /* COLLNAMEENCNSP also takes encoding */ + SYSCACHEID_INVALID, /* COLLNAMEENCNSP also takes encoding */ Anum_pg_collation_oid, Anum_pg_collation_collname, Anum_pg_collation_collnamespace, @@ -192,7 +192,7 @@ static const ObjectPropertyType ObjectProperty[] = ConstraintRelationId, ConstraintOidIndexId, CONSTROID, - -1, + SYSCACHEID_INVALID, Anum_pg_constraint_oid, Anum_pg_constraint_conname, Anum_pg_constraint_connamespace, @@ -220,7 +220,7 @@ static const ObjectPropertyType ObjectProperty[] = DatabaseRelationId, DatabaseOidIndexId, DATABASEOID, - -1, + SYSCACHEID_INVALID, Anum_pg_database_oid, Anum_pg_database_datname, InvalidAttrNumber, @@ -233,8 +233,8 @@ static const ObjectPropertyType ObjectProperty[] = "default ACL", DefaultAclRelationId, DefaultAclOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_default_acl_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -247,8 +247,8 @@ static const ObjectPropertyType ObjectProperty[] = "extension", ExtensionRelationId, ExtensionOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_extension_oid, Anum_pg_extension_extname, InvalidAttrNumber, /* extension doesn't belong to extnamespace */ @@ -290,7 +290,7 @@ static const ObjectPropertyType ObjectProperty[] = ProcedureRelationId, ProcedureOidIndexId, PROCOID, - -1, /* PROCNAMEARGSNSP also takes argument types */ + SYSCACHEID_INVALID, /* PROCNAMEARGSNSP also takes argument types */ Anum_pg_proc_oid, Anum_pg_proc_proname, Anum_pg_proc_pronamespace, @@ -317,8 +317,8 @@ static const ObjectPropertyType ObjectProperty[] = "large object metadata", LargeObjectMetadataRelationId, LargeObjectMetadataOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_largeobject_metadata_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -332,7 +332,7 @@ static const ObjectPropertyType ObjectProperty[] = OperatorClassRelationId, OpclassOidIndexId, CLAOID, - -1, /* CLAAMNAMENSP also takes opcmethod */ + SYSCACHEID_INVALID, /* CLAAMNAMENSP also takes opcmethod */ Anum_pg_opclass_oid, Anum_pg_opclass_opcname, Anum_pg_opclass_opcnamespace, @@ -346,7 +346,7 @@ static const ObjectPropertyType ObjectProperty[] = OperatorRelationId, OperatorOidIndexId, OPEROID, - -1, /* OPERNAMENSP also takes left and right type */ + SYSCACHEID_INVALID, /* OPERNAMENSP also takes left and right type */ Anum_pg_operator_oid, Anum_pg_operator_oprname, Anum_pg_operator_oprnamespace, @@ -360,7 +360,7 @@ static const ObjectPropertyType ObjectProperty[] = OperatorFamilyRelationId, OpfamilyOidIndexId, OPFAMILYOID, - -1, /* OPFAMILYAMNAMENSP also takes opfmethod */ + SYSCACHEID_INVALID, /* OPFAMILYAMNAMENSP also takes opfmethod */ Anum_pg_opfamily_oid, Anum_pg_opfamily_opfname, Anum_pg_opfamily_opfnamespace, @@ -387,8 +387,8 @@ static const ObjectPropertyType ObjectProperty[] = "role membership", AuthMemRelationId, 
AuthMemOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_auth_members_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -401,8 +401,8 @@ static const ObjectPropertyType ObjectProperty[] = "rule", RewriteRelationId, RewriteOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_rewrite_oid, Anum_pg_rewrite_rulename, InvalidAttrNumber, @@ -444,7 +444,7 @@ static const ObjectPropertyType ObjectProperty[] = TableSpaceRelationId, TablespaceOidIndexId, TABLESPACEOID, - -1, + SYSCACHEID_INVALID, Anum_pg_tablespace_oid, Anum_pg_tablespace_spcname, InvalidAttrNumber, @@ -458,7 +458,7 @@ static const ObjectPropertyType ObjectProperty[] = TransformRelationId, TransformOidIndexId, TRFOID, - -1, + SYSCACHEID_INVALID, Anum_pg_transform_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -471,8 +471,8 @@ static const ObjectPropertyType ObjectProperty[] = "trigger", TriggerRelationId, TriggerOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_trigger_oid, Anum_pg_trigger_tgname, InvalidAttrNumber, @@ -485,8 +485,8 @@ static const ObjectPropertyType ObjectProperty[] = "policy", PolicyRelationId, PolicyOidIndexId, - -1, - -1, + SYSCACHEID_INVALID, + SYSCACHEID_INVALID, Anum_pg_policy_oid, Anum_pg_policy_polname, InvalidAttrNumber, @@ -626,7 +626,7 @@ static const ObjectPropertyType ObjectProperty[] = UserMappingRelationId, UserMappingOidIndexId, USERMAPPINGOID, - -1, + SYSCACHEID_INVALID, Anum_pg_user_mapping_oid, InvalidAttrNumber, InvalidAttrNumber, @@ -2583,7 +2583,7 @@ get_object_namespace(const ObjectAddress *address) /* Currently, we can only handle object types with system caches. */ cache = property->oid_catcache_id; - Assert(cache != -1); + Assert(cache != SYSCACHEID_INVALID); /* Fetch tuple from syscache and extract namespace attribute. */ tuple = SearchSysCache1(cache, ObjectIdGetDatum(address->objectId)); From ee642cccc43ca1a0ff4a4af2a457208b919af017 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 18 Feb 2026 09:58:38 +0900 Subject: [PATCH 125/147] Switch SysCacheIdentifier to a typedef enum The main purpose of this change is to allow an ABI checker to understand when the list of SysCacheIdentifier changes, by switching all the routine declarations that relied on a signed integer for a syscache ID to this new type. This is going to be useful in the long-term for versions newer than v19 so as we will be able to check when the list of values in SysCacheIdentifier is updated in a non-ABI compliant fashion. Most of the changes of this commit are due to the new definition of SyscacheCallbackFunction, where a SysCacheIdentifier is now required for the syscache ID. It is a mechanical change, still slightly invasive. There are more areas in the tree that could be improved with an ABI checker in mind; this takes care of only one area. 
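As a sketch of what callback code looks like after this change
(my_callback and register_my_callback are invented examples; the
registration API is the existing CacheRegisterSyscacheCallback()):

    #include "postgres.h"
    #include "utils/inval.h"
    #include "utils/syscache.h"

    /* callbacks now spell out the enum type instead of a bare int */
    static void
    my_callback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue)
    {
        if (cacheid == PROCOID)
        {
            /* invalidate whatever local state depends on pg_proc */
        }
    }

    static void
    register_my_callback(void)
    {
        CacheRegisterSyscacheCallback(PROCOID, my_callback, (Datum) 0);
    }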
Reported-by: Tom Lane Author: Andreas Karlsson Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/289125.1770913057@sss.pgh.pa.us --- contrib/postgres_fdw/connection.c | 5 +-- contrib/postgres_fdw/shippable.c | 3 +- src/backend/catalog/aclchk.c | 10 +++--- src/backend/catalog/dependency.c | 2 +- src/backend/catalog/genbki.pl | 4 +-- src/backend/catalog/namespace.c | 5 +-- src/backend/catalog/objectaddress.c | 17 ++++++----- src/backend/commands/alter.c | 8 ++--- src/backend/commands/extension.c | 5 +-- src/backend/optimizer/util/predtest.c | 6 ++-- src/backend/parser/parse_oper.c | 6 ++-- src/backend/replication/logical/syncutils.c | 3 +- src/backend/replication/logical/worker.c | 2 +- src/backend/replication/pgoutput/pgoutput.c | 10 +++--- src/backend/utils/adt/acl.c | 6 ++-- src/backend/utils/adt/ri_triggers.c | 6 ++-- src/backend/utils/cache/attoptcache.c | 3 +- src/backend/utils/cache/evtcache.c | 6 ++-- src/backend/utils/cache/inval.c | 4 +-- src/backend/utils/cache/plancache.c | 10 +++--- src/backend/utils/cache/spccache.c | 3 +- src/backend/utils/cache/syscache.c | 34 ++++++++++----------- src/backend/utils/cache/ts_cache.c | 2 +- src/backend/utils/cache/typcache.c | 15 +++++---- src/backend/utils/misc/superuser.c | 5 +-- src/include/catalog/objectaddress.h | 5 +-- src/include/replication/worker_internal.h | 3 +- src/include/utils/inval.h | 8 +++-- src/include/utils/syscache.h | 30 +++++++++--------- 29 files changed, 128 insertions(+), 98 deletions(-) diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 487a1a2317..add673a477 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -150,7 +150,8 @@ static void pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid, SubTransactionId parentSubid, void *arg); -static void pgfdw_inval_callback(Datum arg, int cacheid, uint32 hashvalue); +static void pgfdw_inval_callback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); static void pgfdw_reject_incomplete_xact_state_change(ConnCacheEntry *entry); static void pgfdw_reset_xact_state(ConnCacheEntry *entry, bool toplevel); static bool pgfdw_cancel_query(PGconn *conn); @@ -1309,7 +1310,7 @@ pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid, * individual option values, but it seems too much effort for the gain. */ static void -pgfdw_inval_callback(Datum arg, int cacheid, uint32 hashvalue) +pgfdw_inval_callback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { HASH_SEQ_STATUS scan; ConnCacheEntry *entry; diff --git a/contrib/postgres_fdw/shippable.c b/contrib/postgres_fdw/shippable.c index d32d3d0e46..250f54fea3 100644 --- a/contrib/postgres_fdw/shippable.c +++ b/contrib/postgres_fdw/shippable.c @@ -62,7 +62,8 @@ typedef struct * made for them, however. 
*/ static void -InvalidateShippableCacheCallback(Datum arg, int cacheid, uint32 hashvalue) +InvalidateShippableCacheCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { HASH_SEQ_STATUS status; ShippableCacheEntry *entry; diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index 56060ca544..aef855abcc 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -2115,7 +2115,7 @@ static void ExecGrant_common(InternalGrant *istmt, Oid classid, AclMode default_privs, void (*object_check) (InternalGrant *istmt, HeapTuple tuple)) { - int cacheid; + SysCacheIdentifier cacheid; Relation relation; ListCell *cell; @@ -3058,7 +3058,7 @@ object_aclmask_ext(Oid classid, Oid objectid, Oid roleid, AclMode mask, AclMaskHow how, bool *is_missing) { - int cacheid; + SysCacheIdentifier cacheid; AclMode result; HeapTuple tuple; Datum aclDatum; @@ -4089,7 +4089,7 @@ pg_largeobject_aclcheck_snapshot(Oid lobj_oid, Oid roleid, AclMode mode, bool object_ownercheck(Oid classid, Oid objectid, Oid roleid) { - int cacheid; + SysCacheIdentifier cacheid; Oid ownerId; /* Superusers bypass all permission checking. */ @@ -4486,7 +4486,7 @@ recordExtObjInitPriv(Oid objoid, Oid classoid) /* This will error on unsupported classoid. */ else if (get_object_attnum_acl(classoid) != InvalidAttrNumber) { - int cacheid; + SysCacheIdentifier cacheid; Datum aclDatum; bool isNull; HeapTuple tuple; @@ -4870,7 +4870,7 @@ RemoveRoleFromInitPriv(Oid roleid, Oid classid, Oid objid, int32 objsubid) ScanKeyData key[3]; SysScanDesc scan; HeapTuple oldtuple; - int cacheid; + SysCacheIdentifier cacheid; HeapTuple objtuple; Oid ownerId; Datum oldAclDatum; diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index f89267f034..7564965fa1 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -1238,7 +1238,7 @@ reportDependentObjects(const ObjectAddresses *targetObjects, static void DropObjectById(const ObjectAddress *object) { - int cacheId; + SysCacheIdentifier cacheId; Relation rel; HeapTuple tup; diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index 975cc53435..48c6805f75 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -795,7 +795,7 @@ # Now generate syscache info print_boilerplate($syscache_ids_fh, "syscache_ids.h", "SysCache identifiers"); -print $syscache_ids_fh "enum SysCacheIdentifier +print $syscache_ids_fh "typedef enum SysCacheIdentifier { \tSYSCACHEID_INVALID = -1,\n"; @@ -832,7 +832,7 @@ print $syscache_info_fh "\t},\n"; } -print $syscache_ids_fh "};\n"; +print $syscache_ids_fh "} SysCacheIdentifier;\n"; print $syscache_ids_fh "#define SysCacheSize ($last_syscache + 1)\n"; print $syscache_info_fh "};\n"; diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index c3b79a2ba4..4b0f4ba115 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -229,7 +229,8 @@ static void AccessTempTableNamespace(bool force); static void InitTempTableNamespace(void); static void RemoveTempRelations(Oid tempNamespaceId); static void RemoveTempRelationsCallback(int code, Datum arg); -static void InvalidationCallback(Datum arg, int cacheid, uint32 hashvalue); +static void InvalidationCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, bool include_out_arguments, int pronargs, int **argnumbers, int *fgc_flags); @@ -4863,7 +4864,7 @@ 
InitializeSearchPath(void) * Syscache inval callback function */ static void -InvalidationCallback(Datum arg, int cacheid, uint32 hashvalue) +InvalidationCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { /* * Force search path to be recomputed on next use, also invalidating the diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 33be619090..13d73f8909 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -99,10 +99,11 @@ typedef struct * error messages */ Oid class_oid; /* oid of catalog */ Oid oid_index_oid; /* oid of index on system oid column */ - int oid_catcache_id; /* id of catcache on system oid column */ - int name_catcache_id; /* id of catcache on (name,namespace), or - * (name) if the object does not live in a - * namespace */ + SysCacheIdentifier oid_catcache_id; /* id of catcache on system oid column */ + SysCacheIdentifier name_catcache_id; /* id of catcache on + * (name,namespace), or (name) if + * the object does not live in a + * namespace */ AttrNumber attnum_oid; /* attribute number of oid column */ AttrNumber attnum_name; /* attnum of name field */ AttrNumber attnum_namespace; /* attnum of namespace field */ @@ -2571,7 +2572,7 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, Oid get_object_namespace(const ObjectAddress *address) { - int cache; + SysCacheIdentifier cache; HeapTuple tuple; Oid oid; const ObjectPropertyType *property; @@ -2640,7 +2641,7 @@ get_object_oid_index(Oid class_id) return prop->oid_index_oid; } -int +SysCacheIdentifier get_object_catcache_oid(Oid class_id) { const ObjectPropertyType *prop = get_object_property_data(class_id); @@ -2648,7 +2649,7 @@ get_object_catcache_oid(Oid class_id) return prop->oid_catcache_id; } -int +SysCacheIdentifier get_object_catcache_name(Oid class_id) { const ObjectPropertyType *prop = get_object_property_data(class_id); @@ -2806,7 +2807,7 @@ get_catalog_object_by_oid_extended(Relation catalog, { HeapTuple tuple; Oid classId = RelationGetRelid(catalog); - int oidCacheId = get_object_catcache_oid(classId); + SysCacheIdentifier oidCacheId = get_object_catcache_oid(classId); if (oidCacheId >= 0) { diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index 08957104c7..c6f58d47be 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -159,8 +159,8 @@ static void AlterObjectRename_internal(Relation rel, Oid objectId, const char *new_name) { Oid classId = RelationGetRelid(rel); - int oidCacheId = get_object_catcache_oid(classId); - int nameCacheId = get_object_catcache_name(classId); + SysCacheIdentifier oidCacheId = get_object_catcache_oid(classId); + SysCacheIdentifier nameCacheId = get_object_catcache_name(classId); AttrNumber Anum_name = get_object_attnum_name(classId); AttrNumber Anum_namespace = get_object_attnum_namespace(classId); AttrNumber Anum_owner = get_object_attnum_owner(classId); @@ -686,8 +686,8 @@ static Oid AlterObjectNamespace_internal(Relation rel, Oid objid, Oid nspOid) { Oid classId = RelationGetRelid(rel); - int oidCacheId = get_object_catcache_oid(classId); - int nameCacheId = get_object_catcache_name(classId); + SysCacheIdentifier oidCacheId = get_object_catcache_oid(classId); + SysCacheIdentifier nameCacheId = get_object_catcache_name(classId); AttrNumber Anum_name = get_object_attnum_name(classId); AttrNumber Anum_namespace = get_object_attnum_namespace(classId); AttrNumber Anum_owner = get_object_attnum_owner(classId); diff 
--git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 81f24615d5..574858bfec 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -162,7 +162,8 @@ typedef struct ExtensionSiblingCache static ExtensionSiblingCache *ext_sibling_list = NULL; /* Local functions */ -static void ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue); +static void ext_sibling_callback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); static List *find_update_path(List *evi_list, ExtensionVersionInfo *evi_start, ExtensionVersionInfo *evi_target, @@ -379,7 +380,7 @@ get_function_sibling_type(Oid funcoid, const char *typname) * looking for, could change without an extension update or drop/recreate. */ static void -ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue) +ext_sibling_callback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { ExtensionSiblingCache *cache_entry; diff --git a/src/backend/optimizer/util/predtest.c b/src/backend/optimizer/util/predtest.c index 26858d1d2b..fe15881af4 100644 --- a/src/backend/optimizer/util/predtest.c +++ b/src/backend/optimizer/util/predtest.c @@ -109,7 +109,8 @@ static bool operator_same_subexprs_proof(Oid pred_op, Oid clause_op, static bool operator_same_subexprs_lookup(Oid pred_op, Oid clause_op, bool refute_it); static Oid get_btree_test_op(Oid pred_op, Oid clause_op, bool refute_it); -static void InvalidateOprProofCacheCallBack(Datum arg, int cacheid, uint32 hashvalue); +static void InvalidateOprProofCacheCallBack(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); /* @@ -2343,7 +2344,8 @@ get_btree_test_op(Oid pred_op, Oid clause_op, bool refute_it) * Callback for pg_amop inval events */ static void -InvalidateOprProofCacheCallBack(Datum arg, int cacheid, uint32 hashvalue) +InvalidateOprProofCacheCallBack(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { HASH_SEQ_STATUS status; OprProofCacheEntry *hentry; diff --git a/src/backend/parser/parse_oper.c b/src/backend/parser/parse_oper.c index 768e4cff9c..a6b402f2d7 100644 --- a/src/backend/parser/parse_oper.c +++ b/src/backend/parser/parse_oper.c @@ -79,7 +79,8 @@ static bool make_oper_cache_key(ParseState *pstate, OprCacheKey *key, int location); static Oid find_oper_cache_entry(OprCacheKey *key); static void make_oper_cache_entry(OprCacheKey *key, Oid opr_oid); -static void InvalidateOprCacheCallBack(Datum arg, int cacheid, uint32 hashvalue); +static void InvalidateOprCacheCallBack(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); /* @@ -1076,7 +1077,8 @@ make_oper_cache_entry(OprCacheKey *key, Oid opr_oid) * Callback for pg_operator and pg_cast inval events */ static void -InvalidateOprCacheCallBack(Datum arg, int cacheid, uint32 hashvalue) +InvalidateOprCacheCallBack(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { HASH_SEQ_STATUS status; OprCacheEntry *hentry; diff --git a/src/backend/replication/logical/syncutils.c b/src/backend/replication/logical/syncutils.c index 535ffb6f09..ef61ca0437 100644 --- a/src/backend/replication/logical/syncutils.c +++ b/src/backend/replication/logical/syncutils.c @@ -98,7 +98,8 @@ FinishSyncWorker(void) * Callback from syscache invalidation. 
*/ void -InvalidateSyncingRelStates(Datum arg, int cacheid, uint32 hashvalue) +InvalidateSyncingRelStates(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { relation_states_validity = SYNC_RELATIONS_STATE_NEEDS_REBUILD; } diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 32725c4862..8b93f48470 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -5164,7 +5164,7 @@ maybe_reread_subscription(void) * Callback from subscription syscache invalidation. */ static void -subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue) +subscription_change_cb(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { MySubscriptionValid = false; } diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index e016f64e0b..7a49185d29 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -86,7 +86,7 @@ static void pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx, static bool publications_valid; static List *LoadPublications(List *pubnames); -static void publication_invalidation_cb(Datum arg, int cacheid, +static void publication_invalidation_cb(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue); static void send_repl_origin(LogicalDecodingContext *ctx, ReplOriginId origin_id, XLogRecPtr origin_lsn, @@ -227,7 +227,7 @@ static void send_relation_and_attrs(Relation relation, TransactionId xid, LogicalDecodingContext *ctx, RelationSyncEntry *relentry); static void rel_sync_cache_relation_cb(Datum arg, Oid relid); -static void rel_sync_cache_publication_cb(Datum arg, int cacheid, +static void rel_sync_cache_publication_cb(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue); static void set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid); @@ -1828,7 +1828,8 @@ LoadPublications(List *pubnames) * Called for invalidations on pg_publication. */ static void -publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue) +publication_invalidation_cb(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { publications_valid = false; } @@ -2431,7 +2432,8 @@ rel_sync_cache_relation_cb(Datum arg, Oid relid) * Called for invalidations on pg_namespace. 
*/ static void -rel_sync_cache_publication_cb(Datum arg, int cacheid, uint32 hashvalue) +rel_sync_cache_publication_cb(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { HASH_SEQ_STATUS status; RelationSyncEntry *entry; diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index 3a6905f954..641673f0b0 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -130,7 +130,8 @@ static AclMode convert_largeobject_priv_string(text *priv_type_text); static AclMode convert_role_priv_string(text *priv_type_text); static AclResult pg_role_aclcheck(Oid role_oid, Oid roleid, AclMode mode); -static void RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue); +static void RoleMembershipCacheCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); /* @@ -5067,7 +5068,8 @@ initialize_acl(void) * Syscache inval callback function */ static void -RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue) +RoleMembershipCacheCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { if (cacheid == DATABASEOID && hashvalue != cached_db_hash && diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index bbadecef5f..d22b8ef7f3 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -213,7 +213,8 @@ static bool ri_CompareWithCast(Oid eq_opr, Oid typeid, Oid collid, Datum lhs, Datum rhs); static void ri_InitHashTables(void); -static void InvalidateConstraintCacheCallBack(Datum arg, int cacheid, uint32 hashvalue); +static void InvalidateConstraintCacheCallBack(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); static SPIPlanPtr ri_FetchPreparedPlan(RI_QueryKey *key); static void ri_HashPreparedPlan(RI_QueryKey *key, SPIPlanPtr plan); static RI_CompareHashEntry *ri_HashCompareOp(Oid eq_opr, Oid typeid); @@ -2397,7 +2398,8 @@ get_ri_constraint_root(Oid constrOid) * data from changing under it --- but we may get cache flushes anyway.) */ static void -InvalidateConstraintCacheCallBack(Datum arg, int cacheid, uint32 hashvalue) +InvalidateConstraintCacheCallBack(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { dlist_mutable_iter iter; diff --git a/src/backend/utils/cache/attoptcache.c b/src/backend/utils/cache/attoptcache.c index 72edc8f665..9244a23013 100644 --- a/src/backend/utils/cache/attoptcache.c +++ b/src/backend/utils/cache/attoptcache.c @@ -50,7 +50,8 @@ typedef struct * for that attribute. */ static void -InvalidateAttoptCacheCallback(Datum arg, int cacheid, uint32 hashvalue) +InvalidateAttoptCacheCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { HASH_SEQ_STATUS status; AttoptCacheEntry *attopt; diff --git a/src/backend/utils/cache/evtcache.c b/src/backend/utils/cache/evtcache.c index 2b4453e54a..3fe89c9c98 100644 --- a/src/backend/utils/cache/evtcache.c +++ b/src/backend/utils/cache/evtcache.c @@ -49,7 +49,8 @@ static EventTriggerCacheStateType EventTriggerCacheState = ETCS_NEEDS_REBUILD; static void BuildEventTriggerCache(void); static void InvalidateEventCacheCallback(Datum arg, - int cacheid, uint32 hashvalue); + SysCacheIdentifier cacheid, + uint32 hashvalue); static Bitmapset *DecodeTextArrayToBitmapset(Datum array); /* @@ -254,7 +255,8 @@ DecodeTextArrayToBitmapset(Datum array) * memory leaks. 
*/ static void -InvalidateEventCacheCallback(Datum arg, int cacheid, uint32 hashvalue) +InvalidateEventCacheCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { /* * If the cache isn't valid, then there might be a rebuild in progress, so diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index bf465a295e..d59216b28f 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1813,7 +1813,7 @@ CacheInvalidateRelmap(Oid databaseId) * flush all cached state anyway. */ void -CacheRegisterSyscacheCallback(int cacheid, +CacheRegisterSyscacheCallback(SysCacheIdentifier cacheid, SyscacheCallbackFunction func, Datum arg) { @@ -1895,7 +1895,7 @@ CacheRegisterRelSyncCallback(RelSyncCallbackFunction func, * this module from knowing which catcache IDs correspond to which catalogs. */ void -CallSyscacheCallbacks(int cacheid, uint32 hashvalue) +CallSyscacheCallbacks(SysCacheIdentifier cacheid, uint32 hashvalue) { int i; diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 37d5d73b7f..812e226573 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -106,8 +106,10 @@ static void ScanQueryForLocks(Query *parsetree, bool acquire); static bool ScanQueryWalker(Node *node, bool *acquire); static TupleDesc PlanCacheComputeResultDesc(List *stmt_list); static void PlanCacheRelCallback(Datum arg, Oid relid); -static void PlanCacheObjectCallback(Datum arg, int cacheid, uint32 hashvalue); -static void PlanCacheSysCallback(Datum arg, int cacheid, uint32 hashvalue); +static void PlanCacheObjectCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); +static void PlanCacheSysCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); /* ResourceOwner callbacks to track plancache references */ static void ResOwnerReleaseCachedPlan(Datum res); @@ -2201,7 +2203,7 @@ PlanCacheRelCallback(Datum arg, Oid relid) * or all plans mentioning any member of this cache if hashvalue == 0. */ static void -PlanCacheObjectCallback(Datum arg, int cacheid, uint32 hashvalue) +PlanCacheObjectCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { dlist_iter iter; @@ -2310,7 +2312,7 @@ PlanCacheObjectCallback(Datum arg, int cacheid, uint32 hashvalue) * Just invalidate everything... */ static void -PlanCacheSysCallback(Datum arg, int cacheid, uint32 hashvalue) +PlanCacheSysCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { ResetPlanCache(); } diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c index 8f1a5e6959..362169b7d9 100644 --- a/src/backend/utils/cache/spccache.c +++ b/src/backend/utils/cache/spccache.c @@ -52,7 +52,8 @@ typedef struct * tablespaces, nor do we expect them to be frequently modified. 
*/ static void -InvalidateTableSpaceCacheCallback(Datum arg, int cacheid, uint32 hashvalue) +InvalidateTableSpaceCacheCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue) { HASH_SEQ_STATUS status; TableSpaceCacheEntry *spc; diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index ae3d18e0e7..007a9a15d7 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -109,7 +109,7 @@ static int oid_compare(const void *a, const void *b); void InitCatalogCache(void) { - int cacheId; + SysCacheIdentifier cacheId; Assert(!CacheInitialized); @@ -179,7 +179,7 @@ InitCatalogCache(void) void InitCatalogCachePhase2(void) { - int cacheId; + SysCacheIdentifier cacheId; Assert(CacheInitialized); @@ -205,7 +205,7 @@ InitCatalogCachePhase2(void) * CAUTION: The tuple that is returned must NOT be freed by the caller! */ HeapTuple -SearchSysCache(int cacheId, +SearchSysCache(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, @@ -217,7 +217,7 @@ SearchSysCache(int cacheId, } HeapTuple -SearchSysCache1(int cacheId, +SearchSysCache1(SysCacheIdentifier cacheId, Datum key1) { Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); @@ -227,7 +227,7 @@ SearchSysCache1(int cacheId, } HeapTuple -SearchSysCache2(int cacheId, +SearchSysCache2(SysCacheIdentifier cacheId, Datum key1, Datum key2) { Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); @@ -237,7 +237,7 @@ SearchSysCache2(int cacheId, } HeapTuple -SearchSysCache3(int cacheId, +SearchSysCache3(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3) { Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); @@ -247,7 +247,7 @@ SearchSysCache3(int cacheId, } HeapTuple -SearchSysCache4(int cacheId, +SearchSysCache4(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, Datum key4) { Assert(cacheId >= 0 && cacheId < SysCacheSize && SysCache[cacheId]); @@ -279,7 +279,7 @@ ReleaseSysCache(HeapTuple tuple) * doesn't prevent the "tuple concurrently updated" error. */ HeapTuple -SearchSysCacheLocked1(int cacheId, +SearchSysCacheLocked1(SysCacheIdentifier cacheId, Datum key1) { CatCache *cache = SysCache[cacheId]; @@ -371,7 +371,7 @@ SearchSysCacheLocked1(int cacheId, * heap_freetuple() the result when done with it. */ HeapTuple -SearchSysCacheCopy(int cacheId, +SearchSysCacheCopy(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, @@ -396,7 +396,7 @@ SearchSysCacheCopy(int cacheId, * heap_freetuple(). */ HeapTuple -SearchSysCacheLockedCopy1(int cacheId, +SearchSysCacheLockedCopy1(SysCacheIdentifier cacheId, Datum key1) { HeapTuple tuple, @@ -417,7 +417,7 @@ SearchSysCacheLockedCopy1(int cacheId, * No lock is retained on the syscache entry. */ bool -SearchSysCacheExists(int cacheId, +SearchSysCacheExists(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, @@ -440,7 +440,7 @@ SearchSysCacheExists(int cacheId, * No lock is retained on the syscache entry. */ Oid -GetSysCacheOid(int cacheId, +GetSysCacheOid(SysCacheIdentifier cacheId, AttrNumber oidcol, Datum key1, Datum key2, @@ -592,7 +592,7 @@ SearchSysCacheCopyAttNum(Oid relid, int16 attnum) * a different cache for the same catalog the tuple was fetched from. */ Datum -SysCacheGetAttr(int cacheId, HeapTuple tup, +SysCacheGetAttr(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull) { @@ -622,7 +622,7 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, * be NULL. 
*/ Datum -SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, +SysCacheGetAttrNotNull(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber) { bool isnull; @@ -652,7 +652,7 @@ SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, * catcache code that need to be able to compute the hash values. */ uint32 -GetSysCacheHashValue(int cacheId, +GetSysCacheHashValue(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, @@ -668,7 +668,7 @@ GetSysCacheHashValue(int cacheId, * List-search interface */ struct catclist * -SearchSysCacheList(int cacheId, int nkeys, +SearchSysCacheList(SysCacheIdentifier cacheId, int nkeys, Datum key1, Datum key2, Datum key3) { if (cacheId < 0 || cacheId >= SysCacheSize || !SysCache[cacheId]) @@ -687,7 +687,7 @@ SearchSysCacheList(int cacheId, int nkeys, * This routine is only quasi-public: it should only be used by inval.c. */ void -SysCacheInvalidate(int cacheId, uint32 hashValue) +SysCacheInvalidate(SysCacheIdentifier cacheId, uint32 hashValue) { if (cacheId < 0 || cacheId >= SysCacheSize) elog(ERROR, "invalid cache ID: %d", cacheId); diff --git a/src/backend/utils/cache/ts_cache.c b/src/backend/utils/cache/ts_cache.c index 71e49b2b91..744c8e71d7 100644 --- a/src/backend/utils/cache/ts_cache.c +++ b/src/backend/utils/cache/ts_cache.c @@ -91,7 +91,7 @@ static Oid TSCurrentConfigCache = InvalidOid; * table address as the "arg". */ static void -InvalidateTSCacheCallBack(Datum arg, int cacheid, uint32 hashvalue) +InvalidateTSCacheCallBack(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { HTAB *hash = (HTAB *) DatumGetPointer(arg); HASH_SEQ_STATUS status; diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index dc4b1a5641..627e534609 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -337,9 +337,12 @@ static bool multirange_element_has_hashing(TypeCacheEntry *typentry); static bool multirange_element_has_extended_hashing(TypeCacheEntry *typentry); static void cache_multirange_element_properties(TypeCacheEntry *typentry); static void TypeCacheRelCallback(Datum arg, Oid relid); -static void TypeCacheTypCallback(Datum arg, int cacheid, uint32 hashvalue); -static void TypeCacheOpcCallback(Datum arg, int cacheid, uint32 hashvalue); -static void TypeCacheConstrCallback(Datum arg, int cacheid, uint32 hashvalue); +static void TypeCacheTypCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); +static void TypeCacheOpcCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); +static void TypeCacheConstrCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); static void load_enum_cache_data(TypeCacheEntry *tcache); static EnumItem *find_enumitem(TypeCacheEnumData *enumdata, Oid arg); static int enum_oid_cmp(const void *left, const void *right); @@ -2512,7 +2515,7 @@ TypeCacheRelCallback(Datum arg, Oid relid) * it as needing to be reloaded. */ static void -TypeCacheTypCallback(Datum arg, int cacheid, uint32 hashvalue) +TypeCacheTypCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { HASH_SEQ_STATUS status; TypeCacheEntry *typentry; @@ -2569,7 +2572,7 @@ TypeCacheTypCallback(Datum arg, int cacheid, uint32 hashvalue) * of members are not going to get cached here. 
*/ static void -TypeCacheOpcCallback(Datum arg, int cacheid, uint32 hashvalue) +TypeCacheOpcCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { HASH_SEQ_STATUS status; TypeCacheEntry *typentry; @@ -2607,7 +2610,7 @@ TypeCacheOpcCallback(Datum arg, int cacheid, uint32 hashvalue) * approach to domain constraints. */ static void -TypeCacheConstrCallback(Datum arg, int cacheid, uint32 hashvalue) +TypeCacheConstrCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { TypeCacheEntry *typentry; diff --git a/src/backend/utils/misc/superuser.c b/src/backend/utils/misc/superuser.c index 7821624687..b9c3a0ceaa 100644 --- a/src/backend/utils/misc/superuser.c +++ b/src/backend/utils/misc/superuser.c @@ -36,7 +36,8 @@ static Oid last_roleid = InvalidOid; /* InvalidOid == cache not valid */ static bool last_roleid_is_super = false; static bool roleid_callback_registered = false; -static void RoleidCallback(Datum arg, int cacheid, uint32 hashvalue); +static void RoleidCallback(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); /* @@ -100,7 +101,7 @@ superuser_arg(Oid roleid) * Syscache inval callback function */ static void -RoleidCallback(Datum arg, int cacheid, uint32 hashvalue) +RoleidCallback(Datum arg, SysCacheIdentifier cacheid, uint32 hashvalue) { /* Invalidate our local cache in case role's superuserness changed */ last_roleid = InvalidOid; diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index e2fe9db116..b549be2d52 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -17,6 +17,7 @@ #include "nodes/parsenodes.h" #include "storage/lockdefs.h" #include "utils/relcache.h" +#include "utils/syscache.h" /* * An ObjectAddress represents a database object of any type. 
@@ -57,8 +58,8 @@ extern Oid get_object_namespace(const ObjectAddress *address); extern bool is_objectclass_supported(Oid class_id); extern const char *get_object_class_descr(Oid class_id); extern Oid get_object_oid_index(Oid class_id); -extern int get_object_catcache_oid(Oid class_id); -extern int get_object_catcache_name(Oid class_id); +extern SysCacheIdentifier get_object_catcache_oid(Oid class_id); +extern SysCacheIdentifier get_object_catcache_name(Oid class_id); extern AttrNumber get_object_attnum_oid(Oid class_id); extern AttrNumber get_object_attnum_name(Oid class_id); extern AttrNumber get_object_attnum_namespace(Oid class_id); diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index c1285fdd1b..33fb7f552b 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -289,7 +289,8 @@ extern void ProcessSyncingTablesForApply(XLogRecPtr current_lsn); extern void ProcessSequencesForSync(void); pg_noreturn extern void FinishSyncWorker(void); -extern void InvalidateSyncingRelStates(Datum arg, int cacheid, uint32 hashvalue); +extern void InvalidateSyncingRelStates(Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); extern void launch_sync_worker(LogicalRepWorkerType wtype, int nsyncworkers, Oid relid, TimestampTz *last_start_time); extern void ProcessSyncingRelations(XLogRecPtr current_lsn); diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 0e937fec9e..5f64fb2047 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -17,6 +17,7 @@ #include "access/htup.h" #include "storage/relfilelocator.h" #include "utils/relcache.h" +#include "utils/syscache.h" extern PGDLLIMPORT int debug_discard_caches; @@ -38,7 +39,8 @@ extern PGDLLIMPORT int debug_discard_caches; #endif /* not DISCARD_CACHES_ENABLED */ -typedef void (*SyscacheCallbackFunction) (Datum arg, int cacheid, uint32 hashvalue); +typedef void (*SyscacheCallbackFunction) (Datum arg, SysCacheIdentifier cacheid, + uint32 hashvalue); typedef void (*RelcacheCallbackFunction) (Datum arg, Oid relid); typedef void (*RelSyncCallbackFunction) (Datum arg, Oid relid); @@ -81,7 +83,7 @@ extern void CacheInvalidateSmgr(RelFileLocatorBackend rlocator); extern void CacheInvalidateRelmap(Oid databaseId); -extern void CacheRegisterSyscacheCallback(int cacheid, +extern void CacheRegisterSyscacheCallback(SysCacheIdentifier cacheid, SyscacheCallbackFunction func, Datum arg); @@ -91,7 +93,7 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, extern void CacheRegisterRelSyncCallback(RelSyncCallbackFunction func, Datum arg); -extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue); +extern void CallSyscacheCallbacks(SysCacheIdentifier cacheid, uint32 hashvalue); extern void CallRelSyncCallbacks(Oid relid); diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 13f49af9ed..81e5933708 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -25,35 +25,35 @@ extern void InitCatalogCache(void); extern void InitCatalogCachePhase2(void); -extern HeapTuple SearchSysCache(int cacheId, +extern HeapTuple SearchSysCache(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, Datum key4); /* * The use of argument specific numbers is encouraged. They're faster, and * insulates the caller from changes in the maximum number of keys. 
*/ -extern HeapTuple SearchSysCache1(int cacheId, +extern HeapTuple SearchSysCache1(SysCacheIdentifier cacheId, Datum key1); -extern HeapTuple SearchSysCache2(int cacheId, +extern HeapTuple SearchSysCache2(SysCacheIdentifier cacheId, Datum key1, Datum key2); -extern HeapTuple SearchSysCache3(int cacheId, +extern HeapTuple SearchSysCache3(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3); -extern HeapTuple SearchSysCache4(int cacheId, +extern HeapTuple SearchSysCache4(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, Datum key4); extern void ReleaseSysCache(HeapTuple tuple); -extern HeapTuple SearchSysCacheLocked1(int cacheId, +extern HeapTuple SearchSysCacheLocked1(SysCacheIdentifier cacheId, Datum key1); /* convenience routines */ -extern HeapTuple SearchSysCacheCopy(int cacheId, +extern HeapTuple SearchSysCacheCopy(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, Datum key4); -extern HeapTuple SearchSysCacheLockedCopy1(int cacheId, +extern HeapTuple SearchSysCacheLockedCopy1(SysCacheIdentifier cacheId, Datum key1); -extern bool SearchSysCacheExists(int cacheId, +extern bool SearchSysCacheExists(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, Datum key4); -extern Oid GetSysCacheOid(int cacheId, AttrNumber oidcol, +extern Oid GetSysCacheOid(SysCacheIdentifier cacheId, AttrNumber oidcol, Datum key1, Datum key2, Datum key3, Datum key4); extern HeapTuple SearchSysCacheAttName(Oid relid, const char *attname); @@ -63,21 +63,21 @@ extern bool SearchSysCacheExistsAttName(Oid relid, const char *attname); extern HeapTuple SearchSysCacheAttNum(Oid relid, int16 attnum); extern HeapTuple SearchSysCacheCopyAttNum(Oid relid, int16 attnum); -extern Datum SysCacheGetAttr(int cacheId, HeapTuple tup, +extern Datum SysCacheGetAttr(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull); -extern Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, +extern Datum SysCacheGetAttrNotNull(SysCacheIdentifier cacheId, HeapTuple tup, AttrNumber attributeNumber); -extern uint32 GetSysCacheHashValue(int cacheId, +extern uint32 GetSysCacheHashValue(SysCacheIdentifier cacheId, Datum key1, Datum key2, Datum key3, Datum key4); /* list-search interface. Users of this must import catcache.h too */ struct catclist; -extern struct catclist *SearchSysCacheList(int cacheId, int nkeys, +extern struct catclist *SearchSysCacheList(SysCacheIdentifier cacheId, int nkeys, Datum key1, Datum key2, Datum key3); -extern void SysCacheInvalidate(int cacheId, uint32 hashValue); +extern void SysCacheInvalidate(SysCacheIdentifier cacheId, uint32 hashValue); extern bool RelationInvalidatesSnapshotsOnly(Oid relid); extern bool RelationHasSysCache(Oid relid); From 623a90c2ade60ae52bf19495d8670c9b72de4299 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 18 Feb 2026 16:07:13 +0900 Subject: [PATCH 126/147] Force creation of stamp file after libpq library check in meson builds Previously, if --stamp_file was specified, libpq_check.pl would create a new stamp file only if one did not already exist. If there was already a stamp file, the script would do nothing, leaving the previous stamp file in place. This logic could cause unnecessary rebuilds because meson relies on the timestamps of output files to determine whether a rebuild should happen. In this case, a stamp file generated during an older check would be kept, but we need a stamp file from the latest run of the libpq check so that correct rebuild decisions can be made.
This commit changes libpq_check.pl so that a fresh stamp file is created each time the script is run when --stamp_file is specified. Oversight in commit 4a8e6f43a6b5. Reported-by: Andres Freund Author: Nazir Bilal Yavuz Reviewed-by: VASUKI M Discussion: https://postgr.es/m/CAN55FZ22rrN6gCn7urtmTR=_5z7ArZLUJu-TsMChdXwmRTaquA@mail.gmail.com --- src/interfaces/libpq/libpq_check.pl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/interfaces/libpq/libpq_check.pl b/src/interfaces/libpq/libpq_check.pl index 8a2e29b2d9..833f5315c3 100755 --- a/src/interfaces/libpq/libpq_check.pl +++ b/src/interfaces/libpq/libpq_check.pl @@ -31,12 +31,9 @@ sub create_stamp_file { - if (!(-f $stamp_file)) - { - open my $fh, '>', $stamp_file - or die "can't open $stamp_file: $!"; - close $fh; - } + open my $fh, '>', $stamp_file + or die "can't open $stamp_file: $!"; + close $fh; } # Skip on Windows and Solaris From 3894f08abe490d31518d038b0a8eaa46ce971b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Wed, 18 Feb 2026 18:09:54 +0100 Subject: [PATCH 127/147] Update obsolete comment table_tuple_update's update_indexes argument hasn't been a boolean since commit 19d8e2308bc5. Backpatch-through: 16 --- src/include/access/tableam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 251379016b..119593b7b4 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1503,8 +1503,8 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * slot - newly constructed tuple data to store * tmfd - filled in failure cases (see below) * lockmode - filled with lock mode acquired on tuple - * update_indexes - in success cases this is set to true if new index entries - * are required for this tuple + * update_indexes - in success cases this is set if new index entries + * are required for this tuple; see TU_UpdateIndexes * * Normal, successful return value is TM_Ok, which means we did actually * update it. Failure return codes are TM_SelfModified, TM_Updated, and From d62dca3b297413a11a594c9675f2fb22e931d01b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Feb 2026 19:59:34 +0200 Subject: [PATCH 128/147] Use standard die() handler for SIGTERM in bgworkers The previous default bgworker_die() signal handler would exit with elog(FATAL) directly from the signal handler. That could cause deadlocks or crashes if the signal handler runs while we're e.g. holding a spinlock or in the middle of a memory allocation. All the built-in background workers overrode that to use the normal die() handler and CHECK_FOR_INTERRUPTS(). Let's make that the default for all background workers. Some extensions relying on the old behavior might need to adapt, but the new default is much safer and is the right thing to do for most background workers.
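For illustration, a minimal main loop for a worker under the new default
might look like the sketch below (not part of this commit;
my_worker_main is a hypothetical extension entry point):

    #include "postgres.h"
    #include "miscadmin.h"
    #include "postmaster/bgworker.h"
    #include "storage/latch.h"
    #include "utils/wait_event.h"

    void
    my_worker_main(Datum main_arg)
    {
        /* SIGTERM already points at the standard die() handler */
        BackgroundWorkerUnblockSignals();

        for (;;)
        {
            /* die() merely sets a flag; SIGTERM takes effect here */
            CHECK_FOR_INTERRUPTS();

            /* ... do one unit of work ... */

            /* sleep up to 1s, or until the latch is set */
            (void) WaitLatch(MyLatch,
                             WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                             1000L, PG_WAIT_EXTENSION);
            ResetLatch(MyLatch);
        }
    }

Sleeping in WaitLatch() rather than pg_usleep() keeps such a worker
responsive, since die() also sets the process latch and so wakes the
loop immediately on SIGTERM.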
Reviewed-by: Nathan Bossart Reviewed-by: Kirill Reshke Discussion: https://www.postgresql.org/message-id/5238fe45-e486-4c62-a7f3-c7d8d416e812@iki.fi --- doc/src/sgml/bgworker.sgml | 10 ++++++++++ src/backend/access/transam/parallel.c | 1 - src/backend/postmaster/bgworker.c | 16 +--------------- .../replication/logical/applyparallelworker.c | 1 - src/backend/replication/logical/launcher.c | 1 - src/backend/replication/logical/worker.c | 1 - src/test/modules/test_shm_mq/worker.c | 8 +------- 7 files changed, 12 insertions(+), 26 deletions(-) diff --git a/doc/src/sgml/bgworker.sgml b/doc/src/sgml/bgworker.sgml index 4699ef6345..2affba7438 100644 --- a/doc/src/sgml/bgworker.sgml +++ b/doc/src/sgml/bgworker.sgml @@ -232,6 +232,8 @@ typedef struct BackgroundWorker + A well-behaved background worker must react promptly to standard signals + that the postmaster uses to control its child processes. Signals are initially blocked when control reaches the background worker's main function, and must be unblocked by it; this is to allow the process to customize its signal handlers, if necessary. @@ -240,6 +242,14 @@ typedef struct BackgroundWorker BackgroundWorkerBlockSignals. + + The default signal handlers merely set interrupt flags + that are processed later by CHECK_FOR_INTERRUPTS(). + CHECK_FOR_INTERRUPTS() should be called in any + long-running loop to ensure that the background worker doesn't prevent the + system from shutting down in a timely fashion. + + If bgw_restart_time for a background worker is configured as BGW_NEVER_RESTART, or if it exits with an exit diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index fe00488487..44786dc131 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -1327,7 +1327,6 @@ ParallelWorkerMain(Datum main_arg) InitializingParallelWorker = true; /* Establish signal handlers. */ - pqsignal(SIGTERM, die); BackgroundWorkerUnblockSignals(); /* Determine and set our parallel worker number. */ diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 261ccd3f59..8678ea4e13 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -718,20 +718,6 @@ SanityCheckBackgroundWorker(BackgroundWorker *worker, int elevel) return true; } -/* - * Standard SIGTERM handler for background workers - */ -static void -bgworker_die(SIGNAL_ARGS) -{ - sigprocmask(SIG_SETMASK, &BlockSig, NULL); - - ereport(FATAL, - (errcode(ERRCODE_ADMIN_SHUTDOWN), - errmsg("terminating background worker \"%s\" due to administrator command", - MyBgworkerEntry->bgw_type))); -} - /* * Main entry point for background worker processes. */ @@ -787,7 +773,7 @@ BackgroundWorkerMain(const void *startup_data, size_t startup_data_len) pqsignal(SIGUSR1, SIG_IGN); pqsignal(SIGFPE, SIG_IGN); } - pqsignal(SIGTERM, bgworker_die); + pqsignal(SIGTERM, die); /* SIGQUIT handler was already set up by InitPostmasterChild */ pqsignal(SIGHUP, SIG_IGN); diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index 8a01f16a2c..1730ace549 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -879,7 +879,6 @@ ParallelApplyWorkerMain(Datum main_arg) * receiving SIGTERM. 
*/ pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); pqsignal(SIGUSR2, SignalHandlerForShutdownRequest); BackgroundWorkerUnblockSignals(); diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 3ed86480be..e6112e11ec 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -1213,7 +1213,6 @@ ApplyLauncherMain(Datum main_arg) /* Establish signal handlers. */ pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); BackgroundWorkerUnblockSignals(); /* diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 8b93f48470..f179d08184 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -5890,7 +5890,6 @@ SetupApplyOrSyncWorker(int worker_slot) /* Setup signal handling */ pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); BackgroundWorkerUnblockSignals(); /* diff --git a/src/test/modules/test_shm_mq/worker.c b/src/test/modules/test_shm_mq/worker.c index 368e4f3f23..6a4147554b 100644 --- a/src/test/modules/test_shm_mq/worker.c +++ b/src/test/modules/test_shm_mq/worker.c @@ -54,13 +54,7 @@ test_shm_mq_main(Datum main_arg) int myworkernumber; PGPROC *registrant; - /* - * Establish signal handlers. - * - * We want CHECK_FOR_INTERRUPTS() to kill off this worker process just as - * it would a normal user backend. To make that happen, we use die(). - */ - pqsignal(SIGTERM, die); + /* Unblock signals. The standard signal handlers are OK for us. */ BackgroundWorkerUnblockSignals(); /* From 759b03b24ce96f0ba6d734b570d1a6f4a0fb1177 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 18 Feb 2026 14:14:44 -0500 Subject: [PATCH 129/147] Simplify creation of built-in functions with default arguments. Up to now, to create such a function, one had to make a pg_proc.dat entry and then overwrite it with a CREATE OR REPLACE command in system_functions.sql. That's error-prone (cf. bug #19409) and results in leaving dead rows in the initial contents of pg_proc. Manual maintenance of pg_node_tree strings seems entirely impractical, and parsing expressions during bootstrap would be extremely difficult as well. But Andres Freund observed that all the current use-cases are simple constants, and building a Const node is well within the capabilities of bootstrap mode. So this patch invents a special case: if bootstrap mode is asked to ingest a non-null value for pg_proc.proargdefaults (which would otherwise fail in pg_node_tree_in), it parses the value as an array literal and then feeds the element strings to the input functions for the corresponding parameter types. Then we can build a suitable pg_node_tree string with just a few more lines of code. This allows removing all the system_functions.sql entries that are just there to set up default arguments, replacing them with proargdefaults fields in pg_proc.dat entries. The old technique remains available in case someone needs a non-constant default. The initial contents of pg_proc are demonstrably the same after this patch, except that (1) json_strip_nulls and jsonb_strip_nulls now have the correct provolatile setting, as per bug #19409; (2) pg_terminate_backend, make_interval, and drandom_normal now have defaults that don't include a type coercion, which is how they should have been all along. In passing, remove some unused entries from bootstrap.c's TypInfo[] array. 
I had to add some new ones because we'll now need an entry for each default-possessing system function parameter, but we shouldn't carry more than we need there; it's just a maintenance gotcha. Bug: #19409 Reported-by: Lucio Chiessi Author: Tom Lane Author: Andrew Dunstan Reviewed-by: Andres Freund Discussion: https://postgr.es/m/183292bb-4891-4c96-a3ca-e78b5e0e1358@dunslane.net Discussion: https://postgr.es/m/19409-e16cd2605e59a4af@postgresql.org --- doc/src/sgml/bki.sgml | 34 ++- src/backend/bootstrap/bootstrap.c | 177 ++++++++++++-- src/backend/catalog/system_functions.sql | 283 +---------------------- src/backend/utils/cache/lsyscache.c | 4 +- src/include/bootstrap/bootstrap.h | 3 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 86 ++++++- 7 files changed, 263 insertions(+), 326 deletions(-) diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml index 53a982bf60..087a6827b0 100644 --- a/doc/src/sgml/bki.sgml +++ b/doc/src/sgml/bki.sgml @@ -271,6 +271,21 @@ + + + There is a special case for values of the + pg_proc.proargdefaults + field, which is of type pg_node_tree. The real + contents of that type are too complex for hand-written entries, + but what we need for proargdefaults is + typically just a list of Const nodes. Therefore, the bootstrap + backend will interpret a value given for that field according to + text array syntax, and then feed the array element values to the + datatype input routines for the corresponding input parameters' data + types, and finally build Const nodes from the datums. + + + Since hashes are unordered data structures, field order and line @@ -817,11 +832,11 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat The following column types are supported directly by bootstrap.c: bool, bytea, char (1 byte), - name, int2, - int4, regproc, regclass, - regtype, text, - oid, tid, xid, - cid, int2vector, oidvector, + int2, int4, int8, + float4, float8, + name, regproc, text, + jsonb, oid, pg_node_tree, + int2vector, oidvector, _int4 (array), _text (array), _oid (array), _char (array), _aclitem (array). Although it is possible to create @@ -884,7 +899,7 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat - insert ( oid_value value1 value2 ... ) + insert ( value1 value2 ... ) @@ -902,6 +917,13 @@ $ perl rewrite_dat_with_prokind.pl pg_proc.dat (To include a single quote in a value, write it twice. Escape-string-style backslash escapes are allowed in the string, too.) + + + In most cases a value + string is simply fed to the datatype input routine for the column's + data type, after de-quoting if needed. However there are exceptions + for certain fields, as detailed previously. 
+ diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 7d32cd0e15..8d601c363b 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -26,6 +26,7 @@ #include "bootstrap/bootstrap.h" #include "catalog/index.h" #include "catalog/pg_collation.h" +#include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "common/link-canary.h" #include "miscadmin.h" @@ -46,6 +47,7 @@ static void CheckerModeMain(void); static void bootstrap_signals(void); static Form_pg_attribute AllocateAttribute(void); +static void InsertOneProargdefaultsValue(char *value); static void populate_typ_list(void); static Oid gettype(char *type); static void cleanup(void); @@ -91,38 +93,28 @@ static const struct typinfo TypInfo[] = { F_BYTEAIN, F_BYTEAOUT}, {"char", CHAROID, 0, 1, true, TYPALIGN_CHAR, TYPSTORAGE_PLAIN, InvalidOid, F_CHARIN, F_CHAROUT}, + {"cstring", CSTRINGOID, 0, -2, false, TYPALIGN_CHAR, TYPSTORAGE_PLAIN, InvalidOid, + F_CSTRING_IN, F_CSTRING_OUT}, {"int2", INT2OID, 0, 2, true, TYPALIGN_SHORT, TYPSTORAGE_PLAIN, InvalidOid, F_INT2IN, F_INT2OUT}, {"int4", INT4OID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_INT4IN, F_INT4OUT}, + {"int8", INT8OID, 0, 8, true, TYPALIGN_DOUBLE, TYPSTORAGE_PLAIN, InvalidOid, + F_INT8IN, F_INT8OUT}, {"float4", FLOAT4OID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_FLOAT4IN, F_FLOAT4OUT}, + {"float8", FLOAT8OID, 0, 8, true, TYPALIGN_DOUBLE, TYPSTORAGE_PLAIN, InvalidOid, + F_FLOAT8IN, F_FLOAT8OUT}, {"name", NAMEOID, CHAROID, NAMEDATALEN, false, TYPALIGN_CHAR, TYPSTORAGE_PLAIN, C_COLLATION_OID, F_NAMEIN, F_NAMEOUT}, - {"regclass", REGCLASSOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_REGCLASSIN, F_REGCLASSOUT}, {"regproc", REGPROCOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_REGPROCIN, F_REGPROCOUT}, - {"regtype", REGTYPEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_REGTYPEIN, F_REGTYPEOUT}, - {"regrole", REGROLEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_REGROLEIN, F_REGROLEOUT}, - {"regnamespace", REGNAMESPACEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_REGNAMESPACEIN, F_REGNAMESPACEOUT}, - {"regdatabase", REGDATABASEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_REGDATABASEIN, F_REGDATABASEOUT}, {"text", TEXTOID, 0, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, F_TEXTIN, F_TEXTOUT}, + {"jsonb", JSONBOID, 0, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, + F_JSONB_IN, F_JSONB_OUT}, {"oid", OIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_OIDIN, F_OIDOUT}, - {"oid8", OID8OID, 0, 8, true, TYPALIGN_DOUBLE, TYPSTORAGE_PLAIN, InvalidOid, - F_OID8IN, F_OID8OUT}, - {"tid", TIDOID, 0, 6, false, TYPALIGN_SHORT, TYPSTORAGE_PLAIN, InvalidOid, - F_TIDIN, F_TIDOUT}, - {"xid", XIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_XIDIN, F_XIDOUT}, - {"cid", CIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, - F_CIDIN, F_CIDOUT}, {"pg_node_tree", PG_NODE_TREEOID, 0, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, F_PG_NODE_TREE_IN, F_PG_NODE_TREE_OUT}, {"int2vector", INT2VECTOROID, INT2OID, -1, false, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, @@ -660,6 +652,7 @@ InsertOneTuple(void) void InsertOneValue(char *value, int i) { + Form_pg_attribute attr; Oid typoid; int16 typlen; bool typbyval; @@ -668,19 +661,42 @@ InsertOneValue(char *value, int i) Oid typioparam; Oid typinput; Oid 
typoutput; + Oid typcollation; Assert(i >= 0 && i < MAXATTR); elog(DEBUG4, "inserting column %d value \"%s\"", i, value); - typoid = TupleDescAttr(boot_reldesc->rd_att, i)->atttypid; + attr = TupleDescAttr(RelationGetDescr(boot_reldesc), i); + typoid = attr->atttypid; boot_get_type_io_data(typoid, &typlen, &typbyval, &typalign, &typdelim, &typioparam, - &typinput, &typoutput); + &typinput, &typoutput, + &typcollation); - values[i] = OidInputFunctionCall(typinput, value, typioparam, -1); + /* + * pg_node_tree values can't be inserted normally (pg_node_tree_in would + * just error out), so provide special cases for such columns that we + * would like to fill during bootstrap. + */ + if (typoid == PG_NODE_TREEOID) + { + /* pg_proc.proargdefaults */ + if (RelationGetRelid(boot_reldesc) == ProcedureRelationId && + i == Anum_pg_proc_proargdefaults - 1) + InsertOneProargdefaultsValue(value); + else /* maybe other cases later */ + elog(ERROR, "can't handle pg_node_tree input for %s.%s", + RelationGetRelationName(boot_reldesc), + NameStr(attr->attname)); + } + else + { + /* Normal case */ + values[i] = OidInputFunctionCall(typinput, value, typioparam, -1); + } /* * We use ereport not elog here so that parameters aren't evaluated unless @@ -691,6 +707,111 @@ InsertOneValue(char *value, int i) OidOutputFunctionCall(typoutput, values[i])))); } +/* ---------------- + * InsertOneProargdefaultsValue + * + * In general, proargdefaults can be a list of any expressions, but + * for bootstrap we only support a list of Const nodes. The input + * has the form of a text array, and we feed non-null elements to the + * typinput functions for the appropriate parameters. + * ---------------- + */ +static void +InsertOneProargdefaultsValue(char *value) +{ + int pronargs; + oidvector *proargtypes; + Datum arrayval; + Datum *array_datums; + bool *array_nulls; + int array_count; + List *proargdefaults; + char *nodestring; + + /* The pg_proc columns we need to use must have been filled already */ + StaticAssertDecl(Anum_pg_proc_pronargs < Anum_pg_proc_proargdefaults, + "pronargs must come before proargdefaults"); + StaticAssertDecl(Anum_pg_proc_pronargdefaults < Anum_pg_proc_proargdefaults, + "pronargdefaults must come before proargdefaults"); + StaticAssertDecl(Anum_pg_proc_proargtypes < Anum_pg_proc_proargdefaults, + "proargtypes must come before proargdefaults"); + if (Nulls[Anum_pg_proc_pronargs - 1]) + elog(ERROR, "pronargs must not be null"); + if (Nulls[Anum_pg_proc_proargtypes - 1]) + elog(ERROR, "proargtypes must not be null"); + pronargs = DatumGetInt16(values[Anum_pg_proc_pronargs - 1]); + proargtypes = DatumGetPointer(values[Anum_pg_proc_proargtypes - 1]); + Assert(pronargs == proargtypes->dim1); + + /* Parse the input string as an array value, then deconstruct to Datums */ + arrayval = OidFunctionCall3(F_ARRAY_IN, + CStringGetDatum(value), + ObjectIdGetDatum(CSTRINGOID), + Int32GetDatum(-1)); + deconstruct_array_builtin(DatumGetArrayTypeP(arrayval), CSTRINGOID, + &array_datums, &array_nulls, &array_count); + + /* The values should correspond to the last N argtypes */ + if (array_count > pronargs) + elog(ERROR, "too many proargdefaults entries"); + + /* Build the List of Const nodes */ + proargdefaults = NIL; + for (int i = 0; i < array_count; i++) + { + Oid argtype = proargtypes->values[pronargs - array_count + i]; + int16 typlen; + bool typbyval; + char typalign; + char typdelim; + Oid typioparam; + Oid typinput; + Oid typoutput; + Oid typcollation; + Datum defval; + bool defnull; + Const *defConst; + + 
boot_get_type_io_data(argtype, + &typlen, &typbyval, &typalign, + &typdelim, &typioparam, + &typinput, &typoutput, + &typcollation); + + defnull = array_nulls[i]; + if (defnull) + defval = (Datum) 0; + else + defval = OidInputFunctionCall(typinput, + DatumGetCString(array_datums[i]), + typioparam, -1); + + defConst = makeConst(argtype, + -1, /* never any typmod */ + typcollation, + typlen, + defval, + defnull, + typbyval); + proargdefaults = lappend(proargdefaults, defConst); + } + + /* + * Flatten the List to a node-tree string, then convert to a text datum, + * which is the storage representation of pg_node_tree. + */ + nodestring = nodeToString(proargdefaults); + values[Anum_pg_proc_proargdefaults - 1] = CStringGetTextDatum(nodestring); + Nulls[Anum_pg_proc_proargdefaults - 1] = false; + + /* + * Hack: fill in pronargdefaults with the right value. This is surely + * ugly, but it beats making the programmer do it. + */ + values[Anum_pg_proc_pronargdefaults - 1] = Int16GetDatum(array_count); + Nulls[Anum_pg_proc_pronargdefaults - 1] = false; +} + /* ---------------- * InsertOneNull * ---------------- @@ -831,10 +952,11 @@ gettype(char *type) * boot_get_type_io_data * * Obtain type I/O information at bootstrap time. This intentionally has - * almost the same API as lsyscache.c's get_type_io_data, except that + * an API very close to that of lsyscache.c's get_type_io_data, except that * we only support obtaining the typinput and typoutput routines, not - * the binary I/O routines. It is exported so that array_in and array_out - * can be made to work during early bootstrap. + * the binary I/O routines, and we also return the type's collation. + * This is exported so that array_in and array_out can be made to work + * during early bootstrap. * ---------------- */ void @@ -845,7 +967,8 @@ boot_get_type_io_data(Oid typid, char *typdelim, Oid *typioparam, Oid *typinput, - Oid *typoutput) + Oid *typoutput, + Oid *typcollation) { if (Typ != NIL) { @@ -876,6 +999,8 @@ boot_get_type_io_data(Oid typid, *typinput = ap->am_typ.typinput; *typoutput = ap->am_typ.typoutput; + + *typcollation = ap->am_typ.typcollation; } else { @@ -904,6 +1029,8 @@ boot_get_type_io_data(Oid typid, *typinput = TypInfo[typeindex].inproc; *typoutput = TypInfo[typeindex].outproc; + + *typcollation = TypInfo[typeindex].collation; } } diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index eb9e31ae1b..69699f8830 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -7,7 +7,8 @@ * * This file redefines certain built-in functions that are impractical * to fully define in pg_proc.dat. In most cases that's because they use - * SQL-standard function bodies and/or default expressions. The node + * SQL-standard function bodies and/or default expressions. (But defaults + * that are just constants can be entered in pg_proc.dat.) The node * tree representations of those are too unreadable, platform-dependent, * and changeable to want to deal with them manually. 
Hence, we put stub * definitions of such functions into pg_proc.dat and then replace them @@ -66,13 +67,6 @@ CREATE OR REPLACE FUNCTION bit_length(text) IMMUTABLE PARALLEL SAFE STRICT COST 1 RETURN octet_length($1) * 8; -CREATE OR REPLACE FUNCTION - random_normal(mean float8 DEFAULT 0, stddev float8 DEFAULT 1) - RETURNS float8 - LANGUAGE internal - VOLATILE PARALLEL RESTRICTED STRICT COST 1 -AS 'drandom_normal'; - CREATE OR REPLACE FUNCTION log(numeric) RETURNS numeric LANGUAGE sql @@ -383,279 +377,6 @@ BEGIN ATOMIC SELECT * FROM ts_debug(get_current_ts_config(), $1); END; -CREATE OR REPLACE FUNCTION - pg_backup_start(label text, fast boolean DEFAULT false) - RETURNS pg_lsn STRICT VOLATILE LANGUAGE internal AS 'pg_backup_start' - PARALLEL RESTRICTED; - -CREATE OR REPLACE FUNCTION pg_backup_stop ( - wait_for_archive boolean DEFAULT true, OUT lsn pg_lsn, - OUT labelfile text, OUT spcmapfile text) - RETURNS record STRICT VOLATILE LANGUAGE internal as 'pg_backup_stop' - PARALLEL RESTRICTED; - -CREATE OR REPLACE FUNCTION - pg_promote(wait boolean DEFAULT true, wait_seconds integer DEFAULT 60) - RETURNS boolean STRICT VOLATILE LANGUAGE INTERNAL AS 'pg_promote' - PARALLEL SAFE; - -CREATE OR REPLACE FUNCTION - pg_terminate_backend(pid integer, timeout int8 DEFAULT 0) - RETURNS boolean STRICT VOLATILE LANGUAGE INTERNAL AS 'pg_terminate_backend' - PARALLEL SAFE; - --- legacy definition for compatibility with 9.3 -CREATE OR REPLACE FUNCTION - json_populate_record(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) - RETURNS anyelement LANGUAGE internal STABLE AS 'json_populate_record' PARALLEL SAFE; - --- legacy definition for compatibility with 9.3 -CREATE OR REPLACE FUNCTION - json_populate_recordset(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) - RETURNS SETOF anyelement LANGUAGE internal STABLE ROWS 100 AS 'json_populate_recordset' PARALLEL SAFE; - -CREATE OR REPLACE FUNCTION pg_logical_slot_get_changes( - IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT lsn pg_lsn, OUT xid xid, OUT data text) -RETURNS SETOF RECORD -LANGUAGE INTERNAL -VOLATILE ROWS 1000 COST 1000 -AS 'pg_logical_slot_get_changes'; - -CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes( - IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT lsn pg_lsn, OUT xid xid, OUT data text) -RETURNS SETOF RECORD -LANGUAGE INTERNAL -VOLATILE ROWS 1000 COST 1000 -AS 'pg_logical_slot_peek_changes'; - -CREATE OR REPLACE FUNCTION pg_logical_slot_get_binary_changes( - IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT lsn pg_lsn, OUT xid xid, OUT data bytea) -RETURNS SETOF RECORD -LANGUAGE INTERNAL -VOLATILE ROWS 1000 COST 1000 -AS 'pg_logical_slot_get_binary_changes'; - -CREATE OR REPLACE FUNCTION pg_logical_slot_peek_binary_changes( - IN slot_name name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}', - OUT lsn pg_lsn, OUT xid xid, OUT data bytea) -RETURNS SETOF RECORD -LANGUAGE INTERNAL -VOLATILE ROWS 1000 COST 1000 -AS 'pg_logical_slot_peek_binary_changes'; - -CREATE OR REPLACE FUNCTION pg_logical_emit_message( - transactional boolean, - prefix text, - message text, - flush boolean DEFAULT false) -RETURNS pg_lsn -LANGUAGE INTERNAL -STRICT VOLATILE -AS 'pg_logical_emit_message_text'; - -CREATE OR REPLACE FUNCTION pg_logical_emit_message( - transactional boolean, - prefix text, - message bytea, - flush 
boolean DEFAULT false) -RETURNS pg_lsn -LANGUAGE INTERNAL -STRICT VOLATILE -AS 'pg_logical_emit_message_bytea'; - -CREATE OR REPLACE FUNCTION pg_create_physical_replication_slot( - IN slot_name name, IN immediately_reserve boolean DEFAULT false, - IN temporary boolean DEFAULT false, - OUT slot_name name, OUT lsn pg_lsn) -RETURNS RECORD -LANGUAGE INTERNAL -STRICT VOLATILE -AS 'pg_create_physical_replication_slot'; - -CREATE OR REPLACE FUNCTION pg_create_logical_replication_slot( - IN slot_name name, IN plugin name, - IN temporary boolean DEFAULT false, - IN twophase boolean DEFAULT false, - IN failover boolean DEFAULT false, - OUT slot_name name, OUT lsn pg_lsn) -RETURNS RECORD -LANGUAGE INTERNAL -STRICT VOLATILE -AS 'pg_create_logical_replication_slot'; - -CREATE OR REPLACE FUNCTION - make_interval(years int4 DEFAULT 0, months int4 DEFAULT 0, weeks int4 DEFAULT 0, - days int4 DEFAULT 0, hours int4 DEFAULT 0, mins int4 DEFAULT 0, - secs double precision DEFAULT 0.0) -RETURNS interval -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'make_interval'; - -CREATE OR REPLACE FUNCTION - jsonb_set(jsonb_in jsonb, path text[] , replacement jsonb, - create_if_missing boolean DEFAULT true) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_set'; - -CREATE OR REPLACE FUNCTION - jsonb_set_lax(jsonb_in jsonb, path text[] , replacement jsonb, - create_if_missing boolean DEFAULT true, - null_value_treatment text DEFAULT 'use_json_null') -RETURNS jsonb -LANGUAGE INTERNAL -CALLED ON NULL INPUT IMMUTABLE PARALLEL SAFE -AS 'jsonb_set_lax'; - -CREATE OR REPLACE FUNCTION - parse_ident(str text, strict boolean DEFAULT true) -RETURNS text[] -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'parse_ident'; - -CREATE OR REPLACE FUNCTION - jsonb_insert(jsonb_in jsonb, path text[] , replacement jsonb, - insert_after boolean DEFAULT false) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_insert'; - -CREATE OR REPLACE FUNCTION - jsonb_path_exists(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS boolean -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_path_exists'; - -CREATE OR REPLACE FUNCTION - jsonb_path_match(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS boolean -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_path_match'; - -CREATE OR REPLACE FUNCTION - jsonb_path_query(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS SETOF jsonb -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_path_query'; - -CREATE OR REPLACE FUNCTION - jsonb_path_query_array(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_path_query_array'; - -CREATE OR REPLACE FUNCTION - jsonb_path_query_first(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT IMMUTABLE PARALLEL SAFE -AS 'jsonb_path_query_first'; - -CREATE OR REPLACE FUNCTION - jsonb_path_exists_tz(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS boolean -LANGUAGE INTERNAL -STRICT STABLE PARALLEL SAFE -AS 'jsonb_path_exists_tz'; - -CREATE OR REPLACE FUNCTION - jsonb_path_match_tz(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS boolean -LANGUAGE INTERNAL -STRICT STABLE PARALLEL 
SAFE -AS 'jsonb_path_match_tz'; - -CREATE OR REPLACE FUNCTION - jsonb_path_query_tz(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS SETOF jsonb -LANGUAGE INTERNAL -STRICT STABLE PARALLEL SAFE -AS 'jsonb_path_query_tz'; - -CREATE OR REPLACE FUNCTION - jsonb_path_query_array_tz(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT STABLE PARALLEL SAFE -AS 'jsonb_path_query_array_tz'; - -CREATE OR REPLACE FUNCTION - jsonb_path_query_first_tz(target jsonb, path jsonpath, vars jsonb DEFAULT '{}', - silent boolean DEFAULT false) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT STABLE PARALLEL SAFE -AS 'jsonb_path_query_first_tz'; - -CREATE OR REPLACE FUNCTION - jsonb_strip_nulls(target jsonb, strip_in_arrays boolean DEFAULT false) -RETURNS jsonb -LANGUAGE INTERNAL -STRICT STABLE PARALLEL SAFE -AS 'jsonb_strip_nulls'; - -CREATE OR REPLACE FUNCTION - json_strip_nulls(target json, strip_in_arrays boolean DEFAULT false) -RETURNS json -LANGUAGE INTERNAL -STRICT STABLE PARALLEL SAFE -AS 'json_strip_nulls'; - --- default normalization form is NFC, per SQL standard -CREATE OR REPLACE FUNCTION - "normalize"(text, text DEFAULT 'NFC') -RETURNS text -LANGUAGE internal -STRICT IMMUTABLE PARALLEL SAFE -AS 'unicode_normalize_func'; - -CREATE OR REPLACE FUNCTION - is_normalized(text, text DEFAULT 'NFC') -RETURNS boolean -LANGUAGE internal -STRICT IMMUTABLE PARALLEL SAFE -AS 'unicode_is_normalized'; - -CREATE OR REPLACE FUNCTION - pg_stat_reset_shared(target text DEFAULT NULL) -RETURNS void -LANGUAGE INTERNAL -CALLED ON NULL INPUT VOLATILE PARALLEL SAFE -AS 'pg_stat_reset_shared'; - -CREATE OR REPLACE FUNCTION - pg_stat_reset_slru(target text DEFAULT NULL) -RETURNS void -LANGUAGE INTERNAL -CALLED ON NULL INPUT VOLATILE PARALLEL SAFE -AS 'pg_stat_reset_slru'; - -CREATE OR REPLACE FUNCTION - pg_replication_origin_session_setup(node_name text, pid integer DEFAULT 0) -RETURNS void -LANGUAGE INTERNAL -STRICT VOLATILE PARALLEL UNSAFE -AS 'pg_replication_origin_session_setup'; -- -- The default permissions for functions mean that anyone can execute them. 
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index b924a2d900..1913b009d4 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -2492,6 +2492,7 @@ get_type_io_data(Oid typid, { Oid typinput; Oid typoutput; + Oid typcollation; boot_get_type_io_data(typid, typlen, @@ -2500,7 +2501,8 @@ get_type_io_data(Oid typid, typdelim, typioparam, &typinput, - &typoutput); + &typoutput, + &typcollation); switch (which_func) { case IOFunc_input: diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index 51680522af..21447a3d66 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -53,7 +53,8 @@ extern void boot_get_type_io_data(Oid typid, char *typdelim, Oid *typioparam, Oid *typinput, - Oid *typoutput); + Oid *typoutput, + Oid *typcollation); union YYSTYPE; typedef void *yyscan_t; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 0bdd42a2b8..7be8afc10e 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202602121 +#define CATALOG_VERSION_NO 202602181 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 83f6501df3..dac40992cb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3499,6 +3499,7 @@ { oid => '6212', descr => 'random value from normal distribution', proname => 'random_normal', provolatile => 'v', proparallel => 'r', prorettype => 'float8', proargtypes => 'float8 float8', + proargnames => '{mean,stddev}', proargdefaults => '{0,1}', prosrc => 'drandom_normal' }, { oid => '6339', descr => 'random integer in range', proname => 'random', provolatile => 'v', proparallel => 'r', @@ -6174,6 +6175,7 @@ descr => 'statistics: reset collected statistics shared across the cluster', proname => 'pg_stat_reset_shared', proisstrict => 'f', provolatile => 'v', prorettype => 'void', proargtypes => 'text', + proargnames => '{target}', proargdefaults => '{NULL}', prosrc => 'pg_stat_reset_shared' }, { oid => '3776', descr => 'statistics: reset collected statistics for a single table or index in the current database or shared across all databases in the cluster', @@ -6193,6 +6195,7 @@ descr => 'statistics: reset collected statistics for a single SLRU', proname => 'pg_stat_reset_slru', proisstrict => 'f', provolatile => 'v', prorettype => 'void', proargtypes => 'text', proargnames => '{target}', + proargdefaults => '{NULL}', prosrc => 'pg_stat_reset_slru' }, { oid => '6170', descr => 'statistics: reset collected statistics for a single replication slot', @@ -6728,20 +6731,24 @@ { oid => '2096', descr => 'terminate a server process', proname => 'pg_terminate_backend', provolatile => 'v', prorettype => 'bool', proargtypes => 'int4 int8', proargnames => '{pid,timeout}', + proargdefaults => '{0}', prosrc => 'pg_terminate_backend' }, { oid => '2172', descr => 'prepare for taking an online backup', proname => 'pg_backup_start', provolatile => 'v', proparallel => 'r', prorettype => 'pg_lsn', proargtypes => 'text bool', + proargnames => '{label,fast}', proargdefaults => '{false}', prosrc => 'pg_backup_start' }, { oid => '2739', descr => 'finish taking an online backup', proname => 'pg_backup_stop', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => 'bool', proallargtypes => '{bool,pg_lsn,text,text}', proargmodes => '{i,o,o,o}', proargnames => 
'{wait_for_archive,lsn,labelfile,spcmapfile}', + proargdefaults => '{true}', prosrc => 'pg_backup_stop' }, { oid => '3436', descr => 'promote standby server', proname => 'pg_promote', provolatile => 'v', prorettype => 'bool', proargtypes => 'bool int4', proargnames => '{wait,wait_seconds}', + proargdefaults => '{true,60}', prosrc => 'pg_promote' }, { oid => '2848', descr => 'switch to new wal file', proname => 'pg_switch_wal', provolatile => 'v', prorettype => 'pg_lsn', @@ -7517,7 +7524,8 @@ { oid => '1268', descr => 'parse qualified identifier to array of identifiers', proname => 'parse_ident', prorettype => '_text', proargtypes => 'text bool', - proargnames => '{str,strict}', prosrc => 'parse_ident' }, + proargnames => '{str,strict}', proargdefaults => '{true}', + prosrc => 'parse_ident' }, { oid => '2246', descr => '(internal)', proname => 'fmgr_internal_validator', provolatile => 's', @@ -9423,7 +9431,9 @@ proargtypes => 'anyelement', prosrc => 'to_json' }, { oid => '3261', descr => 'remove object fields with null values from json', proname => 'json_strip_nulls', prorettype => 'json', - proargtypes => 'json bool', prosrc => 'json_strip_nulls' }, + proargtypes => 'json bool', + proargnames => '{target,strip_in_arrays}', proargdefaults => '{false}', + prosrc => 'json_strip_nulls' }, { oid => '3947', proname => 'json_object_field', prorettype => 'json', @@ -9480,12 +9490,17 @@ { oid => '3960', descr => 'get record fields from a json object', proname => 'json_populate_record', proisstrict => 'f', provolatile => 's', prorettype => 'anyelement', proargtypes => 'anyelement json bool', + proargnames => '{base,from_json,use_json_as_text}', + proargdefaults => '{false}', prosrc => 'json_populate_record' }, { oid => '3961', descr => 'get set of records with fields from a json array of objects', proname => 'json_populate_recordset', prorows => '100', proisstrict => 'f', proretset => 't', provolatile => 's', prorettype => 'anyelement', - proargtypes => 'anyelement json bool', prosrc => 'json_populate_recordset' }, + proargtypes => 'anyelement json bool', + proargnames => '{base,from_json,use_json_as_text}', + proargdefaults => '{false}', + prosrc => 'json_populate_recordset' }, { oid => '3204', descr => 'get record fields from a json object', proname => 'json_to_record', provolatile => 's', prorettype => 'record', proargtypes => 'json', prosrc => 'json_to_record' }, @@ -10364,7 +10379,9 @@ prosrc => 'jsonb_build_object_noargs' }, { oid => '3262', descr => 'remove object fields with null values from jsonb', proname => 'jsonb_strip_nulls', prorettype => 'jsonb', - proargtypes => 'jsonb bool', prosrc => 'jsonb_strip_nulls' }, + proargtypes => 'jsonb bool', + proargnames => '{target,strip_in_arrays}', proargdefaults => '{false}', + prosrc => 'jsonb_strip_nulls' }, { oid => '3478', proname => 'jsonb_object_field', prorettype => 'jsonb', @@ -10538,16 +10555,25 @@ proargtypes => 'jsonb _text', prosrc => 'jsonb_delete_path' }, { oid => '5054', descr => 'Set part of a jsonb, handle NULL value', proname => 'jsonb_set_lax', proisstrict => 'f', prorettype => 'jsonb', - proargtypes => 'jsonb _text jsonb bool text', prosrc => 'jsonb_set_lax' }, + proargtypes => 'jsonb _text jsonb bool text', + proargnames => '{jsonb_in,path,replacement,create_if_missing,null_value_treatment}', + proargdefaults => '{true,use_json_null}', + prosrc => 'jsonb_set_lax' }, { oid => '3305', descr => 'Set part of a jsonb', proname => 'jsonb_set', prorettype => 'jsonb', - proargtypes => 'jsonb _text jsonb bool', prosrc => 'jsonb_set' }, 
+ proargtypes => 'jsonb _text jsonb bool', + proargnames => '{jsonb_in,path,replacement,create_if_missing}', + proargdefaults => '{true}', + prosrc => 'jsonb_set' }, { oid => '3306', descr => 'Indented text from jsonb', proname => 'jsonb_pretty', prorettype => 'text', proargtypes => 'jsonb', prosrc => 'jsonb_pretty' }, { oid => '3579', descr => 'Insert value into a jsonb', proname => 'jsonb_insert', prorettype => 'jsonb', - proargtypes => 'jsonb _text jsonb bool', prosrc => 'jsonb_insert' }, + proargtypes => 'jsonb _text jsonb bool', + proargnames => '{jsonb_in,path,replacement,insert_after}', + proargdefaults => '{false}', + prosrc => 'jsonb_insert' }, # jsonpath { oid => '4001', descr => 'I/O', @@ -10565,42 +10591,66 @@ { oid => '4005', descr => 'jsonpath exists test', proname => 'jsonb_path_exists', prorettype => 'bool', - proargtypes => 'jsonb jsonpath jsonb bool', prosrc => 'jsonb_path_exists' }, + proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', + prosrc => 'jsonb_path_exists' }, { oid => '4006', descr => 'jsonpath query', proname => 'jsonb_path_query', prorows => '1000', proretset => 't', prorettype => 'jsonb', proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', prosrc => 'jsonb_path_query' }, { oid => '4007', descr => 'jsonpath query wrapped into array', proname => 'jsonb_path_query_array', prorettype => 'jsonb', proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', prosrc => 'jsonb_path_query_array' }, { oid => '4008', descr => 'jsonpath query first item', proname => 'jsonb_path_query_first', prorettype => 'jsonb', proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', prosrc => 'jsonb_path_query_first' }, { oid => '4009', descr => 'jsonpath match', proname => 'jsonb_path_match', prorettype => 'bool', - proargtypes => 'jsonb jsonpath jsonb bool', prosrc => 'jsonb_path_match' }, + proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', + prosrc => 'jsonb_path_match' }, { oid => '1177', descr => 'jsonpath exists test with timezone', proname => 'jsonb_path_exists_tz', provolatile => 's', prorettype => 'bool', proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', prosrc => 'jsonb_path_exists_tz' }, { oid => '1179', descr => 'jsonpath query with timezone', proname => 'jsonb_path_query_tz', prorows => '1000', proretset => 't', provolatile => 's', prorettype => 'jsonb', - proargtypes => 'jsonb jsonpath jsonb bool', prosrc => 'jsonb_path_query_tz' }, + proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', + prosrc => 'jsonb_path_query_tz' }, { oid => '1180', descr => 'jsonpath query wrapped into array with timezone', proname => 'jsonb_path_query_array_tz', provolatile => 's', prorettype => 'jsonb', proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', prosrc => 'jsonb_path_query_array_tz' }, { oid => '2023', descr => 'jsonpath query first item with timezone', proname => 'jsonb_path_query_first_tz', provolatile => 's', prorettype => 'jsonb', proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => 
'{target,path,vars,silent}', + proargdefaults => '{"{}",false}', prosrc => 'jsonb_path_query_first_tz' }, { oid => '2030', descr => 'jsonpath match with timezone', proname => 'jsonb_path_match_tz', provolatile => 's', prorettype => 'bool', - proargtypes => 'jsonb jsonpath jsonb bool', prosrc => 'jsonb_path_match_tz' }, + proargtypes => 'jsonb jsonpath jsonb bool', + proargnames => '{target,path,vars,silent}', + proargdefaults => '{"{}",false}', + prosrc => 'jsonb_path_match_tz' }, { oid => '4010', descr => 'implementation of @? operator', proname => 'jsonb_path_exists_opr', prorettype => 'bool', @@ -11411,6 +11461,7 @@ proname => 'make_interval', prorettype => 'interval', proargtypes => 'int4 int4 int4 int4 int4 int4 float8', proargnames => '{years,months,weeks,days,hours,mins,secs}', + proargdefaults => '{0,0,0,0,0,0,0.0}', prosrc => 'make_interval' }, # spgist opclasses @@ -11511,6 +11562,7 @@ proallargtypes => '{name,bool,bool,name,pg_lsn}', proargmodes => '{i,i,i,o,o}', proargnames => '{slot_name,immediately_reserve,temporary,slot_name,lsn}', + proargdefaults => '{false,false}', prosrc => 'pg_create_physical_replication_slot' }, { oid => '4220', descr => 'copy a physical replication slot, changing temporality', @@ -11546,6 +11598,7 @@ proallargtypes => '{name,name,bool,bool,bool,name,pg_lsn}', proargmodes => '{i,i,i,i,i,o,o}', proargnames => '{slot_name,plugin,temporary,twophase,failover,slot_name,lsn}', + proargdefaults => '{false,false,false}', prosrc => 'pg_create_logical_replication_slot' }, { oid => '4222', descr => 'copy a logical replication slot, changing temporality and plugin', @@ -11578,6 +11631,7 @@ proallargtypes => '{name,pg_lsn,int4,_text,pg_lsn,xid,text}', proargmodes => '{i,i,i,v,o,o,o}', proargnames => '{slot_name,upto_lsn,upto_nchanges,options,lsn,xid,data}', + proargdefaults => '{"{}"}', prosrc => 'pg_logical_slot_get_changes' }, { oid => '3783', descr => 'get binary changes from replication slot', proname => 'pg_logical_slot_get_binary_changes', procost => '1000', @@ -11587,6 +11641,7 @@ proallargtypes => '{name,pg_lsn,int4,_text,pg_lsn,xid,bytea}', proargmodes => '{i,i,i,v,o,o,o}', proargnames => '{slot_name,upto_lsn,upto_nchanges,options,lsn,xid,data}', + proargdefaults => '{"{}"}', prosrc => 'pg_logical_slot_get_binary_changes' }, { oid => '3784', descr => 'peek at changes from replication slot', proname => 'pg_logical_slot_peek_changes', procost => '1000', @@ -11596,6 +11651,7 @@ proallargtypes => '{name,pg_lsn,int4,_text,pg_lsn,xid,text}', proargmodes => '{i,i,i,v,o,o,o}', proargnames => '{slot_name,upto_lsn,upto_nchanges,options,lsn,xid,data}', + proargdefaults => '{"{}"}', prosrc => 'pg_logical_slot_peek_changes' }, { oid => '3785', descr => 'peek at binary changes from replication slot', proname => 'pg_logical_slot_peek_binary_changes', procost => '1000', @@ -11605,6 +11661,7 @@ proallargtypes => '{name,pg_lsn,int4,_text,pg_lsn,xid,bytea}', proargmodes => '{i,i,i,v,o,o,o}', proargnames => '{slot_name,upto_lsn,upto_nchanges,options,lsn,xid,data}', + proargdefaults => '{"{}"}', prosrc => 'pg_logical_slot_peek_binary_changes' }, { oid => '3878', descr => 'advance logical replication slot', proname => 'pg_replication_slot_advance', provolatile => 'v', @@ -11615,10 +11672,14 @@ { oid => '3577', descr => 'emit a textual logical decoding message', proname => 'pg_logical_emit_message', provolatile => 'v', proparallel => 'u', prorettype => 'pg_lsn', proargtypes => 'bool text text bool', + proargnames => '{transactional,prefix,message,flush}', + proargdefaults => 
'{false}', prosrc => 'pg_logical_emit_message_text' }, { oid => '3578', descr => 'emit a binary logical decoding message', proname => 'pg_logical_emit_message', provolatile => 'v', proparallel => 'u', prorettype => 'pg_lsn', proargtypes => 'bool text bytea bool', + proargnames => '{transactional,prefix,message,flush}', + proargdefaults => '{false}', prosrc => 'pg_logical_emit_message_bytea' }, { oid => '6344', descr => 'sync replication slots from the primary to the standby', @@ -12268,6 +12329,7 @@ descr => 'configure session to maintain replication progress tracking for the passed in origin', proname => 'pg_replication_origin_session_setup', provolatile => 'v', proparallel => 'u', prorettype => 'void', proargtypes => 'text int4', + proargnames => '{node_name,pid}', proargdefaults => '{0}', prosrc => 'pg_replication_origin_session_setup' }, { oid => '6007', descr => 'teardown configured replication progress tracking', @@ -12518,10 +12580,12 @@ { oid => '4350', descr => 'Unicode normalization', proname => 'normalize', prorettype => 'text', proargtypes => 'text text', + proargdefaults => '{NFC}', prosrc => 'unicode_normalize_func' }, { oid => '4351', descr => 'check Unicode normalization', proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text', + proargdefaults => '{NFC}', prosrc => 'unicode_is_normalized' }, { oid => '6198', descr => 'unescape Unicode characters', From 21e323e941cf6ff02f931cd21bcfe4ab2cb5d622 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 19 Feb 2026 15:59:20 +0900 Subject: [PATCH 130/147] Sanitize some WAL-logging buffer handling in GIN and GiST code As transam's README documents, the recommended order of actions when WAL-logging a buffer is to unlock and unpin it only after leaving the critical section. This pattern was not being followed by some GIN and GiST code paths, adjusted in this commit, where buffers were unlocked or unpinned while still inside a critical section. Based on my analysis of each code path updated here, there is no reason not to follow the recommended pattern of unlocking and unpinning outside the critical section. These inconsistencies are rather old, coming mainly from ecaa4708e5dd and ff301d6e690b. The guidelines in the README predate these commits, being introduced in 6d61cdec0761.
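For illustration, the recommended ordering looks like this (a minimal sketch modeled on createPostingTree(); the registered data, buffer flags, and record type vary per call site, and "index", "buffer", and "page" stand for the usual local variables):

    START_CRIT_SECTION();
    MarkBufferDirty(buffer);
    if (RelationNeedsWAL(index))
    {
        XLogRecPtr	recptr;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE);
        PageSetLSN(page, recptr);
    }
    END_CRIT_SECTION();
    /* unlock and unpin only once outside the critical section */
    UnlockReleaseBuffer(buffer);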
Author: Kirill Reshke Discussion: https://postgr.es/m/CALdSSPgBPnpNNzxv0Y+_GNFzW6PmzRZYh+_hpf06Y1N2zLhZaQ@mail.gmail.com --- src/backend/access/gin/gindatapage.c | 4 ++-- src/backend/access/gin/ginfast.c | 12 ++++++------ src/backend/access/gin/ginutil.c | 4 ++-- src/backend/access/gin/ginvacuum.c | 6 +++--- src/backend/access/transam/xloginsert.c | 7 ++++--- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 436e54f206..c5d7db2807 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -1854,10 +1854,10 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, PageSetLSN(page, recptr); } - UnlockReleaseBuffer(buffer); - END_CRIT_SECTION(); + UnlockReleaseBuffer(buffer); + /* During index build, count the newly-added data page */ if (buildStats) buildStats->nDataPages++; diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 7a6b177977..f50848eb65 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -134,10 +134,10 @@ writeListPage(Relation index, Buffer buffer, /* get free space before releasing buffer */ freesize = PageGetExactFreeSpace(page); - UnlockReleaseBuffer(buffer); - END_CRIT_SECTION(); + UnlockReleaseBuffer(buffer); + return freesize; } @@ -459,10 +459,10 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * (Size) 1024) needCleanup = true; - UnlockReleaseBuffer(metabuffer); - END_CRIT_SECTION(); + UnlockReleaseBuffer(metabuffer); + /* * Since it could contend with concurrent cleanup process we cleanup * pending list not forcibly. @@ -659,11 +659,11 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, } } + END_CRIT_SECTION(); + for (i = 0; i < data.ndeleted; i++) UnlockReleaseBuffer(buffers[i]); - END_CRIT_SECTION(); - for (i = 0; fill_fsm && i < data.ndeleted; i++) RecordFreeIndexPage(index, freespace[i]); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index d205093e21..ff927279cc 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -663,9 +663,9 @@ ginUpdateStats(Relation index, const GinStatsData *stats, bool is_build) PageSetLSN(metapage, recptr); } - UnlockReleaseBuffer(metabuffer); - END_CRIT_SECTION(); + + UnlockReleaseBuffer(metabuffer); } /* diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 11a6674a10..c9f143f6c3 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -224,12 +224,12 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn PageSetLSN(BufferGetPage(lBuffer), recptr); } + END_CRIT_SECTION(); + ReleaseBuffer(pBuffer); ReleaseBuffer(lBuffer); ReleaseBuffer(dBuffer); - END_CRIT_SECTION(); - gvs->result->pages_newly_deleted++; gvs->result->pages_deleted++; } @@ -654,8 +654,8 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, PageRestoreTempPage(resPage, page); MarkBufferDirty(buffer); xlogVacuumPage(gvs.index, buffer); - UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); + UnlockReleaseBuffer(buffer); } else { diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index d3acaa636c..a9a1678acc 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1355,11 +1355,12 @@ 
log_newpage_range(Relation rel, ForkNumber forknum, recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); for (i = 0; i < nbufs; i++) - { PageSetLSN(BufferGetPage(bufpack[i]), recptr); - UnlockReleaseBuffer(bufpack[i]); - } + END_CRIT_SECTION(); + + for (i = 0; i < nbufs; i++) + UnlockReleaseBuffer(bufpack[i]); } } From 0c3fbb3fef1e805d39b7bae576a3b2da4f9e1858 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 19 Feb 2026 08:41:03 +0100 Subject: [PATCH 131/147] Remove useless fallthrough annotation A fallthrough attribute after the last case is a constraint violation in C23, and clang warns about it (not about this comment, but if we changed it to an attribute). Remove it. (There was apparently never anything after this to fall through to, even in the first commit da07a1e8565.) Reviewed-by: Jelte Fennema-Nio Discussion: https://www.postgresql.org/message-id/flat/76a8efcd-925a-4eaf-bdd1-d972cd1a32ff%40eisentraut.org --- src/backend/postmaster/postmaster.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index d6133bfebc..70c7645582 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -4229,7 +4229,6 @@ bgworker_should_start_now(BgWorkerStartTime start_time) case PM_INIT: if (start_time == BgWorkerStart_PostmasterStart) return true; - /* fall through */ } return false; From 8354b9d6b602ea549bc8d85cb404771505662a7b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 19 Feb 2026 08:41:03 +0100 Subject: [PATCH 132/147] Use fallthrough attribute instead of comment Instead of using comments to mark fallthrough switch cases, use the fallthrough attribute. This will (in the future, not here) allow supporting other compilers besides gcc. The commenting convention is only supported by gcc, the attribute is supported by clang, and in the fullness of time the C23 standard attribute would allow supporting other compilers as well. Right now, we package the attribute into a macro called pg_fallthrough. This commit defines that macro and replaces the existing comments with that macro invocation. We also raise the level of the gcc -Wimplicit-fallthrough= option from 3 to 5 to enforce the use of the attribute. 
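For reference, the macro can be defined along these lines (a sketch only; the exact definition added to c.h may differ in how it detects compiler support):

    #if defined(__has_attribute)
    #if __has_attribute(fallthrough)
    #define pg_fallthrough __attribute__((fallthrough))
    #endif
    #endif
    #ifndef pg_fallthrough
    #define pg_fallthrough ((void) 0)	/* no-op without attribute support */
    #endif

A converted call site then reads as follows, with handle_a() and handle_b() as placeholder functions:

    switch (c)
    {
        case 'a':
            handle_a();
            pg_fallthrough;	/* deliberately continue into the next case */
        case 'b':
            handle_b();
            break;
        default:
            break;
    }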
Reviewed-by: Jelte Fennema-Nio Discussion: https://www.postgresql.org/message-id/flat/76a8efcd-925a-4eaf-bdd1-d972cd1a32ff%40eisentraut.org --- configure | 40 ++--- configure.ac | 4 +- contrib/btree_gin/btree_gin.c | 2 +- contrib/ltree/ltxtquery_io.c | 2 +- contrib/pg_trgm/trgm_gin.c | 12 +- contrib/pg_trgm/trgm_gist.c | 8 +- contrib/pgcrypto/pgp-info.c | 2 +- meson.build | 2 +- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/nbtree/nbtpreprocesskeys.c | 4 +- src/backend/bootstrap/bootstrap.c | 2 +- src/backend/catalog/dependency.c | 2 +- src/backend/catalog/objectaddress.c | 4 +- src/backend/catalog/pg_shdepend.c | 2 +- src/backend/commands/explain.c | 2 +- src/backend/commands/tablecmds.c | 2 +- src/backend/commands/trigger.c | 2 +- src/backend/executor/nodeAgg.c | 2 +- src/backend/executor/nodeHash.c | 14 +- src/backend/executor/nodeHashjoin.c | 10 +- src/backend/executor/nodeLimit.c | 4 +- src/backend/executor/nodeTidrangescan.c | 4 +- src/backend/libpq/auth.c | 2 +- src/backend/optimizer/plan/planner.c | 4 +- src/backend/optimizer/util/clauses.c | 4 +- src/backend/parser/parse_jsontable.c | 2 +- src/backend/parser/parse_utilcmd.c | 2 +- src/backend/partitioning/partprune.c | 14 +- src/backend/postmaster/postmaster.c | 6 +- src/backend/regex/regc_lex.c | 2 +- src/backend/regex/regcomp.c | 2 +- .../replication/logical/reorderbuffer.c | 1 + src/backend/replication/logical/worker.c | 12 +- src/backend/replication/walreceiver.c | 2 +- src/backend/replication/walreceiverfuncs.c | 2 +- src/backend/storage/aio/aio.c | 2 +- src/backend/tcop/postgres.c | 4 +- src/backend/tcop/utility.c | 2 +- src/backend/utils/adt/datetime.c | 6 +- src/backend/utils/adt/formatting.c | 8 +- src/backend/utils/adt/jsonb.c | 2 +- src/backend/utils/adt/jsonb_util.c | 2 +- src/backend/utils/adt/jsonpath.c | 12 +- src/backend/utils/adt/numeric.c | 14 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/adt/timestamp.c | 54 +++---- src/backend/utils/adt/xml.c | 2 +- src/backend/utils/cache/catcache.c | 12 +- src/backend/utils/mb/mbutils.c | 6 +- src/backend/utils/misc/guc.c | 2 +- src/backend/utils/misc/guc_funcs.c | 2 +- src/backend/utils/sort/tuplestore.c | 2 +- src/bin/pgbench/pgbench.c | 6 +- src/common/hashfn.c | 144 +++++++++--------- src/common/wchar.c | 6 +- src/include/c.h | 12 ++ src/include/common/hashfn_unstable.h | 20 +-- src/interfaces/ecpg/pgtypeslib/interval.c | 6 +- src/interfaces/libpq/fe-secure.c | 2 +- src/pl/plpgsql/src/pl_exec.c | 6 +- src/pl/tcl/pltcl.c | 2 +- src/port/snprintf.c | 2 +- .../test_json_parser_incremental.c | 2 +- src/timezone/zic.c | 8 +- src/tools/pg_bsd_indent/indent.c | 4 +- src/tools/pg_bsd_indent/parse.c | 4 +- 66 files changed, 277 insertions(+), 264 deletions(-) diff --git a/configure b/configure index a10a2c85c6..a6eab39629 100755 --- a/configure +++ b/configure @@ -5616,15 +5616,15 @@ fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -Wimplicit-fallthrough=3, for CFLAGS" >&5 -$as_echo_n "checking whether ${CC} supports -Wimplicit-fallthrough=3, for CFLAGS... " >&6; } -if ${pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_3+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -Wimplicit-fallthrough=5, for CFLAGS" >&5 +$as_echo_n "checking whether ${CC} supports -Wimplicit-fallthrough=5, for CFLAGS... 
" >&6; } +if ${pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_5+:} false; then : $as_echo_n "(cached) " >&6 else pgac_save_CFLAGS=$CFLAGS pgac_save_CC=$CC CC=${CC} -CFLAGS="${CFLAGS} -Wimplicit-fallthrough=3" +CFLAGS="${CFLAGS} -Wimplicit-fallthrough=5" ac_save_c_werror_flag=$ac_c_werror_flag ac_c_werror_flag=yes cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -5639,31 +5639,31 @@ main () } _ACEOF if ac_fn_c_try_compile "$LINENO"; then : - pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_3=yes + pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_5=yes else - pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_3=no + pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_5=no fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext ac_c_werror_flag=$ac_save_c_werror_flag CFLAGS="$pgac_save_CFLAGS" CC="$pgac_save_CC" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_3" >&5 -$as_echo "$pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_3" >&6; } -if test x"$pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_3" = x"yes"; then - CFLAGS="${CFLAGS} -Wimplicit-fallthrough=3" +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_5" >&5 +$as_echo "$pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_5" >&6; } +if test x"$pgac_cv_prog_CC_cflags__Wimplicit_fallthrough_5" = x"yes"; then + CFLAGS="${CFLAGS} -Wimplicit-fallthrough=5" fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CXX} supports -Wimplicit-fallthrough=3, for CXXFLAGS" >&5 -$as_echo_n "checking whether ${CXX} supports -Wimplicit-fallthrough=3, for CXXFLAGS... " >&6; } -if ${pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_3+:} false; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CXX} supports -Wimplicit-fallthrough=5, for CXXFLAGS" >&5 +$as_echo_n "checking whether ${CXX} supports -Wimplicit-fallthrough=5, for CXXFLAGS... 
" >&6; } +if ${pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_5+:} false; then : $as_echo_n "(cached) " >&6 else pgac_save_CXXFLAGS=$CXXFLAGS pgac_save_CXX=$CXX CXX=${CXX} -CXXFLAGS="${CXXFLAGS} -Wimplicit-fallthrough=3" +CXXFLAGS="${CXXFLAGS} -Wimplicit-fallthrough=5" ac_save_cxx_werror_flag=$ac_cxx_werror_flag ac_cxx_werror_flag=yes ac_ext=cpp @@ -5684,9 +5684,9 @@ main () } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : - pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_3=yes + pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_5=yes else - pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_3=no + pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_5=no fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext ac_ext=c @@ -5699,10 +5699,10 @@ ac_cxx_werror_flag=$ac_save_cxx_werror_flag CXXFLAGS="$pgac_save_CXXFLAGS" CXX="$pgac_save_CXX" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_3" >&5 -$as_echo "$pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_3" >&6; } -if test x"$pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_3" = x"yes"; then - CXXFLAGS="${CXXFLAGS} -Wimplicit-fallthrough=3" +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_5" >&5 +$as_echo "$pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_5" >&6; } +if test x"$pgac_cv_prog_CXX_cxxflags__Wimplicit_fallthrough_5" = x"yes"; then + CXXFLAGS="${CXXFLAGS} -Wimplicit-fallthrough=5" fi diff --git a/configure.ac b/configure.ac index 814e64a967..455ba31f1d 100644 --- a/configure.ac +++ b/configure.ac @@ -556,8 +556,8 @@ if test "$GCC" = yes -a "$ICC" = no; then PGAC_PROG_CXX_CFLAGS_OPT([-Wendif-labels]) PGAC_PROG_CC_CFLAGS_OPT([-Wmissing-format-attribute]) PGAC_PROG_CXX_CFLAGS_OPT([-Wmissing-format-attribute]) - PGAC_PROG_CC_CFLAGS_OPT([-Wimplicit-fallthrough=3]) - PGAC_PROG_CXX_CFLAGS_OPT([-Wimplicit-fallthrough=3]) + PGAC_PROG_CC_CFLAGS_OPT([-Wimplicit-fallthrough=5]) + PGAC_PROG_CXX_CFLAGS_OPT([-Wimplicit-fallthrough=5]) PGAC_PROG_CC_CFLAGS_OPT([-Wcast-function-type]) PGAC_PROG_CXX_CFLAGS_OPT([-Wcast-function-type]) PGAC_PROG_CC_CFLAGS_OPT([-Wshadow=compatible-local]) diff --git a/contrib/btree_gin/btree_gin.c b/contrib/btree_gin/btree_gin.c index afb8b3820a..8dfbaa4781 100644 --- a/contrib/btree_gin/btree_gin.c +++ b/contrib/btree_gin/btree_gin.c @@ -120,7 +120,7 @@ gin_btree_extract_query(FunctionCallInfo fcinfo, case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: *ptr_partialmatch = true; - /* FALLTHROUGH */ + pg_fallthrough; case BTEqualStrategyNumber: /* If we have a conversion function, apply it */ if (cvt_fns && cvt_fns[rhs_code]) diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index d15f323539..f4296880c0 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -277,7 +277,7 @@ makepol(QPRS_STATE *state) case ERR: if (SOFT_ERROR_OCCURRED(state->escontext)) return ERR; - /* fall through */ + pg_fallthrough; default: ereturn(state->escontext, ERR, (errcode(ERRCODE_SYNTAX_ERROR), diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index 014bb3c848..5766b3e995 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -99,7 +99,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case LikeStrategyNumber: /* @@ -113,7 +113,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot 
handle ~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case RegExpStrategyNumber: trg = createTrgmNFA(val, PG_GET_COLLATION(), &graph, CurrentMemoryContext); @@ -224,7 +224,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case LikeStrategyNumber: case EqualStrategyNumber: /* Check if all extracted trigrams are presented. */ @@ -242,7 +242,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case RegExpStrategyNumber: if (nkeys < 1) { @@ -310,7 +310,7 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case LikeStrategyNumber: case EqualStrategyNumber: /* Check if all extracted trigrams are presented. */ @@ -328,7 +328,7 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case RegExpStrategyNumber: if (nkeys < 1) { diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 685275a0f9..11812b2984 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -248,7 +248,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case LikeStrategyNumber: qtrg = generate_wildcard_trgm(VARDATA(query), querysize - VARHDRSZ); @@ -257,7 +257,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case RegExpStrategyNumber: qtrg = createTrgmNFA(query, PG_GET_COLLATION(), &graph, fcinfo->flinfo->fn_mcxt); @@ -345,7 +345,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case LikeStrategyNumber: case EqualStrategyNumber: /* Wildcard and equal search are inexact */ @@ -387,7 +387,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) #ifndef IGNORECASE elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); #endif - /* FALL THRU */ + pg_fallthrough; case RegExpStrategyNumber: /* Regexp search is inexact */ *recheck = true; diff --git a/contrib/pgcrypto/pgp-info.c b/contrib/pgcrypto/pgp-info.c index 83dc60486b..6c2be4713a 100644 --- a/contrib/pgcrypto/pgp-info.c +++ b/contrib/pgcrypto/pgp-info.c @@ -169,7 +169,7 @@ pgp_get_keyid(MBuf *pgp_data, char *dst) break; case PGP_PKT_SYMENCRYPTED_SESSKEY: got_symenc_key++; - /* fall through */ + pg_fallthrough; case PGP_PKT_SIGNATURE: case PGP_PKT_MARKER: case PGP_PKT_TRUST: diff --git a/meson.build b/meson.build index f6d5842d85..055e96315d 100644 --- a/meson.build +++ b/meson.build @@ -2154,7 +2154,7 @@ common_warning_flags = [ '-Werror=unguarded-availability-new', '-Wendif-labels', '-Wmissing-format-attribute', - '-Wimplicit-fallthrough=3', + '-Wimplicit-fallthrough=5', '-Wcast-function-type', '-Wshadow=compatible-local', # This was included in -Wall/-Wformat in older GCC versions diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index cbef73e5d4..b83e2013d5 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -861,7 +861,7 @@ 
heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, break; case HEAPTUPLE_RECENTLY_DEAD: *tups_recently_dead += 1; - /* fall through */ + pg_fallthrough; case HEAPTUPLE_LIVE: /* Live or recently dead, must copy it */ isdead = false; diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index b028b0c3e8..39c0a5d610 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -1198,7 +1198,7 @@ _bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, { case BTLessStrategyNumber: cmpexact = 1; /* exclude exact match, if any */ - /* FALL THRU */ + pg_fallthrough; case BTLessEqualStrategyNumber: if (cmpresult >= cmpexact) matchelem++; @@ -1220,7 +1220,7 @@ _bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, break; case BTGreaterEqualStrategyNumber: cmpexact = 1; /* include exact match, if any */ - /* FALL THRU */ + pg_fallthrough; case BTGreaterStrategyNumber: if (cmpresult >= cmpexact) matchelem++; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 8d601c363b..e7699be55a 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -234,7 +234,7 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) (errcode(ERRCODE_SYNTAX_ERROR), errmsg("--%s must be first argument", optarg))); - /* FALLTHROUGH */ + pg_fallthrough; case 'c': { char *name, diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 7564965fa1..570c434ede 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -641,7 +641,7 @@ findDependentObjects(const ObjectAddress *object, break; /* Otherwise, treat this like an internal dependency */ - /* FALL THRU */ + pg_fallthrough; case DEPENDENCY_INTERNAL: diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 13d73f8909..d32aaff282 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -2232,7 +2232,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("name list length must be exactly %d", 1))); /* fall through to check args length */ - /* FALLTHROUGH */ + pg_fallthrough; case OBJECT_DOMCONSTRAINT: case OBJECT_CAST: case OBJECT_PUBLICATION_REL: @@ -2257,7 +2257,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("name list length must be at least %d", 3))); /* fall through to check args length */ - /* FALLTHROUGH */ + pg_fallthrough; case OBJECT_OPERATOR: if (list_length(args) != 2) ereport(ERROR, diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c index 3db41ecd74..c9998531b2 100644 --- a/src/backend/catalog/pg_shdepend.c +++ b/src/backend/catalog/pg_shdepend.c @@ -1458,7 +1458,7 @@ shdepDropOwned(List *roleids, DropBehavior behavior) sdepForm->objid); break; } - /* FALLTHROUGH */ + pg_fallthrough; case SHARED_DEPENDENCY_OWNER: diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index b9587983f8..93918a223b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2012,7 +2012,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_tablesample(((SampleScan *) plan)->tablesample, planstate, ancestors, es); /* fall through to print additional fields the same as SeqScan */ - /* FALLTHROUGH */ + pg_fallthrough; case T_SeqScan: case T_ValuesScan: case 
T_CteScan: diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index f976c0e5c7..2f5b7007ff 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16192,7 +16192,7 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock case RELKIND_TOASTVALUE: if (recursing) break; - /* FALL THRU */ + pg_fallthrough; default: ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 8df915f63f..98d402c0a3 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -4392,7 +4392,7 @@ AfterTriggerExecute(EState *estate, trig_tuple_slot2)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); } - /* fall through */ + pg_fallthrough; case AFTER_TRIGGER_FDW_REUSE: /* diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index baa76596ac..7d487a165f 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -2257,7 +2257,7 @@ ExecAgg(PlanState *pstate) case AGG_HASHED: if (!node->table_filled) agg_fill_hash_table(node); - /* FALLTHROUGH */ + pg_fallthrough; case AGG_MIXED: result = agg_retrieve_hash_table(node); break; diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index f5d3edb90e..c0eb5a1f0c 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -258,7 +258,7 @@ MultiExecParallelHash(HashState *node) * way, wait for everyone to arrive here so we can proceed. */ BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ALLOCATE); - /* Fall through. */ + pg_fallthrough; case PHJ_BUILD_HASH_INNER: @@ -1330,13 +1330,13 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) /* All other participants just flush their tuples to disk. */ ExecParallelHashCloseBatchAccessors(hashtable); } - /* Fall through. */ + pg_fallthrough; case PHJ_GROW_BATCHES_REALLOCATE: /* Wait for the above to be finished. */ BarrierArriveAndWait(&pstate->grow_batches_barrier, WAIT_EVENT_HASH_GROW_BATCHES_REALLOCATE); - /* Fall through. */ + pg_fallthrough; case PHJ_GROW_BATCHES_REPARTITION: /* Make sure that we have the current dimensions and buckets. */ @@ -1349,7 +1349,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) /* Wait for the above to be finished. */ BarrierArriveAndWait(&pstate->grow_batches_barrier, WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION); - /* Fall through. */ + pg_fallthrough; case PHJ_GROW_BATCHES_DECIDE: @@ -1411,7 +1411,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) dsa_free(hashtable->area, pstate->old_batches); pstate->old_batches = InvalidDsaPointer; } - /* Fall through. */ + pg_fallthrough; case PHJ_GROW_BATCHES_FINISH: /* Wait for the above to complete. */ @@ -1689,13 +1689,13 @@ ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable) /* Clear the flag. */ pstate->growth = PHJ_GROWTH_OK; } - /* Fall through. */ + pg_fallthrough; case PHJ_GROW_BUCKETS_REALLOCATE: /* Wait for the above to complete. */ BarrierArriveAndWait(&pstate->grow_buckets_barrier, WAIT_EVENT_HASH_GROW_BUCKETS_REALLOCATE); - /* Fall through. */ + pg_fallthrough; case PHJ_GROW_BUCKETS_REINSERT: /* Reinsert all tuples into the hash table. 
*/ diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 114620a813..5aa8a09b26 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -416,7 +416,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) else node->hj_JoinState = HJ_NEED_NEW_OUTER; - /* FALL THRU */ + pg_fallthrough; case HJ_NEED_NEW_OUTER: @@ -505,7 +505,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) /* OK, let's scan the bucket for matches */ node->hj_JoinState = HJ_SCAN_BUCKET; - /* FALL THRU */ + pg_fallthrough; case HJ_SCAN_BUCKET: @@ -1313,13 +1313,13 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) if (BarrierArriveAndWait(batch_barrier, WAIT_EVENT_HASH_BATCH_ELECT)) ExecParallelHashTableAlloc(hashtable, batchno); - /* Fall through. */ + pg_fallthrough; case PHJ_BATCH_ALLOCATE: /* Wait for allocation to complete. */ BarrierArriveAndWait(batch_barrier, WAIT_EVENT_HASH_BATCH_ALLOCATE); - /* Fall through. */ + pg_fallthrough; case PHJ_BATCH_LOAD: /* Start (or join in) loading tuples. */ @@ -1339,7 +1339,7 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) sts_end_parallel_scan(inner_tuples); BarrierArriveAndWait(batch_barrier, WAIT_EVENT_HASH_BATCH_LOAD); - /* Fall through. */ + pg_fallthrough; case PHJ_BATCH_PROBE: diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c index c40a73dcf1..8f75cbbead 100644 --- a/src/backend/executor/nodeLimit.c +++ b/src/backend/executor/nodeLimit.c @@ -68,7 +68,7 @@ ExecLimit(PlanState *pstate) */ recompute_limits(node); - /* FALL THRU */ + pg_fallthrough; case LIMIT_RESCAN: @@ -215,7 +215,7 @@ ExecLimit(PlanState *pstate) } Assert(node->lstate == LIMIT_WINDOWEND_TIES); - /* FALL THRU */ + pg_fallthrough; case LIMIT_WINDOWEND_TIES: if (ScanDirectionIsForward(direction)) diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 4aa28918e9..503817da65 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -79,13 +79,13 @@ MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) { case TIDLessEqOperator: tidopexpr->inclusive = true; - /* fall through */ + pg_fallthrough; case TIDLessOperator: tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND; break; case TIDGreaterEqOperator: tidopexpr->inclusive = true; - /* fall through */ + pg_fallthrough; case TIDGreaterOperator: tidopexpr->exprtype = invert ? 
TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND; break; diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 795bfed8d1..e04aa2e68e 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -2002,7 +2002,7 @@ pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, ereport(LOG, (errmsg("error from underlying PAM layer: %s", msg[i]->msg))); - /* FALL THROUGH */ + pg_fallthrough; case PAM_TEXT_INFO: /* we don't bother to log TEXT_INFO messages */ if ((reply[i].resp = strdup("")) == NULL) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 006b328196..42604a0f75 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3464,11 +3464,11 @@ adjust_group_pathkeys_for_groupagg(PlannerInfo *root) case PATHKEYS_BETTER2: /* 'pathkeys' are stronger, use these ones instead */ currpathkeys = pathkeys; - /* FALLTHROUGH */ + pg_fallthrough; case PATHKEYS_BETTER1: /* 'pathkeys' are less strict */ - /* FALLTHROUGH */ + pg_fallthrough; case PATHKEYS_EQUAL: /* mark this aggregate as covered by 'currpathkeys' */ diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 504a30d883..a41d81734c 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -1547,7 +1547,7 @@ find_nonnullable_rels_walker(Node *node, bool top_level) * the intersection of the sets of nonnullable rels, just as * for OR. Fall through to share code. */ - /* FALL THRU */ + pg_fallthrough; case OR_EXPR: /* @@ -1805,7 +1805,7 @@ find_nonnullable_vars_walker(Node *node, bool top_level) * the intersection of the sets of nonnullable vars, just as * for OR. Fall through to share code. */ - /* FALL THRU */ + pg_fallthrough; case OR_EXPR: /* diff --git a/src/backend/parser/parse_jsontable.c b/src/backend/parser/parse_jsontable.c index c28ae99dee..32a1e8629b 100644 --- a/src/backend/parser/parse_jsontable.c +++ b/src/backend/parser/parse_jsontable.c @@ -312,7 +312,7 @@ transformJsonTableColumns(JsonTableParseContext *cxt, List *columns, rawc->wrapper != JSW_UNSPEC) rawc->coltype = JTC_FORMATTED; - /* FALLTHROUGH */ + pg_fallthrough; case JTC_FORMATTED: case JTC_EXISTS: { diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index b5f4c72459..cc244c49e9 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -918,7 +918,7 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) errmsg("primary key constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - /* FALL THRU */ + pg_fallthrough; case CONSTR_UNIQUE: if (cxt->isforeign) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index a4bbb10a3b..6d979a08fd 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -2880,7 +2880,7 @@ get_matching_list_bounds(PartitionPruneContext *context, case BTGreaterEqualStrategyNumber: inclusive = true; - /* fall through */ + pg_fallthrough; case BTGreaterStrategyNumber: off = partition_list_bsearch(partsupfunc, partcollation, @@ -2915,7 +2915,7 @@ get_matching_list_bounds(PartitionPruneContext *context, case BTLessEqualStrategyNumber: inclusive = true; - /* fall through */ + pg_fallthrough; case BTLessStrategyNumber: off = partition_list_bsearch(partsupfunc, partcollation, @@ -3162,7 +3162,7 @@ get_matching_range_bounds(PartitionPruneContext *context, case 
BTGreaterEqualStrategyNumber: inclusive = true; - /* fall through */ + pg_fallthrough; case BTGreaterStrategyNumber: /* @@ -3243,7 +3243,7 @@ get_matching_range_bounds(PartitionPruneContext *context, case BTLessEqualStrategyNumber: inclusive = true; - /* fall through */ + pg_fallthrough; case BTLessStrategyNumber: /* @@ -3726,19 +3726,19 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, { case IS_NOT_TRUE: *notclause = true; - /* fall through */ + pg_fallthrough; case IS_TRUE: *outconst = (Expr *) makeBoolConst(true, false); return PARTCLAUSE_MATCH_CLAUSE; case IS_NOT_FALSE: *notclause = true; - /* fall through */ + pg_fallthrough; case IS_FALSE: *outconst = (Expr *) makeBoolConst(false, false); return PARTCLAUSE_MATCH_CLAUSE; case IS_NOT_UNKNOWN: *notclause = true; - /* fall through */ + pg_fallthrough; case IS_UNKNOWN: return PARTCLAUSE_MATCH_NULLNESS; default: diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 70c7645582..3fac46c402 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -625,7 +625,7 @@ PostmasterMain(int argc, char *argv[]) (errcode(ERRCODE_SYNTAX_ERROR), errmsg("--%s must be first argument", optarg))); - /* FALLTHROUGH */ + pg_fallthrough; case 'c': { char *name, @@ -4217,12 +4217,12 @@ bgworker_should_start_now(BgWorkerStartTime start_time) case PM_RUN: if (start_time == BgWorkerStart_RecoveryFinished) return true; - /* fall through */ + pg_fallthrough; case PM_HOT_STANDBY: if (start_time == BgWorkerStart_ConsistentState) return true; - /* fall through */ + pg_fallthrough; case PM_RECOVERY: case PM_STARTUP: diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 9087ef95af..55df64f9ad 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -743,7 +743,7 @@ lexescape(struct vars *v) /* oops, doesn't look like it's a backref after all... 
*/ v->now = save; /* and fall through into octal number */ - /* FALLTHROUGH */ + pg_fallthrough; case CHR('0'): NOTE(REG_UUNPORT); v->now--; /* put first digit back */ diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 3e18e4a78a..820995332b 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -975,7 +975,7 @@ parseqatom(struct vars *v, /* legal in EREs due to specification botch */ NOTE(REG_UPBOTCH); /* fall through into case PLAIN */ - /* FALLTHROUGH */ + pg_fallthrough; case PLAIN: onechr(v, v->nextvalue, lp, rp); okcolors(v->nfa, v->cm); diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 94b2b29945..e832fa0d8e 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -2322,6 +2322,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, change->action = REORDER_BUFFER_CHANGE_INSERT; /* intentionally fall through */ + pg_fallthrough; case REORDER_BUFFER_CHANGE_INSERT: case REORDER_BUFFER_CHANGE_UPDATE: case REORDER_BUFFER_CHANGE_DELETE: diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index f179d08184..29933f5301 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -839,7 +839,7 @@ handle_streamed_transaction(LogicalRepMsgType action, StringInfo s) */ pa_switch_to_partial_serialize(winfo, false); - /* fall through */ + pg_fallthrough; case TRANS_LEADER_PARTIAL_SERIALIZE: stream_write_change(action, &original_msg); @@ -1586,7 +1586,7 @@ apply_handle_stream_prepare(StringInfo s) */ pa_switch_to_partial_serialize(winfo, true); - /* fall through */ + pg_fallthrough; case TRANS_LEADER_PARTIAL_SERIALIZE: Assert(winfo); @@ -1808,7 +1808,7 @@ apply_handle_stream_start(StringInfo s) */ pa_switch_to_partial_serialize(winfo, !first_segment); - /* fall through */ + pg_fallthrough; case TRANS_LEADER_PARTIAL_SERIALIZE: Assert(winfo); @@ -1923,7 +1923,7 @@ apply_handle_stream_stop(StringInfo s) */ pa_switch_to_partial_serialize(winfo, true); - /* fall through */ + pg_fallthrough; case TRANS_LEADER_PARTIAL_SERIALIZE: stream_write_change(LOGICAL_REP_MSG_STREAM_STOP, s); stream_stop_internal(stream_xid); @@ -2169,7 +2169,7 @@ apply_handle_stream_abort(StringInfo s) */ pa_switch_to_partial_serialize(winfo, true); - /* fall through */ + pg_fallthrough; case TRANS_LEADER_PARTIAL_SERIALIZE: Assert(winfo); @@ -2442,7 +2442,7 @@ apply_handle_stream_commit(StringInfo s) */ pa_switch_to_partial_serialize(winfo, true); - /* fall through */ + pg_fallthrough; case TRANS_LEADER_PARTIAL_SERIALIZE: Assert(winfo); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 10e64a7d1f..7c1b8757d7 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -192,7 +192,7 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) case WALRCV_STOPPING: /* If we've already been requested to stop, don't start up. 
*/ walrcv->walRcvState = WALRCV_STOPPED; - /* fall through */ + pg_fallthrough; case WALRCV_STOPPED: SpinLockRelease(&walrcv->mutex); diff --git a/src/backend/replication/walreceiverfuncs.c b/src/backend/replication/walreceiverfuncs.c index 42e3e170bc..e62e8a2042 100644 --- a/src/backend/replication/walreceiverfuncs.c +++ b/src/backend/replication/walreceiverfuncs.c @@ -216,7 +216,7 @@ ShutdownWalRcv(void) case WALRCV_WAITING: case WALRCV_RESTARTING: walrcv->walRcvState = WALRCV_STOPPING; - /* fall through */ + pg_fallthrough; case WALRCV_STOPPING: walrcvpid = walrcv->pid; break; diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index d2c9cd6f20..e4ae3031fe 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -622,7 +622,7 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) pgaio_method_ops->wait_one(ioh, ref_generation); continue; } - /* fallthrough */ + pg_fallthrough; /* waiting for owner to submit */ case PGAIO_HS_DEFINED: diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 21de158adb..d01a09dd0c 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3900,7 +3900,7 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("--%s must be first argument", optarg))); - /* FALLTHROUGH */ + pg_fallthrough; case 'c': { char *name, @@ -5024,7 +5024,7 @@ PostgresMain(const char *dbname, const char *username) /* for the cumulative statistics system */ pgStatSessionEndCause = DISCONNECT_CLIENT_EOF; - /* FALLTHROUGH */ + pg_fallthrough; case PqMsg_Terminate: diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 34dd6e18df..bf707f2d57 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2001,7 +2001,7 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) if (stmt->concurrent) PreventInTransactionBlock(isTopLevel, "DROP INDEX CONCURRENTLY"); - /* fall through */ + pg_fallthrough; case OBJECT_TABLE: case OBJECT_SEQUENCE: diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 8dc0ac5062..90946db72f 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -3594,7 +3594,7 @@ DecodeInterval(char **field, int *ftype, int nf, int range, * handle signed float numbers and signed year-month values. 
*/ - /* FALLTHROUGH */ + pg_fallthrough; case DTK_DATE: case DTK_NUMBER: @@ -4028,7 +4028,7 @@ DecodeISO8601Interval(char *str, continue; } /* Else fall through to extended alternative format */ - /* FALLTHROUGH */ + pg_fallthrough; case '-': /* ISO 8601 4.4.3.3 Alternative Format, * Extended */ if (havefield) @@ -4111,7 +4111,7 @@ DecodeISO8601Interval(char *str, return 0; } /* Else fall through to extended alternative format */ - /* FALLTHROUGH */ + pg_fallthrough; case ':': /* ISO 8601 4.4.3.3 Alternative Format, * Extended */ if (havefield) diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7720911a6a..0716aff22b 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1236,7 +1236,7 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n) case NUM_D: num->flag |= NUM_F_LDECIMAL; num->need_locale = true; - /* FALLTHROUGH */ + pg_fallthrough; case NUM_DEC: if (IS_DECIMAL(num)) ereport(ERROR, @@ -3022,7 +3022,7 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col s += strlen(s); break; case DCH_RM: - /* FALLTHROUGH */ + pg_fallthrough; case DCH_rm: /* @@ -3300,7 +3300,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, case DCH_FF5: case DCH_FF6: out->ff = n->key->id - DCH_FF1 + 1; - /* FALLTHROUGH */ + pg_fallthrough; case DCH_US: /* microsecond */ len = from_char_parse_int_len(&out->us, &s, n->key->id == DCH_US ? 6 : @@ -3354,7 +3354,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, } /* otherwise parse it like OF */ } - /* FALLTHROUGH */ + pg_fallthrough; case DCH_OF: /* OF is equivalent to TZH or TZH:TZM */ /* see TZH comments below */ diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c index 28e7f80d77..0a3a77ee78 100644 --- a/src/backend/utils/adt/jsonb.c +++ b/src/backend/utils/adt/jsonb.c @@ -772,7 +772,7 @@ datum_to_jsonb_internal(Datum val, bool is_null, JsonbInState *result, case JSONTYPE_CAST: /* cast to JSON, and then process as JSON */ val = OidFunctionCall1(outfuncoid, val); - /* FALL THROUGH */ + pg_fallthrough; case JSONTYPE_JSON: { /* parse the json right into the existing result object */ diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index e085042f91..91fb9ea09b 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -721,7 +721,7 @@ pushJsonbValueScalar(JsonbInState *pstate, JsonbIteratorToken seq, uniqueifyJsonbObject(&ppstate->contVal, ppstate->unique_keys, ppstate->skip_nulls); - /* fall through! 
*/ + pg_fallthrough; case WJB_END_ARRAY: /* Steps here common to WJB_END_OBJECT case */ Assert(!scalarVal); diff --git a/src/backend/utils/adt/jsonpath.c b/src/backend/utils/adt/jsonpath.c index 18a8046d6c..d70ff1eaa5 100644 --- a/src/backend/utils/adt/jsonpath.c +++ b/src/backend/utils/adt/jsonpath.c @@ -351,7 +351,7 @@ flattenJsonPathParseItem(StringInfo buf, int *result, struct Node *escontext, break; case jpiFilter: argNestingLevel++; - /* FALLTHROUGH */ + pg_fallthrough; case jpiIsUnknown: case jpiNot: case jpiPlus: @@ -487,13 +487,13 @@ alignStringInfoInt(StringInfo buf) { case 3: appendStringInfoCharMacro(buf, 0); - /* FALLTHROUGH */ + pg_fallthrough; case 2: appendStringInfoCharMacro(buf, 0); - /* FALLTHROUGH */ + pg_fallthrough; case 1: appendStringInfoCharMacro(buf, 0); - /* FALLTHROUGH */ + pg_fallthrough; default: break; } @@ -1021,7 +1021,7 @@ jspInitByBuffer(JsonPathItem *v, char *base, int32 pos) case jpiKey: case jpiVariable: read_int32(v->content.value.datalen, base, pos); - /* FALLTHROUGH */ + pg_fallthrough; case jpiNumeric: case jpiBool: v->content.value.data = base + pos; @@ -1433,7 +1433,7 @@ jspIsMutableWalker(JsonPathItem *jpi, struct JsonPathMutableContext *cxt) jspIsMutableWalker(&from, cxt); } - /* FALLTHROUGH */ + pg_fallthrough; case jpiAnyArray: if (!cxt->lax) diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 3bd3635d98..d25b8ad505 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2378,13 +2378,13 @@ numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss) { default: result |= ((int64) var->digits[3]); - /* FALLTHROUGH */ + pg_fallthrough; case 3: result |= ((int64) var->digits[2]) << 14; - /* FALLTHROUGH */ + pg_fallthrough; case 2: result |= ((int64) var->digits[1]) << 28; - /* FALLTHROUGH */ + pg_fallthrough; case 1: result |= ((int64) var->digits[0]) << 42; break; @@ -8818,22 +8818,22 @@ mul_var_short(const NumericVar *var1, const NumericVar *var2, term = PRODSUM5(var1digits, 0, var2digits, 4) + carry; res_digits[5] = (NumericDigit) (term % NBASE); carry = term / NBASE; - /* FALLTHROUGH */ + pg_fallthrough; case 5: term = PRODSUM4(var1digits, 0, var2digits, 3) + carry; res_digits[4] = (NumericDigit) (term % NBASE); carry = term / NBASE; - /* FALLTHROUGH */ + pg_fallthrough; case 4: term = PRODSUM3(var1digits, 0, var2digits, 2) + carry; res_digits[3] = (NumericDigit) (term % NBASE); carry = term / NBASE; - /* FALLTHROUGH */ + pg_fallthrough; case 3: term = PRODSUM2(var1digits, 0, var2digits, 1) + carry; res_digits[2] = (NumericDigit) (term % NBASE); carry = term / NBASE; - /* FALLTHROUGH */ + pg_fallthrough; case 2: term = PRODSUM1(var1digits, 0, var2digits, 0) + carry; res_digits[1] = (NumericDigit) (term % NBASE); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 89cbdd3b1e..f16f153578 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -9001,7 +9001,7 @@ isSimpleNode(Node *node, Node *parentNode, int prettyFlags) } /* else do the same stuff as for T_SubLink et al. 
*/ } - /* FALLTHROUGH */ + pg_fallthrough; case T_SubLink: case T_NullTest: diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 8deb236947..e2603183f1 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -4744,14 +4744,14 @@ timestamp_trunc(PG_FUNCTION_ARGS) tm->tm_year = ((tm->tm_year + 999) / 1000) * 1000 - 999; else tm->tm_year = -((999 - (tm->tm_year - 1)) / 1000) * 1000 + 1; - /* FALL THRU */ + pg_fallthrough; case DTK_CENTURY: /* see comments in timestamptz_trunc */ if (tm->tm_year > 0) tm->tm_year = ((tm->tm_year + 99) / 100) * 100 - 99; else tm->tm_year = -((99 - (tm->tm_year - 1)) / 100) * 100 + 1; - /* FALL THRU */ + pg_fallthrough; case DTK_DECADE: /* see comments in timestamptz_trunc */ if (val != DTK_MILLENNIUM && val != DTK_CENTURY) @@ -4761,25 +4761,25 @@ timestamp_trunc(PG_FUNCTION_ARGS) else tm->tm_year = -((8 - (tm->tm_year - 1)) / 10) * 10; } - /* FALL THRU */ + pg_fallthrough; case DTK_YEAR: tm->tm_mon = 1; - /* FALL THRU */ + pg_fallthrough; case DTK_QUARTER: tm->tm_mon = (3 * ((tm->tm_mon - 1) / 3)) + 1; - /* FALL THRU */ + pg_fallthrough; case DTK_MONTH: tm->tm_mday = 1; - /* FALL THRU */ + pg_fallthrough; case DTK_DAY: tm->tm_hour = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_HOUR: tm->tm_min = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_MINUTE: tm->tm_sec = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_SECOND: fsec = 0; break; @@ -4990,14 +4990,14 @@ timestamptz_trunc_internal(text *units, TimestampTz timestamp, pg_tz *tzp) tm->tm_year = ((tm->tm_year + 999) / 1000) * 1000 - 999; else tm->tm_year = -((999 - (tm->tm_year - 1)) / 1000) * 1000 + 1; - /* FALL THRU */ + pg_fallthrough; case DTK_CENTURY: /* truncating to the century? as above: -100, 1, 101... 
*/ if (tm->tm_year > 0) tm->tm_year = ((tm->tm_year + 99) / 100) * 100 - 99; else tm->tm_year = -((99 - (tm->tm_year - 1)) / 100) * 100 + 1; - /* FALL THRU */ + pg_fallthrough; case DTK_DECADE: /* @@ -5011,26 +5011,26 @@ timestamptz_trunc_internal(text *units, TimestampTz timestamp, pg_tz *tzp) else tm->tm_year = -((8 - (tm->tm_year - 1)) / 10) * 10; } - /* FALL THRU */ + pg_fallthrough; case DTK_YEAR: tm->tm_mon = 1; - /* FALL THRU */ + pg_fallthrough; case DTK_QUARTER: tm->tm_mon = (3 * ((tm->tm_mon - 1) / 3)) + 1; - /* FALL THRU */ + pg_fallthrough; case DTK_MONTH: tm->tm_mday = 1; - /* FALL THRU */ + pg_fallthrough; case DTK_DAY: tm->tm_hour = 0; redotz = true; /* for all cases >= DAY */ - /* FALL THRU */ + pg_fallthrough; case DTK_HOUR: tm->tm_min = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_MINUTE: tm->tm_sec = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_SECOND: fsec = 0; break; @@ -5171,33 +5171,33 @@ interval_trunc(PG_FUNCTION_ARGS) case DTK_MILLENNIUM: /* caution: C division may have negative remainder */ tm->tm_year = (tm->tm_year / 1000) * 1000; - /* FALL THRU */ + pg_fallthrough; case DTK_CENTURY: /* caution: C division may have negative remainder */ tm->tm_year = (tm->tm_year / 100) * 100; - /* FALL THRU */ + pg_fallthrough; case DTK_DECADE: /* caution: C division may have negative remainder */ tm->tm_year = (tm->tm_year / 10) * 10; - /* FALL THRU */ + pg_fallthrough; case DTK_YEAR: tm->tm_mon = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_QUARTER: tm->tm_mon = 3 * (tm->tm_mon / 3); - /* FALL THRU */ + pg_fallthrough; case DTK_MONTH: tm->tm_mday = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_DAY: tm->tm_hour = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_HOUR: tm->tm_min = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_MINUTE: tm->tm_sec = 0; - /* FALL THRU */ + pg_fallthrough; case DTK_SECOND: tm->tm_usec = 0; break; diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index fcb13e7c0a..ac675d5021 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -2186,7 +2186,7 @@ xml_errorHandler(void *data, PgXmlErrorPtr error) if (error->code == XML_ERR_NOT_WELL_BALANCED && xmlerrcxt->err_occurred) return; - /* fall through */ + pg_fallthrough; case XML_FROM_NONE: case XML_FROM_MEMORY: diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 681aa92340..519089322f 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -357,15 +357,15 @@ CatalogCacheComputeHashValue(CatCache *cache, int nkeys, case 4: oneHash = (cc_hashfunc[3]) (v4); hashValue ^= pg_rotate_left32(oneHash, 24); - /* FALLTHROUGH */ + pg_fallthrough; case 3: oneHash = (cc_hashfunc[2]) (v3); hashValue ^= pg_rotate_left32(oneHash, 16); - /* FALLTHROUGH */ + pg_fallthrough; case 2: oneHash = (cc_hashfunc[1]) (v2); hashValue ^= pg_rotate_left32(oneHash, 8); - /* FALLTHROUGH */ + pg_fallthrough; case 1: oneHash = (cc_hashfunc[0]) (v1); hashValue ^= oneHash; @@ -403,21 +403,21 @@ CatalogCacheComputeTupleHashValue(CatCache *cache, int nkeys, HeapTuple tuple) cc_tupdesc, &isNull); Assert(!isNull); - /* FALLTHROUGH */ + pg_fallthrough; case 3: v3 = fastgetattr(tuple, cc_keyno[2], cc_tupdesc, &isNull); Assert(!isNull); - /* FALLTHROUGH */ + pg_fallthrough; case 2: v2 = fastgetattr(tuple, cc_keyno[1], cc_tupdesc, &isNull); Assert(!isNull); - /* FALLTHROUGH */ + pg_fallthrough; case 1: v1 = fastgetattr(tuple, cc_keyno[0], diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 
f3f94d4654..78f4d5e202 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -1501,7 +1501,7 @@ pg_utf8_increment(unsigned char *charptr, int length) charptr[3]++; break; } - /* FALL THRU */ + pg_fallthrough; case 3: a = charptr[2]; if (a < 0xBF) @@ -1509,7 +1509,7 @@ pg_utf8_increment(unsigned char *charptr, int length) charptr[2]++; break; } - /* FALL THRU */ + pg_fallthrough; case 2: a = charptr[1]; switch (*charptr) @@ -1529,7 +1529,7 @@ pg_utf8_increment(unsigned char *charptr, int length) charptr[1]++; break; } - /* FALL THRU */ + pg_fallthrough; case 1: a = *charptr; if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ae9d5f3fb7..d77502838c 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3415,7 +3415,7 @@ set_config_with_handle(const char *name, config_handle *handle, } } /* fall through to process the same as PGC_BACKEND */ - /* FALLTHROUGH */ + pg_fallthrough; case PGC_BACKEND: if (context == PGC_SIGHUP) { diff --git a/src/backend/utils/misc/guc_funcs.c b/src/backend/utils/misc/guc_funcs.c index 4f3e40bf47..8524dd3a98 100644 --- a/src/backend/utils/misc/guc_funcs.c +++ b/src/backend/utils/misc/guc_funcs.c @@ -139,7 +139,7 @@ ExecSetVariableStmt(VariableSetStmt *stmt, bool isTopLevel) case VAR_SET_DEFAULT: if (stmt->is_local) WarnNoTransactionBlock(isTopLevel, "SET LOCAL"); - /* fall through */ + pg_fallthrough; case VAR_RESET: (void) set_config_option(stmt->name, NULL, diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index e57fa5bca6..afba82f28a 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1024,7 +1024,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, (errcode_for_file_access(), errmsg("could not seek in tuplestore temporary file"))); state->status = TSS_READFILE; - /* FALLTHROUGH */ + pg_fallthrough; case TSS_READFILE: *should_free = true; diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 58735871c1..cb4e986092 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -3394,7 +3394,7 @@ readCommandResponse(CState *st, MetaCommand meta, char *varprefix) commandError(st, PQresultErrorMessage(res)); goto error; } - /* fall through */ + pg_fallthrough; default: /* anything else is unexpected */ @@ -3607,7 +3607,7 @@ getTransactionStatus(PGconn *con) /* PQTRANS_UNKNOWN is expected given a broken connection */ if (PQstatus(con) == CONNECTION_BAD) return TSTATUS_CONN_ERROR; - /* fall through */ + pg_fallthrough; case PQTRANS_ACTIVE: default: @@ -5720,7 +5720,7 @@ postprocess_sql_command(Command *my_command) break; case QUERY_PREPARED: my_command->prepname = psprintf("P_%d", prepnum++); - /* fall through */ + pg_fallthrough; case QUERY_EXTENDED: if (!parseQuery(my_command)) exit(1); diff --git a/src/common/hashfn.c b/src/common/hashfn.c index 0efe95568c..c7a0626f96 100644 --- a/src/common/hashfn.c +++ b/src/common/hashfn.c @@ -178,13 +178,13 @@ hash_bytes(const unsigned char *k, int keylen) { case 11: c += ((uint32) k[10] << 8); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 24); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += ka[1]; @@ -192,22 +192,22 @@ hash_bytes(const unsigned char *k, int keylen) break; case 7: b += ((uint32) k[6] << 8); - /* fall 
through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 16); - /* fall through */ + pg_fallthrough; case 5: b += ((uint32) k[4] << 24); - /* fall through */ + pg_fallthrough; case 4: a += ka[0]; break; case 3: a += ((uint32) k[2] << 8); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 16); - /* fall through */ + pg_fallthrough; case 1: a += ((uint32) k[0] << 24); /* case 0: nothing left to add */ @@ -217,13 +217,13 @@ hash_bytes(const unsigned char *k, int keylen) { case 11: c += ((uint32) k[10] << 24); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 8); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += ka[1]; @@ -231,22 +231,22 @@ hash_bytes(const unsigned char *k, int keylen) break; case 7: b += ((uint32) k[6] << 16); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 8); - /* fall through */ + pg_fallthrough; case 5: b += k[4]; - /* fall through */ + pg_fallthrough; case 4: a += ka[0]; break; case 3: a += ((uint32) k[2] << 16); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 8); - /* fall through */ + pg_fallthrough; case 1: a += k[0]; /* case 0: nothing left to add */ @@ -280,35 +280,35 @@ hash_bytes(const unsigned char *k, int keylen) { case 11: c += ((uint32) k[10] << 8); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 24); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += k[7]; - /* fall through */ + pg_fallthrough; case 7: b += ((uint32) k[6] << 8); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 16); - /* fall through */ + pg_fallthrough; case 5: b += ((uint32) k[4] << 24); - /* fall through */ + pg_fallthrough; case 4: a += k[3]; - /* fall through */ + pg_fallthrough; case 3: a += ((uint32) k[2] << 8); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 16); - /* fall through */ + pg_fallthrough; case 1: a += ((uint32) k[0] << 24); /* case 0: nothing left to add */ @@ -318,35 +318,35 @@ hash_bytes(const unsigned char *k, int keylen) { case 11: c += ((uint32) k[10] << 24); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 8); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += ((uint32) k[7] << 24); - /* fall through */ + pg_fallthrough; case 7: b += ((uint32) k[6] << 16); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 8); - /* fall through */ + pg_fallthrough; case 5: b += k[4]; - /* fall through */ + pg_fallthrough; case 4: a += ((uint32) k[3] << 24); - /* fall through */ + pg_fallthrough; case 3: a += ((uint32) k[2] << 16); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 8); - /* fall through */ + pg_fallthrough; case 1: a += k[0]; /* case 0: nothing left to add */ @@ -417,13 +417,13 @@ hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed) { case 11: c += ((uint32) k[10] << 8); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 24); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += ka[1]; @@ -431,22 +431,22 @@ 
hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed) break; case 7: b += ((uint32) k[6] << 8); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 16); - /* fall through */ + pg_fallthrough; case 5: b += ((uint32) k[4] << 24); - /* fall through */ + pg_fallthrough; case 4: a += ka[0]; break; case 3: a += ((uint32) k[2] << 8); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 16); - /* fall through */ + pg_fallthrough; case 1: a += ((uint32) k[0] << 24); /* case 0: nothing left to add */ @@ -456,13 +456,13 @@ hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed) { case 11: c += ((uint32) k[10] << 24); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 8); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += ka[1]; @@ -470,22 +470,22 @@ hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed) break; case 7: b += ((uint32) k[6] << 16); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 8); - /* fall through */ + pg_fallthrough; case 5: b += k[4]; - /* fall through */ + pg_fallthrough; case 4: a += ka[0]; break; case 3: a += ((uint32) k[2] << 16); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 8); - /* fall through */ + pg_fallthrough; case 1: a += k[0]; /* case 0: nothing left to add */ @@ -519,35 +519,35 @@ hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed) { case 11: c += ((uint32) k[10] << 8); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 24); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += k[7]; - /* fall through */ + pg_fallthrough; case 7: b += ((uint32) k[6] << 8); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 16); - /* fall through */ + pg_fallthrough; case 5: b += ((uint32) k[4] << 24); - /* fall through */ + pg_fallthrough; case 4: a += k[3]; - /* fall through */ + pg_fallthrough; case 3: a += ((uint32) k[2] << 8); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 16); - /* fall through */ + pg_fallthrough; case 1: a += ((uint32) k[0] << 24); /* case 0: nothing left to add */ @@ -557,35 +557,35 @@ hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed) { case 11: c += ((uint32) k[10] << 24); - /* fall through */ + pg_fallthrough; case 10: c += ((uint32) k[9] << 16); - /* fall through */ + pg_fallthrough; case 9: c += ((uint32) k[8] << 8); - /* fall through */ + pg_fallthrough; case 8: /* the lowest byte of c is reserved for the length */ b += ((uint32) k[7] << 24); - /* fall through */ + pg_fallthrough; case 7: b += ((uint32) k[6] << 16); - /* fall through */ + pg_fallthrough; case 6: b += ((uint32) k[5] << 8); - /* fall through */ + pg_fallthrough; case 5: b += k[4]; - /* fall through */ + pg_fallthrough; case 4: a += ((uint32) k[3] << 24); - /* fall through */ + pg_fallthrough; case 3: a += ((uint32) k[2] << 16); - /* fall through */ + pg_fallthrough; case 2: a += ((uint32) k[1] << 8); - /* fall through */ + pg_fallthrough; case 1: a += k[0]; /* case 0: nothing left to add */ diff --git a/src/common/wchar.c b/src/common/wchar.c index eb15ee5949..e7b6595b04 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -2021,12 +2021,12 @@ pg_utf8_islegal(const unsigned char *source, int 
length) a = source[3]; if (a < 0x80 || a > 0xBF) return false; - /* FALL THRU */ + pg_fallthrough; case 3: a = source[2]; if (a < 0x80 || a > 0xBF) return false; - /* FALL THRU */ + pg_fallthrough; case 2: a = source[1]; switch (*source) @@ -2052,7 +2052,7 @@ pg_utf8_islegal(const unsigned char *source, int length) return false; break; } - /* FALL THRU */ + pg_fallthrough; case 1: a = *source; if (a >= 0x80 && a < 0xC2) diff --git a/src/include/c.h b/src/include/c.h index a249674f02..7ee4751992 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -132,6 +132,18 @@ #define pg_attribute_unused() #endif +/* + * pg_fallthrough indicates that the fall through from the previous case is + * intentional. + */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201703L) +#define pg_fallthrough [[fallthrough]] +#elif __has_attribute(fallthrough) +#define pg_fallthrough __attribute__((fallthrough)) +#else +#define pg_fallthrough +#endif + /* * pg_nodiscard means the compiler should warn if the result of a function * call is ignored. The name "nodiscard" is chosen in alignment with the C23 diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h index 6966daa2b0..06bdf6d586 100644 --- a/src/include/common/hashfn_unstable.h +++ b/src/include/common/hashfn_unstable.h @@ -158,23 +158,23 @@ fasthash_accum(fasthash_state *hs, const char *k, size_t len) break; case 7: hs->accum |= (uint64) k[6] << 8; - /* FALLTHROUGH */ + pg_fallthrough; case 6: hs->accum |= (uint64) k[5] << 16; - /* FALLTHROUGH */ + pg_fallthrough; case 5: hs->accum |= (uint64) k[4] << 24; - /* FALLTHROUGH */ + pg_fallthrough; case 4: memcpy(&lower_four, k, sizeof(lower_four)); hs->accum |= (uint64) lower_four << 32; break; case 3: hs->accum |= (uint64) k[2] << 40; - /* FALLTHROUGH */ + pg_fallthrough; case 2: hs->accum |= (uint64) k[1] << 48; - /* FALLTHROUGH */ + pg_fallthrough; case 1: hs->accum |= (uint64) k[0] << 56; break; @@ -189,23 +189,23 @@ fasthash_accum(fasthash_state *hs, const char *k, size_t len) break; case 7: hs->accum |= (uint64) k[6] << 48; - /* FALLTHROUGH */ + pg_fallthrough; case 6: hs->accum |= (uint64) k[5] << 40; - /* FALLTHROUGH */ + pg_fallthrough; case 5: hs->accum |= (uint64) k[4] << 32; - /* FALLTHROUGH */ + pg_fallthrough; case 4: memcpy(&lower_four, k, sizeof(lower_four)); hs->accum |= lower_four; break; case 3: hs->accum |= (uint64) k[2] << 16; - /* FALLTHROUGH */ + pg_fallthrough; case 2: hs->accum |= (uint64) k[1] << 8; - /* FALLTHROUGH */ + pg_fallthrough; case 1: hs->accum |= (uint64) k[0]; break; diff --git a/src/interfaces/ecpg/pgtypeslib/interval.c b/src/interfaces/ecpg/pgtypeslib/interval.c index 936a688381..e452a088f9 100644 --- a/src/interfaces/ecpg/pgtypeslib/interval.c +++ b/src/interfaces/ecpg/pgtypeslib/interval.c @@ -184,7 +184,7 @@ DecodeISO8601Interval(char *str, continue; } /* Else fall through to extended alternative format */ - /* FALLTHROUGH */ + pg_fallthrough; case '-': /* ISO 8601 4.4.3.3 Alternative Format, * Extended */ if (havefield) @@ -263,7 +263,7 @@ DecodeISO8601Interval(char *str, return 0; } /* Else fall through to extended alternative format */ - /* FALLTHROUGH */ + pg_fallthrough; case ':': /* ISO 8601 4.4.3.3 Alternative Format, * Extended */ if (havefield) @@ -391,7 +391,7 @@ DecodeInterval(char **field, int *ftype, int nf, /* int range, */ tmask = DTK_M(TZ); break; } - /* FALL THROUGH */ + pg_fallthrough; case DTK_DATE: case DTK_NUMBER: diff --git a/src/interfaces/libpq/fe-secure.c 
b/src/interfaces/libpq/fe-secure.c index 399fe7adf6..31d5b48d3f 100644 --- a/src/interfaces/libpq/fe-secure.c +++ b/src/interfaces/libpq/fe-secure.c @@ -379,7 +379,7 @@ pqsecure_raw_write(PGconn *conn, const void *ptr, size_t len) /* Set flag for EPIPE */ REMEMBER_EPIPE(spinfo, true); - /* FALL THRU */ + pg_fallthrough; case ECONNRESET: conn->write_failed = true; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 723048ab83..84552e32c8 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -3230,7 +3230,7 @@ exec_stmt_return(PLpgSQL_execstate *estate, PLpgSQL_stmt_return *stmt) /* fulfill promise if needed, then handle like regular var */ plpgsql_fulfill_promise(estate, (PLpgSQL_var *) retvar); - /* FALL THRU */ + pg_fallthrough; case PLPGSQL_DTYPE_VAR: { @@ -3362,7 +3362,7 @@ exec_stmt_return_next(PLpgSQL_execstate *estate, /* fulfill promise if needed, then handle like regular var */ plpgsql_fulfill_promise(estate, (PLpgSQL_var *) retvar); - /* FALL THRU */ + pg_fallthrough; case PLPGSQL_DTYPE_VAR: { @@ -5299,7 +5299,7 @@ exec_eval_datum(PLpgSQL_execstate *estate, /* fulfill promise if needed, then handle like regular var */ plpgsql_fulfill_promise(estate, (PLpgSQL_var *) datum); - /* FALL THRU */ + pg_fallthrough; case PLPGSQL_DTYPE_VAR: { diff --git a/src/pl/tcl/pltcl.c b/src/pl/tcl/pltcl.c index 187698ccdd..b7318f7261 100644 --- a/src/pl/tcl/pltcl.c +++ b/src/pl/tcl/pltcl.c @@ -2545,7 +2545,7 @@ pltcl_process_SPI_result(Tcl_Interp *interp, break; } /* fall through for utility returning tuples */ - /* FALLTHROUGH */ + pg_fallthrough; case SPI_OK_SELECT: case SPI_OK_INSERT_RETURNING: diff --git a/src/port/snprintf.c b/src/port/snprintf.c index 56c7036753..5deee44d3a 100644 --- a/src/port/snprintf.c +++ b/src/port/snprintf.c @@ -462,7 +462,7 @@ dopr(PrintfTarget *target, const char *format, va_list args) /* set zero padding if no nonzero digits yet */ if (accum == 0 && !pointflag) zpad = '0'; - /* FALL THRU */ + pg_fallthrough; case '1': case '2': case '3': diff --git a/src/test/modules/test_json_parser/test_json_parser_incremental.c b/src/test/modules/test_json_parser/test_json_parser_incremental.c index 6bc559f7bf..8fbd180c86 100644 --- a/src/test/modules/test_json_parser/test_json_parser_incremental.c +++ b/src/test/modules/test_json_parser/test_json_parser_incremental.c @@ -113,7 +113,7 @@ main(int argc, char **argv) { case 'r': /* chunk range */ run_chunk_ranges = true; - /* fall through */ + pg_fallthrough; case 'c': /* chunk size */ chunk_size = strtou64(optarg, NULL, 10); if (chunk_size > BUFSIZE) diff --git a/src/timezone/zic.c b/src/timezone/zic.c index 8dcc7b337a..2f36486a35 100644 --- a/src/timezone/zic.c +++ b/src/timezone/zic.c @@ -1395,19 +1395,19 @@ gethms(char const *string, char const *errstring) break; case 8: ok = '0' <= xr && xr <= '9'; - /* fallthrough */ + pg_fallthrough; case 7: ok &= ssx == '.'; if (ok && noise) warning(_("fractional seconds rejected by" " pre-2018 versions of zic")); - /* fallthrough */ + pg_fallthrough; case 5: ok &= mmx == ':'; - /* fallthrough */ + pg_fallthrough; case 3: ok &= hhx == ':'; - /* fallthrough */ + pg_fallthrough; case 1: break; } diff --git a/src/tools/pg_bsd_indent/indent.c b/src/tools/pg_bsd_indent/indent.c index 2622cc6227..6e550ff310 100644 --- a/src/tools/pg_bsd_indent/indent.c +++ b/src/tools/pg_bsd_indent/indent.c @@ -352,7 +352,7 @@ main(int argc, char **argv) } goto sw_buffer; } - /* FALLTHROUGH */ + pg_fallthrough; default: /* it is the start of a normal statement */ 
{ int remove_newlines; @@ -922,7 +922,7 @@ main(int argc, char **argv) case structure: if (ps.p_l_follow > 0) goto copy_id; - /* FALLTHROUGH */ + pg_fallthrough; case decl: /* we have a declaration type (int, etc.) */ parse(decl); /* let parser worry about indentation */ if (ps.last_token == rparen && ps.tos <= 1) { diff --git a/src/tools/pg_bsd_indent/parse.c b/src/tools/pg_bsd_indent/parse.c index e707da639c..94cea72439 100644 --- a/src/tools/pg_bsd_indent/parse.c +++ b/src/tools/pg_bsd_indent/parse.c @@ -96,7 +96,7 @@ parse(int tk) /* tk: the code for the construct scanned */ */ ps.i_l_follow = ps.il[ps.tos--]; /* the rest is the same as for dolit and forstmt */ - /* FALLTHROUGH */ + pg_fallthrough; case dolit: /* 'do' */ case forstmt: /* for (...) */ ps.p_stack[++ps.tos] = tk; @@ -303,7 +303,7 @@ reduce(void) case swstmt: /* */ case_ind = ps.cstk[ps.tos - 1]; - /* FALLTHROUGH */ + pg_fallthrough; case decl: /* finish of a declaration */ case elsehead: /* < else> */ From 5b93a5987bd704d2363295eee919eee45f84c286 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Thu, 19 Feb 2026 23:55:12 +0900 Subject: [PATCH 133/147] Log checkpoint request flags in checkpoint completion messages. Checkpoint completion log messages include more detail than checkpoint start messages, but previously omitted the checkpoint request flags, which were only logged at checkpoint start. As a result, users had to correlate completion messages with earlier start messages to see the full context. This commit includes the checkpoint request flags in the checkpoint completion log message as well. This duplicates some information, but makes the completion message self-contained and easier to interpret. Author: Soumya S Murali Reviewed-by: Michael Banck Reviewed-by: Yuan Li Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CAMtXxw9tPwV=NBv5S9GZXMSKPeKv5f9hRhSjZ8__oLsoS5jcuA@mail.gmail.com --- src/backend/access/transam/xlog.c | 60 ++++++++++++++++++------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 13ec6225b8..13cce9b49f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6768,6 +6768,28 @@ ShutdownXLOG(int code, Datum arg) } } +/* + * Format checkpoint request flags as a space-separated string for + * log messages. + */ +static const char * +CheckpointFlagsString(int flags) +{ + static char buf[128]; + + snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""); + + return buf; +} + /* * Log start of a checkpoint. */ @@ -6776,35 +6798,21 @@ LogCheckpointStart(int flags, bool restartpoint) { if (restartpoint) ereport(LOG, - /* translator: the placeholders show checkpoint options */ - (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", - (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", - (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_FAST) ? " fast" : "", - (flags & CHECKPOINT_FORCE) ? " force" : "", - (flags & CHECKPOINT_WAIT) ? " wait" : "", - (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", - (flags & CHECKPOINT_CAUSE_TIME) ? 
" time" : "", - (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); + /* translator: the placeholder shows checkpoint options */ + (errmsg("restartpoint starting:%s", + CheckpointFlagsString(flags)))); else ereport(LOG, - /* translator: the placeholders show checkpoint options */ - (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", - (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", - (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_FAST) ? " fast" : "", - (flags & CHECKPOINT_FORCE) ? " force" : "", - (flags & CHECKPOINT_WAIT) ? " wait" : "", - (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", - (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); + /* translator: the placeholder shows checkpoint options */ + (errmsg("checkpoint starting:%s", + CheckpointFlagsString(flags)))); } /* * Log end of a checkpoint. */ static void -LogCheckpointEnd(bool restartpoint) +LogCheckpointEnd(bool restartpoint, int flags) { long write_msecs, sync_msecs, @@ -6854,12 +6862,13 @@ LogCheckpointEnd(bool restartpoint) */ if (restartpoint) ereport(LOG, - (errmsg("restartpoint complete: wrote %d buffers (%.1f%%), " + (errmsg("restartpoint complete:%s: wrote %d buffers (%.1f%%), " "wrote %d SLRU buffers; %d WAL file(s) added, " "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", + CheckpointFlagsString(flags), CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -6878,12 +6887,13 @@ LogCheckpointEnd(bool restartpoint) LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)))); else ereport(LOG, - (errmsg("checkpoint complete: wrote %d buffers (%.1f%%), " + (errmsg("checkpoint complete:%s: wrote %d buffers (%.1f%%), " "wrote %d SLRU buffers; %d WAL file(s) added, " "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", + CheckpointFlagsString(flags), CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -7480,7 +7490,7 @@ CreateCheckPoint(int flags) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); /* Real work is done; log and update stats. */ - LogCheckpointEnd(false); + LogCheckpointEnd(false, flags); /* Reset the process title */ update_checkpoint_display(flags, false, true); @@ -7951,7 +7961,7 @@ CreateRestartPoint(int flags) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); /* Real work is done; log and update stats. */ - LogCheckpointEnd(true); + LogCheckpointEnd(true, flags); /* Reset the process title */ update_checkpoint_display(flags, true, true); From 8a6af3ad08799755e575275d0df5feb7b102ca35 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 20 Feb 2026 00:52:43 +0900 Subject: [PATCH 134/147] Make GUC wal_receiver_timeout user-settable. When multiple subscribers connect to different publisher servers, it can be useful to set different wal_receiver_timeout values for each connection to better detect failures. However, previously this wasn't possible, which limited flexibility in managing subscriptions. 
This commit changes wal_receiver_timeout to be user-settable, allowing different values to be assigned using ALTER ROLE SET for each subscription owner. This effectively enables per-subscription configuration. Author: Fujii Masao Reviewed-by: Japin Li Reviewed-by: Chao Li Discussion: https://postgr.es/m/a1414b64-bf58-43a6-8494-9704975a41e9@oss.nttdata.com --- doc/src/sgml/config.sgml | 3 --- src/backend/utils/misc/guc_parameters.dat | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index faf0bdb62a..20dbcaeb3e 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -5249,9 +5249,6 @@ ANY num_sync ( + subwalrcvtimeout text + + + The wal_receiver_timeout + setting for the subscription's workers to use + + + subpublications text[] diff --git a/doc/src/sgml/ref/alter_subscription.sgml b/doc/src/sgml/ref/alter_subscription.sgml index 27c06439f4..5318998e80 100644 --- a/doc/src/sgml/ref/alter_subscription.sgml +++ b/doc/src/sgml/ref/alter_subscription.sgml @@ -280,8 +280,9 @@ ALTER SUBSCRIPTION name RENAME TO < origin, failover, two_phase, - retain_dead_tuples, and - max_retention_duration. + retain_dead_tuples, + max_retention_duration, and + wal_receiver_timeout. Only a superuser can set password_required = false. diff --git a/doc/src/sgml/ref/create_subscription.sgml b/doc/src/sgml/ref/create_subscription.sgml index b7dd361294..eb0cc645d8 100644 --- a/doc/src/sgml/ref/create_subscription.sgml +++ b/doc/src/sgml/ref/create_subscription.sgml @@ -574,8 +574,21 @@ CREATE SUBSCRIPTION subscription_name - + + wal_receiver_timeout (text) + + + The value of this parameter overrides the + setting within this + subscription's apply worker processes. The default value is + -1, which means it does not override the global setting, + i.e., the value from the server configuration, command line, role or + database settings will be used instead. 
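+ For example, wal_receiver_timeout = '80s' makes this
+ subscription's apply worker use an 80-second timeout regardless of
+ the server-wide setting.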
+ + + + diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 2b10324529..acf42b853e 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -129,6 +129,12 @@ GetSubscription(Oid subid, bool missing_ok) Anum_pg_subscription_subsynccommit); sub->synccommit = TextDatumGetCString(datum); + /* Get walrcvtimeout */ + datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, + tup, + Anum_pg_subscription_subwalrcvtimeout); + sub->walrcvtimeout = TextDatumGetCString(datum); + /* Get publications */ datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, tup, diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 0b3c8499b4..5e3c0964d3 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -73,8 +73,9 @@ #define SUBOPT_FAILOVER 0x00002000 #define SUBOPT_RETAIN_DEAD_TUPLES 0x00004000 #define SUBOPT_MAX_RETENTION_DURATION 0x00008000 -#define SUBOPT_LSN 0x00010000 -#define SUBOPT_ORIGIN 0x00020000 +#define SUBOPT_WAL_RECEIVER_TIMEOUT 0x00010000 +#define SUBOPT_LSN 0x00020000 +#define SUBOPT_ORIGIN 0x00040000 /* check if the 'val' has 'bits' set */ #define IsSet(val, bits) (((val) & (bits)) == (bits)) @@ -104,6 +105,7 @@ typedef struct SubOpts int32 maxretention; char *origin; XLogRecPtr lsn; + char *wal_receiver_timeout; } SubOpts; /* @@ -402,6 +404,30 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->specified_opts |= SUBOPT_LSN; opts->lsn = lsn; } + else if (IsSet(supported_opts, SUBOPT_WAL_RECEIVER_TIMEOUT) && + strcmp(defel->defname, "wal_receiver_timeout") == 0) + { + bool parsed; + int val; + + if (IsSet(opts->specified_opts, SUBOPT_WAL_RECEIVER_TIMEOUT)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_WAL_RECEIVER_TIMEOUT; + opts->wal_receiver_timeout = defGetString(defel); + + /* + * Test if the given value is valid for wal_receiver_timeout GUC. + * Skip this test if the value is -1, since -1 is allowed for the + * wal_receiver_timeout subscription option, but not for the GUC + * itself. + */ + parsed = parse_int(opts->wal_receiver_timeout, &val, 0, NULL); + if (!parsed || val != -1) + (void) set_config_option("wal_receiver_timeout", opts->wal_receiver_timeout, + PGC_BACKEND, PGC_S_TEST, GUC_ACTION_SET, + false, 0, false); + } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -612,7 +638,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_RETAIN_DEAD_TUPLES | - SUBOPT_MAX_RETENTION_DURATION | SUBOPT_ORIGIN); + SUBOPT_MAX_RETENTION_DURATION | + SUBOPT_WAL_RECEIVER_TIMEOUT | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); /* @@ -695,6 +722,14 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, if (opts.synchronous_commit == NULL) opts.synchronous_commit = "off"; + /* + * The default for wal_receiver_timeout of subscriptions is -1, which + * means the value is inherited from the server configuration, command + * line, or role/database settings. 
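+ * For example, a subscription created WITH (wal_receiver_timeout = '80s')
+ * pins its workers to an 80-second timeout, while the default of -1
+ * leaves the normal GUC hierarchy in effect.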
+ */ + if (opts.wal_receiver_timeout == NULL) + opts.wal_receiver_timeout = "-1"; + conninfo = stmt->conninfo; publications = stmt->publication; @@ -742,6 +777,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, nulls[Anum_pg_subscription_subslotname - 1] = true; values[Anum_pg_subscription_subsynccommit - 1] = CStringGetTextDatum(opts.synchronous_commit); + values[Anum_pg_subscription_subwalrcvtimeout - 1] = + CStringGetTextDatum(opts.wal_receiver_timeout); values[Anum_pg_subscription_subpublications - 1] = publicationListToArray(publications); values[Anum_pg_subscription_suborigin - 1] = @@ -1410,6 +1447,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_MAX_RETENTION_DURATION | + SUBOPT_WAL_RECEIVER_TIMEOUT | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, @@ -1665,6 +1703,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, origin = opts.origin; } + if (IsSet(opts.specified_opts, SUBOPT_WAL_RECEIVER_TIMEOUT)) + { + values[Anum_pg_subscription_subwalrcvtimeout - 1] = + CStringGetTextDatum(opts.wal_receiver_timeout); + replaces[Anum_pg_subscription_subwalrcvtimeout - 1] = true; + } + update_tuple = true; break; } diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 29933f5301..adbdec49a0 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -627,6 +627,8 @@ static inline void reset_apply_error_context_info(void); static TransApplyAction get_transaction_apply_action(TransactionId xid, ParallelApplyWorkerInfo **winfo); +static void set_wal_receiver_timeout(void); + static void on_exit_clear_xact_state(int code, Datum arg); /* @@ -5154,12 +5156,48 @@ maybe_reread_subscription(void) SetConfigOption("synchronous_commit", MySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); + /* Change wal_receiver_timeout according to the user's wishes */ + set_wal_receiver_timeout(); + if (started_tx) CommitTransactionCommand(); MySubscriptionValid = true; } +/* + * Change wal_receiver_timeout to MySubscription->walrcvtimeout. + */ +static void +set_wal_receiver_timeout(void) +{ + bool parsed; + int val; + int prev_timeout = wal_receiver_timeout; + + /* + * Set the wal_receiver_timeout GUC to MySubscription->walrcvtimeout, + * which comes from the subscription's wal_receiver_timeout option. If the + * value is -1, reset the GUC to its default, meaning it will inherit from + * the server config, command line, or role/database settings. + */ + parsed = parse_int(MySubscription->walrcvtimeout, &val, 0, NULL); + if (parsed && val == -1) + SetConfigOption("wal_receiver_timeout", NULL, + PGC_BACKEND, PGC_S_SESSION); + else + SetConfigOption("wal_receiver_timeout", MySubscription->walrcvtimeout, + PGC_BACKEND, PGC_S_SESSION); + + /* + * Log the wal_receiver_timeout setting (in milliseconds) as a debug + * message when it changes, to verify it was set correctly. + */ + if (prev_timeout != wal_receiver_timeout) + elog(DEBUG1, "logical replication worker for subscription \"%s\" wal_receiver_timeout: %d ms", + MySubscription->name, wal_receiver_timeout); +} + /* * Callback from subscription syscache invalidation. 
*/ @@ -5822,6 +5860,9 @@ InitializeLogRepWorker(void) SetConfigOption("synchronous_commit", MySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); + /* Change wal_receiver_timeout according to the user's wishes */ + set_wal_receiver_timeout(); + /* * Keep us informed about subscription or role changes. Note that the * role's superuser privilege can be revoked. diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 4959830433..450cec285b 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -5112,6 +5112,7 @@ getSubscriptions(Archive *fout) int i_subconninfo; int i_subslotname; int i_subsynccommit; + int i_subwalrcvtimeout; int i_subpublications; int i_suborigin; int i_suboriginremotelsn; @@ -5205,10 +5206,17 @@ getSubscriptions(Archive *fout) if (fout->remoteVersion >= 190000) appendPQExpBufferStr(query, - " s.submaxretention\n"); + " s.submaxretention,\n"); else appendPQExpBuffer(query, - " 0 AS submaxretention\n"); + " 0 AS submaxretention,\n"); + + if (fout->remoteVersion >= 190000) + appendPQExpBufferStr(query, + " s.subwalrcvtimeout\n"); + else + appendPQExpBufferStr(query, + " '-1' AS subwalrcvtimeout\n"); appendPQExpBufferStr(query, "FROM pg_subscription s\n"); @@ -5247,6 +5255,7 @@ getSubscriptions(Archive *fout) i_subconninfo = PQfnumber(res, "subconninfo"); i_subslotname = PQfnumber(res, "subslotname"); i_subsynccommit = PQfnumber(res, "subsynccommit"); + i_subwalrcvtimeout = PQfnumber(res, "subwalrcvtimeout"); i_subpublications = PQfnumber(res, "subpublications"); i_suborigin = PQfnumber(res, "suborigin"); i_suboriginremotelsn = PQfnumber(res, "suboriginremotelsn"); @@ -5290,6 +5299,8 @@ getSubscriptions(Archive *fout) pg_strdup(PQgetvalue(res, i, i_subslotname)); subinfo[i].subsynccommit = pg_strdup(PQgetvalue(res, i, i_subsynccommit)); + subinfo[i].subwalrcvtimeout = + pg_strdup(PQgetvalue(res, i, i_subwalrcvtimeout)); subinfo[i].subpublications = pg_strdup(PQgetvalue(res, i, i_subpublications)); subinfo[i].suborigin = pg_strdup(PQgetvalue(res, i, i_suborigin)); @@ -5548,6 +5559,9 @@ dumpSubscription(Archive *fout, const SubscriptionInfo *subinfo) if (strcmp(subinfo->subsynccommit, "off") != 0) appendPQExpBuffer(query, ", synchronous_commit = %s", fmtId(subinfo->subsynccommit)); + if (strcmp(subinfo->subwalrcvtimeout, "-1") != 0) + appendPQExpBuffer(query, ", wal_receiver_timeout = %s", fmtId(subinfo->subwalrcvtimeout)); + if (pg_strcasecmp(subinfo->suborigin, LOGICALREP_ORIGIN_ANY) != 0) appendPQExpBuffer(query, ", origin = %s", subinfo->suborigin); diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 4c4b14e5fc..6deceef23f 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -722,6 +722,7 @@ typedef struct _SubscriptionInfo char *subconninfo; char *subslotname; char *subsynccommit; + char *subwalrcvtimeout; char *subpublications; char *suborigin; char *suboriginremotelsn; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 3584c4e142..571a6a003d 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -6806,7 +6806,7 @@ describeSubscriptions(const char *pattern, bool verbose) printQueryOpt myopt = pset.popt; static const bool translate_columns[] = {false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false, false, false, false}; + false, false, false, false, false}; if (pset.sversion < 100000) { @@ -6895,6 +6895,11 @@ describeSubscriptions(const char *pattern, bool verbose) gettext_noop("Synchronous commit"), 
gettext_noop("Conninfo")); + if (pset.sversion >= 190000) + appendPQExpBuffer(&buf, + ", subwalrcvtimeout AS \"%s\"\n", + gettext_noop("Receiver timeout")); + /* Skip LSN is only supported in v15 and higher */ if (pset.sversion >= 150000) appendPQExpBuffer(&buf, diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 7be8afc10e..7670eb226f 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202602181 +#define CATALOG_VERSION_NO 202602201 #endif diff --git a/src/include/catalog/pg_subscription.h b/src/include/catalog/pg_subscription.h index f3571d2bfc..805493d85c 100644 --- a/src/include/catalog/pg_subscription.h +++ b/src/include/catalog/pg_subscription.h @@ -100,6 +100,9 @@ CATALOG(pg_subscription,6100,SubscriptionRelationId) BKI_SHARED_RELATION BKI_ROW /* Synchronous commit setting for worker */ text subsynccommit BKI_FORCE_NOT_NULL; + /* wal_receiver_timeout setting for worker */ + text subwalrcvtimeout BKI_FORCE_NOT_NULL; + /* List of publications subscribed to */ text subpublications[1] BKI_FORCE_NOT_NULL; @@ -155,6 +158,7 @@ typedef struct Subscription char *conninfo; /* Connection string to the publisher */ char *slotname; /* Name of the replication slot */ char *synccommit; /* Synchronous commit setting for worker */ + char *walrcvtimeout; /* wal_receiver_timeout setting for worker */ List *publications; /* List of publication names to subscribe to */ char *origin; /* Only publish data originating from the * specified origin */ diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index b3eccd8afe..3a0637772c 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -116,18 +116,18 @@ CREATE SUBSCRIPTION regress_testsub4 CONNECTION 'dbname=regress_doesnotexist' PU WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. \dRs+ regress_testsub4 - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN -------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | none | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | none | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub4 SET (origin = any); \dRs+ regress_testsub4 - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN -------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub3; @@ -145,10 +145,10 @@ ALTER SUBSCRIPTION regress_testsub CONNECTION 'foobar'; ERROR: invalid connection string syntax: missing "=" after "foobar" in connection info string \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET PUBLICATION testpub2, testpub3 WITH (refresh = false); @@ -157,10 +157,10 @@ ALTER SUBSCRIPTION regress_testsub SET (slot_name = 'newname'); ALTER SUBSCRIPTION regress_testsub SET (password_required = false); ALTER SUBSCRIPTION regress_testsub SET (run_as_owner = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | f | t | f | f | 0 | f | off | dbname=regress_doesnotexist2 | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | f | t | f | f | 0 | f | off | dbname=regress_doesnotexist2 | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (password_required = true); @@ -176,10 +176,10 @@ ERROR: unrecognized subscription parameter: "create_slot" -- ok ALTER SUBSCRIPTION regress_testsub SKIP (lsn = '0/12345'); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist2 | 0/00012345 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist2 | -1 | 0/00012345 (1 row) -- ok - with lsn = NONE @@ -188,10 +188,10 @@ ALTER SUBSCRIPTION regress_testsub SKIP (lsn = NONE); ALTER SUBSCRIPTION regress_testsub SKIP (lsn = '0/0'); ERROR: invalid WAL location (LSN): 0/0 \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist2 | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist2 | -1 | 0/00000000 (1 row) BEGIN; @@ -222,11 +222,15 @@ ALTER SUBSCRIPTION regress_testsub_foo SET (synchronous_commit = local); ALTER SUBSCRIPTION regress_testsub_foo SET (synchronous_commit = foobar); ERROR: invalid value for parameter "synchronous_commit": "foobar" HINT: Available values: local, remote_write, remote_apply, on, off. +ALTER SUBSCRIPTION regress_testsub_foo SET (wal_receiver_timeout = '-1'); +ALTER SUBSCRIPTION regress_testsub_foo SET (wal_receiver_timeout = '80s'); +ALTER SUBSCRIPTION regress_testsub_foo SET (wal_receiver_timeout = 'foobar'); +ERROR: invalid value for parameter "wal_receiver_timeout": "foobar" \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ----------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------ - regress_testsub_foo | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | 0 | f | local | dbname=regress_doesnotexist2 | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +---------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+------------------------------+------------------+------------ + regress_testsub_foo | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | 0 | f | local | dbname=regress_doesnotexist2 | 80s | 0/00000000 (1 row) -- rename back to keep the rest simple @@ -255,19 +259,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | t | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | t | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (binary = false); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub; @@ -279,27 +283,27 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. 
\dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | on | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (streaming = parallel); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (streaming = false); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) -- fail - publication already exists @@ -314,10 +318,10 @@ ALTER SUBSCRIPTION regress_testsub ADD PUBLICATION testpub1, testpub2 WITH (refr ALTER SUBSCRIPTION regress_testsub ADD PUBLICATION testpub1, testpub2 WITH (refresh = false); ERROR: publication "testpub1" is already in subscription "regress_testsub" \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-----------------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub,testpub1,testpub2} | f | off | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-----------------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub,testpub1,testpub2} | f | off | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) -- fail - publication used more than once @@ -332,10 +336,10 @@ ERROR: publication "testpub3" is not in subscription "regress_testsub" -- ok - delete publications ALTER SUBSCRIPTION regress_testsub DROP PUBLICATION testpub1, testpub2 WITH (refresh = false); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub; @@ -371,19 +375,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | p | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | p | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) -- we can alter streaming when two_phase enabled ALTER SUBSCRIPTION regress_testsub SET (streaming = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -393,10 +397,10 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. 
\dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -409,18 +413,18 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (disable_on_error = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | t | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | t | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -433,10 +437,10 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -450,19 +454,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB NOTICE: max_retention_duration is ineffective when retain_dead_tuples is disabled WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and alter the subscription to refresh publications. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 1000 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) -- ok ALTER SUBSCRIPTION regress_testsub SET (max_retention_duration = 0); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Max retention duration | Retention active | Synchronous commit | Conninfo | Receiver timeout | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------+------------------+--------------------+-----------------------------+------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | 0 | f | off | dbname=regress_doesnotexist | -1 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql index ef0c298d2d..d93cbc279d 100644 --- a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -139,6 +139,9 @@ RESET ROLE; ALTER SUBSCRIPTION regress_testsub RENAME TO regress_testsub_foo; ALTER SUBSCRIPTION regress_testsub_foo SET (synchronous_commit = local); ALTER SUBSCRIPTION regress_testsub_foo SET (synchronous_commit = foobar); +ALTER SUBSCRIPTION regress_testsub_foo SET (wal_receiver_timeout = '-1'); +ALTER SUBSCRIPTION regress_testsub_foo SET (wal_receiver_timeout = '80s'); +ALTER SUBSCRIPTION regress_testsub_foo SET (wal_receiver_timeout = 'foobar'); \dRs+ From 2f248ad573ce30e6847318ce4ea5eec3f747cbda Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 19 Feb 2026 11:08:52 -0500 Subject: [PATCH 136/147] Remove no-longer-useful markers in pg_hba.conf.sample. The source version of pg_hba.conf.sample contains @remove-line-for-nolocal@ markers that indicate which lines should be deleted for an installation that doesn't HAVE_UNIX_SOCKETS. 
We no longer support that case, and since commit f55808828 all that initdb is doing is unconditionally removing the markers. We might as well remove the markers from the source version and drop the removal code, which is unintelligible now anyway. This will not of course save any noticeable number of cycles in initdb, but it might save some confusion for future developers looking at pg_hba.conf.sample. It also reduces the number of distinct cases that replace_token() has to support, possibly allowing some tightening of that function. Discussion: https://postgr.es/m/2287786.1771458157@sss.pgh.pa.us --- src/backend/libpq/pg_hba.conf.sample | 6 +++--- src/bin/initdb/initdb.c | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/backend/libpq/pg_hba.conf.sample b/src/backend/libpq/pg_hba.conf.sample index b64c8dea97..475100f886 100644 --- a/src/backend/libpq/pg_hba.conf.sample +++ b/src/backend/libpq/pg_hba.conf.sample @@ -109,14 +109,14 @@ # TYPE DATABASE USER ADDRESS METHOD -@remove-line-for-nolocal@# "local" is for Unix domain socket connections only -@remove-line-for-nolocal@local all all @authmethodlocal@ +# "local" is for Unix domain socket connections only +local all all @authmethodlocal@ # IPv4 local connections: host all all 127.0.0.1/32 @authmethodhost@ # IPv6 local connections: host all all ::1/128 @authmethodhost@ # Allow replication connections from localhost, by a user with the # replication privilege. -@remove-line-for-nolocal@local replication all @authmethodlocal@ +local replication all @authmethodlocal@ host replication all 127.0.0.1/32 @authmethodhost@ host replication all ::1/128 @authmethodhost@ diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index a3980e5535..7c49dd433a 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -1463,9 +1463,6 @@ setup_config(void) conflines = readfile(hba_file); - conflines = replace_token(conflines, "@remove-line-for-nolocal@", ""); - - /* * Probe to see if there is really any platform support for IPv6, and * comment out the relevant pg_hba line if not. This avoids runtime From fc3896c786a2df86904e138d819c16f263bf7673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Thu, 19 Feb 2026 17:11:04 +0100 Subject: [PATCH 137/147] Add translator comment Otherwise the message is not very clear. Backpatch-through: 18 --- src/backend/replication/syncrep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index 7ea6001e9a..d1582a5d71 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -1077,6 +1077,7 @@ check_synchronous_standby_names(char **newval, void **extra, GucSource source) if (syncrep_parse_error_msg) GUC_check_errdetail("%s", syncrep_parse_error_msg); else + /* translator: %s is a GUC name */ GUC_check_errdetail("\"%s\" parser failed.", "synchronous_standby_names"); return false; From 45908934152af1bf26c10312a9ba68d8da53a56e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Sat, 7 Feb 2026 08:14:43 -0500 Subject: [PATCH 138/147] Allow extensions to mark an individual index as disabled. Up until now, the only way for a loadable module to disable the use of a particular index was to use get_relation_info_hook to remove it from the index list. While that works, it has some disadvantages. 
First, the index becomes invisible for all purposes, and can no longer be used for optimizations such as self-join elimination or left join removal, which can severely degrade the resulting plan. Second, if the module attempts to compel the use of a certain index by removing all other indexes from the index list and disabling other scan types, but the planner is unable to use the chosen index for some reason, it will fall back to a sequential scan, because that is only disabled, whereas the other indexes are, from the planner's point of view, completely gone. While this situation ideally shouldn't occur, it's hard for a loadable module to be completely sure whether the planner will view a certain index as usable for a certain query. If it isn't, it's more desirable to fall back to the next-cheapest plan than to be forced into a sequential scan. --- src/backend/optimizer/util/pathnode.c | 8 ++++++++ src/include/nodes/pathnodes.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9678c20ff1..829295b3af 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1077,6 +1077,14 @@ create_index_path(PlannerInfo *root, cost_index(pathnode, root, loop_count, partial_path); + /* + * cost_index will set disabled_nodes to 1 if this rel is not allowed to + * use index scans in general, but it doesn't have the IndexOptInfo to + * know whether this specific index has been disabled. + */ + if (index->disabled) + pathnode->path.disabled_nodes = 1; + return pathnode; } diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index c175ee95b6..27758ec16f 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1412,6 +1412,8 @@ typedef struct IndexOptInfo bool nullsnotdistinct; /* is uniqueness enforced immediately? */ bool immediate; + /* true if paths using this index should be marked disabled */ + bool disabled; /* true if index doesn't really exist */ bool hypothetical; From b97994bf4041173160cb4fb75fbbf0217fcfef71 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 10 Feb 2026 08:13:46 -0500 Subject: [PATCH 139/147] Replace get_relation_info_hook with build_simple_rel_hook. For a long time, PostgreSQL has had a get_relation_info_hook which plugins can use to editorialize on the information that get_relation_info obtains from the catalogs. However, this hook is only called for baserels of type RTE_RELATION, and there is potential utility in a similar callback for other types of RTEs. This might have had utility even before commit 4020b370f214315b8c10430301898ac21658143f added pgs_mask to RelOptInfo, but it certainly has utility now. So, move the callback up one level, deleting get_relation_info_hook and adding build_simple_rel_hook instead. The new callback is called just slightly later than before and with slightly different arguments, but it should be fairly straightforward to adjust existing code that currently uses get_relation_info_hook: the values previously available as relationObjectId and inhparent are now available via rte->relid and rte->inh, and calls where rte->rtekind != RTE_RELATION can be ignored if desired.
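To illustrate the conversion (a sketch only, not code from this commit: the module name, the hook-chaining boilerplate, and the blanket policy of disabling every index are invented for the example, the latter relying on the IndexOptInfo.disabled flag added by the previous commit):

    #include "postgres.h"

    #include "fmgr.h"
    #include "nodes/parsenodes.h"
    #include "nodes/pathnodes.h"
    #include "optimizer/pathnode.h"

    PG_MODULE_MAGIC;

    static build_simple_rel_hook_type prev_build_simple_rel_hook = NULL;

    static void
    demo_build_simple_rel_hook(PlannerInfo *root, RelOptInfo *rel,
                               RangeTblEntry *rte)
    {
        /* Chain to any hook installed before us. */
        if (prev_build_simple_rel_hook)
            (*prev_build_simple_rel_hook) (root, rel, rte);

        /* Formerly implicit: only plain relations are of interest here. */
        if (rte->rtekind != RTE_RELATION)
            return;

        /*
         * Invented policy, purely for illustration: mark every index on the
         * relation as disabled.  The indexes stay in rel->indexlist, so
         * optimizations such as join removal can still see them, but paths
         * built on them will be marked disabled.
         */
        foreach_node(IndexOptInfo, index, rel->indexlist)
            index->disabled = true;
    }

    void
    _PG_init(void)
    {
        prev_build_simple_rel_hook = build_simple_rel_hook;
        build_simple_rel_hook = demo_build_simple_rel_hook;
    }

A real module would presumably apply some selection criterion before setting the flag.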
--- src/backend/optimizer/util/plancat.c | 14 -------------- src/backend/optimizer/util/relnode.c | 15 +++++++++++++++ src/include/optimizer/pathnode.h | 6 ++++++ src/include/optimizer/plancat.h | 8 -------- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d63e7390be..b2fbd6a082 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -57,9 +57,6 @@ /* GUC parameter */ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; -/* Hook for plugins to get control in get_relation_info() */ -get_relation_info_hook_type get_relation_info_hook = NULL; - typedef struct NotnullHashEntry { Oid relid; /* OID of the relation */ @@ -571,17 +568,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, set_relation_partition_info(root, rel, relation); table_close(relation, NoLock); - - /* - * Allow a plugin to editorialize on the info we obtained from the - * catalogs. Actions might include altering the assumed relation size, - * removing an index, or adding a hypothetical index to the indexlist. - * - * An extension can also modify rel->pgs_mask here to control path - * generation. - */ - if (get_relation_info_hook) - (*get_relation_info_hook) (root, relationObjectId, inhparent, rel); } /* diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index a714c83f1b..4a89eda014 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -47,6 +47,9 @@ typedef struct JoinHashEntry RelOptInfo *join_rel; } JoinHashEntry; +/* Hook for plugins to get control in build_simple_rel() */ +build_simple_rel_hook_type build_simple_rel_hook = NULL; + /* Hook for plugins to get control during joinrel setup */ joinrel_setup_hook_type joinrel_setup_hook = NULL; @@ -394,6 +397,18 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) break; } + /* + * Allow a plugin to editorialize on the new RelOptInfo. This could + * involve editorializing on the information which get_relation_info + * obtained from the catalogs, such as altering the assumed relation size, + * removing an index, or adding a hypothetical index to the indexlist. + * + * An extension can also modify rel->pgs_mask here to control path + * generation. + */ + if (build_simple_rel_hook) + (*build_simple_rel_hook) (root, rel, rte); + /* * We must apply the partially filled in RelOptInfo before calling * apply_child_basequals due to some transformations within that function diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index cf8a654fa5..8297870cf7 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -17,6 +17,12 @@ #include "nodes/bitmapset.h" #include "nodes/pathnodes.h" +/* Hook for plugins to get control in build_simple_rel() */ +typedef void (*build_simple_rel_hook_type) (PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +extern PGDLLIMPORT build_simple_rel_hook_type build_simple_rel_hook; + /* * Everything in subpaths or partial_subpaths will become part of the * Append node's subpaths list.
Partial and non-partial subpaths can be diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 8d7cc6d988..09baf1a691 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -17,14 +17,6 @@ #include "nodes/pathnodes.h" #include "utils/relcache.h" -/* Hook for plugins to get control in get_relation_info() */ -typedef void (*get_relation_info_hook_type) (PlannerInfo *root, - Oid relationObjectId, - bool inhparent, - RelOptInfo *rel); -extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; - - extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); From a33d6c5727a3cb28d9226a5af400a8c80eae8b39 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 26 Jan 2026 09:56:36 -0500 Subject: [PATCH 140/147] Add pg_plan_advice contrib module. Provide a facility that (1) can be used to stabilize certain plan choices so that the planner cannot reverse course without authorization and (2) can be used by knowledgeable users to insist on plan choices contrary to what the planner believes best. In both cases, terrible outcomes are possible: users should think twice and perhaps three times before constraining the planner's ability to do as it thinks best; nevertheless, there are problems that are much more easily solved with these facilities than without them. This patch takes the approach of analyzing a finished plan to produce textual output, which we call "plan advice", that describes key decisions made during planning; if that plan advice is provided during future planning cycles, it will force those key decisions to be made in the same way. Not all planner decisions can be controlled using advice; for example, decisions about how to perform aggregation are currently out of scope, as is choice of sort order. Plan advice can also be edited by the user, or even written from scratch in simple cases, making it possible to generate outcomes that the planner would not have produced. Partial advice can be provided to control some planner outcomes but not others. Currently, plan advice is focused only on specific outcomes, such as the choice to use a sequential scan for a particular relation, and not on estimates that might contribute to those outcomes, such as a possibly-incorrect selectivity estimate. While it would be useful to users to be able to provide plan advice that affects selectivity estimates or other aspects of costing, that is out of scope for this commit.
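To give a flavor of the mini-language (the alias and index names below are invented; see the README and regression tests for the precise syntax), advice captured for a three-way join might look like

    JOIN_ORDER(a b c) SEQ_SCAN(a) INDEX_SCAN(b b_pkey) HASH_JOIN(b)
    NESTED_LOOP_PLAIN(c)

which pins down the join order, the scans chosen for a and b, and the join strategy used at each of the two join levels.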
Reviewed-by: Lukas Fittl Reviewed-by: Jakub Wartak Reviewed-by: Greg Burd Reviewed-by: Jacob Champion Reviewed-by: Haibo Yan Reviewed-by: Dian Fay Reviewed-by: Ajay Pal Reviewed-by: John Naylor Discussion: http://postgr.es/m/CA+TgmoZ-Jh1T6QyWoCODMVQdhTUPYkaZjWztzP1En4=ZHoKPzw@mail.gmail.com --- contrib/Makefile | 1 + contrib/meson.build | 1 + contrib/pg_plan_advice/.gitignore | 3 + contrib/pg_plan_advice/Makefile | 50 + contrib/pg_plan_advice/README | 260 ++ contrib/pg_plan_advice/expected/gather.out | 371 +++ .../pg_plan_advice/expected/join_order.out | 509 ++++ .../pg_plan_advice/expected/join_strategy.out | 339 +++ .../expected/local_collector.out | 69 + .../pg_plan_advice/expected/partitionwise.out | 426 ++++ contrib/pg_plan_advice/expected/prepared.out | 67 + contrib/pg_plan_advice/expected/scan.out | 757 ++++++ contrib/pg_plan_advice/expected/semijoin.out | 377 +++ contrib/pg_plan_advice/expected/syntax.out | 192 ++ contrib/pg_plan_advice/meson.build | 79 + .../pg_plan_advice/pg_plan_advice--1.0.sql | 43 + contrib/pg_plan_advice/pg_plan_advice.c | 563 +++++ contrib/pg_plan_advice/pg_plan_advice.control | 5 + contrib/pg_plan_advice/pg_plan_advice.h | 61 + contrib/pg_plan_advice/pgpa_ast.c | 351 +++ contrib/pg_plan_advice/pgpa_ast.h | 185 ++ contrib/pg_plan_advice/pgpa_collector.c | 639 +++++ contrib/pg_plan_advice/pgpa_collector.h | 18 + contrib/pg_plan_advice/pgpa_identifier.c | 476 ++++ contrib/pg_plan_advice/pgpa_identifier.h | 52 + contrib/pg_plan_advice/pgpa_join.c | 629 +++++ contrib/pg_plan_advice/pgpa_join.h | 105 + contrib/pg_plan_advice/pgpa_output.c | 571 +++++ contrib/pg_plan_advice/pgpa_output.h | 22 + contrib/pg_plan_advice/pgpa_parser.y | 301 +++ contrib/pg_plan_advice/pgpa_planner.c | 2166 +++++++++++++++++ contrib/pg_plan_advice/pgpa_planner.h | 17 + contrib/pg_plan_advice/pgpa_scan.c | 269 ++ contrib/pg_plan_advice/pgpa_scan.h | 85 + contrib/pg_plan_advice/pgpa_scanner.l | 297 +++ contrib/pg_plan_advice/pgpa_trove.c | 516 ++++ contrib/pg_plan_advice/pgpa_trove.h | 114 + contrib/pg_plan_advice/pgpa_walker.c | 1029 ++++++++ contrib/pg_plan_advice/pgpa_walker.h | 141 ++ contrib/pg_plan_advice/sql/gather.sql | 86 + contrib/pg_plan_advice/sql/join_order.sql | 145 ++ contrib/pg_plan_advice/sql/join_strategy.sql | 84 + .../pg_plan_advice/sql/local_collector.sql | 46 + contrib/pg_plan_advice/sql/partitionwise.sql | 99 + contrib/pg_plan_advice/sql/prepared.sql | 37 + contrib/pg_plan_advice/sql/scan.sql | 195 ++ contrib/pg_plan_advice/sql/semijoin.sql | 118 + contrib/pg_plan_advice/sql/syntax.sql | 68 + contrib/pg_plan_advice/t/001_regress.pl | 148 ++ doc/src/sgml/contrib.sgml | 1 + doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/pgplanadvice.sgml | 1036 ++++++++ src/tools/pgindent/typedefs.list | 39 + 53 files changed, 14259 insertions(+) create mode 100644 contrib/pg_plan_advice/.gitignore create mode 100644 contrib/pg_plan_advice/Makefile create mode 100644 contrib/pg_plan_advice/README create mode 100644 contrib/pg_plan_advice/expected/gather.out create mode 100644 contrib/pg_plan_advice/expected/join_order.out create mode 100644 contrib/pg_plan_advice/expected/join_strategy.out create mode 100644 contrib/pg_plan_advice/expected/local_collector.out create mode 100644 contrib/pg_plan_advice/expected/partitionwise.out create mode 100644 contrib/pg_plan_advice/expected/prepared.out create mode 100644 contrib/pg_plan_advice/expected/scan.out create mode 100644 contrib/pg_plan_advice/expected/semijoin.out create mode 100644 contrib/pg_plan_advice/expected/syntax.out create mode 
100644 contrib/pg_plan_advice/meson.build create mode 100644 contrib/pg_plan_advice/pg_plan_advice--1.0.sql create mode 100644 contrib/pg_plan_advice/pg_plan_advice.c create mode 100644 contrib/pg_plan_advice/pg_plan_advice.control create mode 100644 contrib/pg_plan_advice/pg_plan_advice.h create mode 100644 contrib/pg_plan_advice/pgpa_ast.c create mode 100644 contrib/pg_plan_advice/pgpa_ast.h create mode 100644 contrib/pg_plan_advice/pgpa_collector.c create mode 100644 contrib/pg_plan_advice/pgpa_collector.h create mode 100644 contrib/pg_plan_advice/pgpa_identifier.c create mode 100644 contrib/pg_plan_advice/pgpa_identifier.h create mode 100644 contrib/pg_plan_advice/pgpa_join.c create mode 100644 contrib/pg_plan_advice/pgpa_join.h create mode 100644 contrib/pg_plan_advice/pgpa_output.c create mode 100644 contrib/pg_plan_advice/pgpa_output.h create mode 100644 contrib/pg_plan_advice/pgpa_parser.y create mode 100644 contrib/pg_plan_advice/pgpa_planner.c create mode 100644 contrib/pg_plan_advice/pgpa_planner.h create mode 100644 contrib/pg_plan_advice/pgpa_scan.c create mode 100644 contrib/pg_plan_advice/pgpa_scan.h create mode 100644 contrib/pg_plan_advice/pgpa_scanner.l create mode 100644 contrib/pg_plan_advice/pgpa_trove.c create mode 100644 contrib/pg_plan_advice/pgpa_trove.h create mode 100644 contrib/pg_plan_advice/pgpa_walker.c create mode 100644 contrib/pg_plan_advice/pgpa_walker.h create mode 100644 contrib/pg_plan_advice/sql/gather.sql create mode 100644 contrib/pg_plan_advice/sql/join_order.sql create mode 100644 contrib/pg_plan_advice/sql/join_strategy.sql create mode 100644 contrib/pg_plan_advice/sql/local_collector.sql create mode 100644 contrib/pg_plan_advice/sql/partitionwise.sql create mode 100644 contrib/pg_plan_advice/sql/prepared.sql create mode 100644 contrib/pg_plan_advice/sql/scan.sql create mode 100644 contrib/pg_plan_advice/sql/semijoin.sql create mode 100644 contrib/pg_plan_advice/sql/syntax.sql create mode 100644 contrib/pg_plan_advice/t/001_regress.pl create mode 100644 doc/src/sgml/pgplanadvice.sgml diff --git a/contrib/Makefile b/contrib/Makefile index 2f0a88d3f7..dd04c20acd 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -34,6 +34,7 @@ SUBDIRS = \ pg_freespacemap \ pg_logicalinspect \ pg_overexplain \ + pg_plan_advice \ pg_prewarm \ pg_stat_statements \ pg_surgery \ diff --git a/contrib/meson.build b/contrib/meson.build index def13257cb..5a752eac34 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -48,6 +48,7 @@ subdir('pgcrypto') subdir('pg_freespacemap') subdir('pg_logicalinspect') subdir('pg_overexplain') +subdir('pg_plan_advice') subdir('pg_prewarm') subdir('pgrowlocks') subdir('pg_stat_statements') diff --git a/contrib/pg_plan_advice/.gitignore b/contrib/pg_plan_advice/.gitignore new file mode 100644 index 0000000000..19a1425301 --- /dev/null +++ b/contrib/pg_plan_advice/.gitignore @@ -0,0 +1,3 @@ +/pgpa_parser.h +/pgpa_parser.c +/pgpa_scanner.c diff --git a/contrib/pg_plan_advice/Makefile b/contrib/pg_plan_advice/Makefile new file mode 100644 index 0000000000..1d4c559aed --- /dev/null +++ b/contrib/pg_plan_advice/Makefile @@ -0,0 +1,50 @@ +# contrib/pg_plan_advice/Makefile + +MODULE_big = pg_plan_advice +OBJS = \ + $(WIN32RES) \ + pg_plan_advice.o \ + pgpa_ast.o \ + pgpa_collector.o \ + pgpa_identifier.o \ + pgpa_join.o \ + pgpa_output.o \ + pgpa_parser.o \ + pgpa_planner.o \ + pgpa_scan.o \ + pgpa_scanner.o \ + pgpa_trove.o \ + pgpa_walker.o + +EXTENSION = pg_plan_advice +DATA = pg_plan_advice--1.0.sql +PGFILEDESC = "pg_plan_advice - 
help the planner get the right plan" + +REGRESS = gather join_order join_strategy partitionwise scan +TAP_TESTS = 1 + +EXTRA_CLEAN = pgpa_parser.h pgpa_parser.c pgpa_scanner.c + +# required for 001_regress.pl +REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX) +export REGRESS_SHLIB + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_plan_advice +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# See notes in src/backend/parser/Makefile about the following two rules +pgpa_parser.h: pgpa_parser.c + touch $@ + +pgpa_parser.c: BISONFLAGS += -d + +# Force these dependencies to be known even without dependency info built: +pgpa_parser.o pgpa_scanner.o: pgpa_parser.h diff --git a/contrib/pg_plan_advice/README b/contrib/pg_plan_advice/README new file mode 100644 index 0000000000..0b888fd82f --- /dev/null +++ b/contrib/pg_plan_advice/README @@ -0,0 +1,260 @@ +contrib/pg_plan_advice/README + +Plan Advice +=========== + +This module implements a mini-language for "plan advice" that allows for +control of certain key planner decisions. Goals include (1) enforcing plan +stability (my previous plan was good and I would like to keep getting a +similar one) and (2) allowing users to experiment with plans other than +the one preferred by the optimizer. Non-goals include (1) controlling +every possible planner decision and (2) forcing consideration of plans +that the optimizer rejects for reasons other than cost. (There is some +room for bikeshedding about what exactly this non-goal means: what if +we skip path generation entirely for a certain case on the theory that +we know it cannot win on cost? Does that count as a cost-based rejection +even though no cost was ever computed?) + +Generally, plan advice is a series of whitespace-separated advice items, +each of which applies an advice tag to a list of advice targets. For +example, "SEQ_SCAN(foo) HASH_JOIN(bar@ss)" contains two items of advice, +the first of which applies the SEQ_SCAN tag to "foo" and the second of +which applies the HASH_JOIN tag to "bar@ss". In this simple example, each +target identifies a single relation; see "Relation Identifiers", below. +Advice tags can also be applied to groups of relations; for example, +"HASH_JOIN(baz (bletch quux))" applies the HASH_JOIN tag to the single +relation identifier "baz" as well as to the 2-item list containing +"bletch" and "quux". + +Critically, this module knows both how to generate plan advice from an +already-existing plan, and also how to enforce it during future planning +cycles. Everything it does is intended to be "round-trip safe": if you +generate advice from a plan and then feed that back into a future planning +cycle, each piece of advice should be guaranteed to apply to exactly the +same part of the query from which it was generated without ambiguity or +guesswork, and it should successfully enforce the same planning decision that +led to it being generated in the first place. Note that there is no +intention that these guarantees hold in the presence of intervening DDL; +e.g. if you change the properties of a function so that a subquery is no +longer inlined, or if you drop an index named in the plan advice, the advice +isn't going to work any more. That's expected. + +This module aims to force the planner to follow any provided advice without +regard to whether it appears to be good advice or bad advice.
+
+This module aims to force the planner to follow any provided advice without
+regard to whether it appears to be good advice or bad advice. If the
+user provides bad advice, whether derived from a previously-generated plan
+or manually written, they may get a bad plan. We regard this as user error,
+not a defect in this module. It seems likely that applying advice
+judiciously and only when truly required to avoid problems will be a more
+successful strategy than applying it with a broad brush, but users are free
+to experiment with whatever strategies they think best.
+
+Relation Identifiers
+====================
+
+Uniquely identifying the part of a query to which a certain piece of
+advice applies is harder than it sounds. Our basic approach is to use
+relation aliases as a starting point, and then disambiguate. There are
+three ways that the same relation alias can occur multiple times:
+
+1. It can appear in more than one subquery.
+
+2. It can appear more than once in the same subquery,
+   e.g. (foo JOIN bar) x JOIN foo.
+
+3. The table can be partitioned.
+
+Any combination of these things can occur simultaneously. Therefore, our
+general syntax for a relation identifier is:
+
+alias_name#occurrence_number/partition_schema.partition_name@plan_name
+
+All components except for the alias_name are optional and included only
+when required. When a component is omitted, the associated punctuation
+must also be omitted. Occurrence numbers are counted ignoring children of
+partitioned tables. When the generated occurrence number is 1, we omit
+the occurrence number. The partition schema and partition name are included
+only for children of partitioned tables. In generated advice, the
+partition_schema is always included whenever there is a partition_name,
+but user-written advice may mention the name and omit the schema. The
+plan_name is omitted for the top-level PlannerInfo.
+
+Scan Advice
+===========
+
+For many types of scan, no advice is generated or possible; for instance,
+a subquery is always scanned using a subquery scan. While that scan may be
+elided via setrefs processing, this doesn't change the fact that only one
+basic approach exists. Hence, scan advice applies mostly to relations, which
+can be scanned in multiple ways.
+
+We tend to think of a scan as targeting a single relation, and that's
+normally the case, but it doesn't have to be. For instance, if a join is
+proven empty, the whole thing may be replaced with a single Result node
+which, in effect, is a degenerate scan of every relation in the collapsed
+portion of the join tree. Similarly, it's possible to inject a custom scan
+in such a way that it replaces an entire join. If we ever emit advice
+for these cases, it would target sets of relation identifiers surrounded
+by parentheses, e.g. SOME_SORT_OF_SCAN(foo (bar baz)) would mean that
+the given scan type would be used for foo as a single relation and also the
+combination of bar and baz as a join product. We have no such cases at
+present.
+
+For index and index-only scans, both the relation being scanned and the
+index or indexes being used must be specified. For example, INDEX_SCAN(foo
+foo_a_idx bar bar_b_idx) indicates that an index scan (not an index-only
+scan) should be used on foo_a_idx when scanning foo, and that an index scan
+should be used on bar_b_idx when scanning bar.
+
+Bitmap heap scans currently do not allow for an index specification:
+BITMAP_HEAP_SCAN(foo bar) simply means that each of foo and bar should use
+some sort of bitmap heap scan.
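+
+As a small sketch (the table and index names are only illustrative):
+
+    -- Scan foo sequentially, scan bar using the index bar_b_idx, and use
+    -- some sort of bitmap heap scan for baz:
+    SET pg_plan_advice.advice =
+        'SEQ_SCAN(foo) INDEX_SCAN(bar bar_b_idx) BITMAP_HEAP_SCAN(baz)';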
+
+Join Order Advice
+=================
+
+The JOIN_ORDER tag specifies the order in which several tables that are
+part of the same join problem should be joined. Each subquery (except for
+those that are inlined) is a separate join problem. Within a subquery,
+partitionwise joins can create additional, separate join problems. Hence,
+queries involving partitionwise joins may use JOIN_ORDER() many times.
+
+We take the canonical join structure to be an outer-deep tree, so
+JOIN_ORDER(t1 t2 t3) says that t1 is the driving table and should be joined
+first to t2 and then to t3. If the join problem involves additional tables,
+they can be joined in any order after the join between t1, t2, and t3 has
+been constructed. Generated join advice always mentions all tables
+in the join problem, but manually written join advice need not do so.
+
+For trees which are not outer-deep, parentheses can be used. For example,
+JOIN_ORDER(t1 (t2 t3)) says that the top-level join should have t1 on the
+outer side and a join between t2 and t3 on the inner side. That join should
+be constructed so that t2 is on the outer side and t3 is on the inner side.
+
+In some cases, it's not possible to fully specify the join order in this way.
+For example, if t2 and t3 are being scanned by a single custom scan or foreign
+scan, or if a partitionwise join is being performed between those tables, then
+it's impossible to say that t2 is the outer table and t3 is the inner table,
+or the other way around; it's just undefined. In such cases, we generate
+join advice that uses curly braces, intending to indicate a lack of ordering:
+JOIN_ORDER(t1 {t2 t3}) says that the uppermost join should have t1 on the outer
+side and some kind of join between t2 and t3 on the inner side, but without
+saying how that join must be performed or anything about which relation should
+appear on which side of the join, or even whether this kind of join has sides.
+
+Join Strategy Advice
+====================
+
+Tags such as NESTED_LOOP_PLAIN specify the method that should be used to
+perform a certain join. More specifically, NESTED_LOOP_PLAIN(x (y z)) says
+that the plan should put the relation whose identifier is "x" on the inner
+side of a plain nested loop (one without materialization or memoization)
+and that it should also put a join between the relation whose identifier is
+"y" and the relation whose identifier is "z" on the inner side of a nested
+loop. Hence, for an N-table join problem, there will be N-1 pieces of join
+strategy advice; no join strategy advice is required for the outermost
+table in the join problem.
+
+Considering that we have both join order advice and join strategy advice,
+it might seem natural to say that NESTED_LOOP_PLAIN(x) should be redefined
+to mean that x should appear by itself on one side or the other of a nested
+loop, rather than specifically on the inner side, but this definition appears
+useless in practice. It gives the planner too much freedom to do things that
+bear little resemblance to what the user probably had in mind. This makes
+only a limited amount of practical difference in the case of a merge join or
+unparameterized nested loop, but for a parameterized nested loop or a hash
+join, the two sides are treated very differently, and saying that a certain
+relation should be involved in one of those operations without saying which
+role it should take isn't saying much.
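+
+To illustrate (the table names are again hypothetical), complete advice
+for a three-table join problem might look like
+
+    JOIN_ORDER(t1 t2 t3) HASH_JOIN(t2) NESTED_LOOP_PLAIN(t3)
+
+which makes t1 the driving table, puts t2 on the inner (hashed) side of a
+hash join, and then puts t3 on the inner side of a plain nested loop.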
+
+This choice of definition implies that join strategy advice also imposes
+some join order constraints. For example, given a join between foo and bar,
+HASH_JOIN(bar) implies that foo is the driving table. Otherwise, it would
+be impossible to put bar beneath the inner side of a Hash Join.
+
+Note that, given this definition, it's reasonable to consider deleting the
+join order advice but applying the join strategy advice. For example,
+consider a star schema with tables fact, dim1, dim2, dim3, dim4, and dim5.
+The automatically generated advice might specify JOIN_ORDER(fact dim1 dim3
+dim4 dim2 dim5) HASH_JOIN(dim2 dim4) NESTED_LOOP_PLAIN(dim1 dim3 dim5).
+Deleting the JOIN_ORDER advice allows the planner to reorder the joins
+however it likes while still forcing the same choice of join method. This
+seems potentially useful, and is one reason why a unified syntax that controls
+both join order and join method in a single locution was not chosen.
+
+Advice Completeness
+===================
+
+An essential guiding principle is that no inference may be made on the basis
+of the absence of advice. The user is entitled to remove any portion of the
+generated advice which they deem unsuitable or counterproductive, and the
+result should only be to increase the flexibility afforded to the planner.
+This means that if advice can say that a certain optimization or technique
+should be used, it should also be able to say that the optimization or
+technique should not be used. We should never assume that the absence of an
+instruction to do a certain thing means that it should not be done; all
+instructions must be explicit.
+
+Semijoin Uniqueness
+===================
+
+Faced with a semijoin, the planner considers both a direct implementation
+and a plan where one side is made unique and an inner join is then
+performed. We emit SEMIJOIN_UNIQUE() advice when this transformation occurs
+and SEMIJOIN_NON_UNIQUE() advice when it doesn't. These items work like
+join strategy advice: the inner side of the relevant join is named, and the
+chosen join order must be compatible for the advice to have any effect.
+
+Partitionwise
+=============
+
+PARTITIONWISE() advice can be used to specify both those partitionwise joins
+which should be performed and those which should not be performed; the idea
+is that each argument to PARTITIONWISE() specifies an exact set of relations
+that should be joined to each other partitionwise, and to nothing else.
+Hence, for example, PARTITIONWISE((t1 t2) t3) specifies that the
+query should contain a partitionwise join between t1 and t2 and that t3
+should not be part of any partitionwise join. If there are no other rels
+in the query, specifying just PARTITIONWISE((t1 t2)) would have the same
+effect, since there would be no other rels to which t3 could be joined in
+a partitionwise fashion.
+
+Parallel Query (Gather, etc.)
+=============================
+
+Each argument to GATHER() or GATHER_MERGE() is a single relation or an
+exact set of relations on top of which a Gather or Gather Merge node,
+respectively, should be placed. Each argument to NO_GATHER() is a single
+relation that should not appear beneath any Gather or Gather Merge node;
+that is, parallelism should not be used.
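+
+For example (using the aliases from this module's gather regression test):
+
+    -- Place a single Gather on top of the join of f and d:
+    SET pg_plan_advice.advice = 'GATHER((f d))';
+    -- Place a Gather Merge over the scan of f; no parallelism for d:
+    SET pg_plan_advice.advice = 'GATHER_MERGE(f) NO_GATHER(d)';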
+
+Implicit Join Order Constraints
+===============================
+
+When JOIN_ORDER() advice is not provided for a particular join problem,
+other pieces of advice may still incidentally constrain the join order.
+For example, a user who specifies HASH_JOIN((foo bar)) is explicitly saying
+that there should be a hash join with exactly foo and bar on the inner
+side of it, but that also implies that foo and bar must be joined to
+each other before either of them is joined to anything else. Otherwise,
+the join the user is attempting to constrain won't actually occur in the
+query, which ends up looking like the system has just decided to ignore
+the advice altogether.
+
+Future Work
+===========
+
+We don't handle choice of aggregation: it would be nice to be able to force
+sorted or grouped aggregation. I'm guessing this can be left to future work.
+
+More seriously, we don't know anything about eager aggregation, which could
+have a large impact on the shape of the plan tree. XXX: This needs some study
+to determine how large a problem it is, and might need to be fixed sooner
+rather than later.
+
+We don't offer any control over estimates, only outcomes. It seems like a
+good idea to incorporate that ability at some future point, as pg_hint_plan
+does. However, since the primary goal of the initial development work is to
+be able to induce the planner to recreate a desired plan that worked well in
+the past, this has not been included in the initial development effort.
+
+XXX Need to investigate whether and how well supplying advice works with GEQO
diff --git a/contrib/pg_plan_advice/expected/gather.out b/contrib/pg_plan_advice/expected/gather.out
new file mode 100644
index 0000000000..0cc0dedf85
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/gather.out
@@ -0,0 +1,371 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 1;
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET debug_parallel_query = off;
+CREATE TABLE gt_dim (id serial primary key, dim text)
+  WITH (autovacuum_enabled = false);
+INSERT INTO gt_dim (dim) SELECT random()::text FROM generate_series(1,100) g;
+VACUUM ANALYZE gt_dim;
+CREATE TABLE gt_fact (
+  id int not null,
+  dim_id integer not null references gt_dim (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO gt_fact
+  SELECT g, (g%3)+1 FROM generate_series(1,100000) g;
+VACUUM ANALYZE gt_fact;
+-- By default, we expect Gather Merge with a parallel hash join.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+                       QUERY PLAN
+-------------------------------------------------------
+ Gather Merge
+   Workers Planned: 1
+   ->  Sort
+         Sort Key: f.dim_id
+         ->  Parallel Hash Join
+               Hash Cond: (f.dim_id = d.id)
+               ->  Parallel Seq Scan on gt_fact f
+               ->  Parallel Hash
+                     ->  Parallel Seq Scan on gt_dim d
+ Generated Plan Advice:
+   JOIN_ORDER(f d)
+   HASH_JOIN(d)
+   SEQ_SCAN(f d)
+   GATHER_MERGE((f d))
+(14 rows)
+
+-- Force Gather or Gather Merge of both relations together.
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE((f d)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER_MERGE((f d)) +(16 rows) + +SET LOCAL pg_plan_advice.advice = 'gather((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: f.dim_id + -> Gather + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER((f d)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER((f d)) +(16 rows) + +COMMIT; +-- Force a separate Gather or Gather Merge operation for each relation. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Seq Scan on gt_fact f + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: d.id + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE(f) /* matched */ + GATHER_MERGE(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER_MERGE(f d) +(20 rows) + +SET LOCAL pg_plan_advice.advice = 'gather(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Sort + Sort Key: f.dim_id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_fact f + -> Sort + Sort Key: d.id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER(f) /* matched */ + GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER(f d) +(20 rows) + +SET LOCAL pg_plan_advice.advice = 'gather((d d/d.d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Seq Scan on gt_fact f + -> Index Scan using gt_dim_pkey on gt_dim d + Supplied Plan Advice: + GATHER((d d/d.d)) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + GATHER_MERGE(f) + NO_GATHER(d) +(17 rows) + +COMMIT; +-- Force a Gather or Gather Merge on one relation but no parallelism on other. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge(f) no_gather(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Seq Scan on gt_fact f + -> Index Scan using gt_dim_pkey on gt_dim d + Supplied Plan Advice: + GATHER_MERGE(f) /* matched */ + NO_GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + GATHER_MERGE(f) + NO_GATHER(d) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'gather_merge(d) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Sort + Sort Key: f.dim_id + -> Seq Scan on gt_fact f + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: d.id + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE(d) /* matched */ + NO_GATHER(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER_MERGE(d) + NO_GATHER(f) +(19 rows) + +SET LOCAL pg_plan_advice.advice = 'gather(f) no_gather(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using gt_dim_pkey on gt_dim d + -> Sort + Sort Key: f.dim_id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_fact f + Supplied Plan Advice: + GATHER(f) /* matched */ + NO_GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + GATHER(f) + NO_GATHER(d) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'gather(d) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Sort + Sort Key: f.dim_id + -> Seq Scan on gt_fact f + -> Sort + Sort Key: d.id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER(d) /* matched */ + NO_GATHER(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER(d) + NO_GATHER(f) +(19 rows) + +COMMIT; +-- Force no Gather or Gather Merge use at all. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'no_gather(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------ + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using gt_dim_pkey on gt_dim d + -> Sort + Sort Key: f.dim_id + -> Seq Scan on gt_fact f + Supplied Plan Advice: + NO_GATHER(f) /* matched */ + NO_GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + NO_GATHER(f d) +(15 rows) + +COMMIT; +-- Can't force Gather Merge without the ORDER BY clause, but just Gather is OK. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------------------- + Gather + Disabled: true + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE((f d)) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER((f d)) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'gather((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------------------- + Gather + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER((f d)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER((f d)) +(14 rows) + +COMMIT; +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather((f d)) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER((f d)) /* matched, conflicting, failed */ + NO_GATHER(f) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER_MERGE((f d)) +(17 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/join_order.out b/contrib/pg_plan_advice/expected/join_order.out new file mode 100644 index 0000000000..db0dcef701 --- /dev/null +++ b/contrib/pg_plan_advice/expected/join_order.out @@ -0,0 +1,509 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE jo_dim1 (id integer primary key, dim1 text, val1 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,100) g; +VACUUM ANALYZE jo_dim1; +CREATE TABLE jo_dim2 (id integer primary key, dim2 text, val2 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim2 (id, dim2, val2) + SELECT g, 'some filler text ' || g, (g % 7) + 1 + FROM generate_series(1,1000) g; +VACUUM ANALYZE jo_dim2; +CREATE TABLE jo_fact ( + id int primary key, + dim1_id integer not null references jo_dim1 (id), + dim2_id integer not null references jo_dim2 (id) +) WITH (autovacuum_enabled = false); +INSERT INTO jo_fact + SELECT g, (g%100)+1, (g%100)+1 FROM generate_series(1,100000) g; +VACUUM ANALYZE jo_fact; +-- We expect to join to d2 first and then d1, since the condition on d2 +-- is more selective. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + HASH_JOIN(d2 d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(f d1 d2) +(16 rows) + +-- Force a few different join orders. Some of these are very inefficient, +-- but the planner considers them all viable. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(f d1 d2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d1 d2) + HASH_JOIN(d1 d2) + SEQ_SCAN(f d1 d2) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(f d2 d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + HASH_JOIN(d2 d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d1 f d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +----------------------------------------- + Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Hash Join + Hash Cond: (d1.id = f.dim1_id) + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(d1 f d2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d1 f d2) + HASH_JOIN(f d2) + SEQ_SCAN(d1 f d2) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Merge Cond: ((f.dim2_id = d2.id) AND (f.dim1_id = d1.id)) + -> Sort + Sort Key: f.dim2_id, f.dim1_id + -> Seq Scan on jo_fact f + -> Sort + Sort Key: d2.id, d1.id + -> Nested Loop + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Materialize + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(f (d1 d2)) /* matched */ + Generated 
Plan Advice: + JOIN_ORDER(f (d1 d2)) + MERGE_JOIN_PLAIN((d1 d2)) + NESTED_LOOP_MATERIALIZE(d2) + SEQ_SCAN(f d1 d2) + NO_GATHER(f d1 d2) +(21 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f {d1 d2})'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Merge Cond: ((f.dim2_id = d2.id) AND (f.dim1_id = d1.id)) + -> Sort + Sort Key: f.dim2_id, f.dim1_id + -> Seq Scan on jo_fact f + -> Sort + Sort Key: d2.id, d1.id + -> Nested Loop + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Materialize + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(f {d1 d2}) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f (d2 d1)) + MERGE_JOIN_PLAIN((d1 d2)) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(f d1 d2) +(21 rows) + +COMMIT; +-- Force a join order by mentioning just a prefix of the join list. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------ + Hash Join + Hash Cond: (d2.id = f.dim2_id) + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Hash + -> Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(d2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d2 (f d1)) + HASH_JOIN(d1 (f d1)) + SEQ_SCAN(d2 f d1) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Merge Cond: ((d2.id = f.dim2_id) AND (d1.id = f.dim1_id)) + -> Sort + Sort Key: d2.id, d1.id + -> Nested Loop + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Materialize + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Sort + Sort Key: f.dim2_id, f.dim1_id + -> Seq Scan on jo_fact f + Supplied Plan Advice: + JOIN_ORDER(d2 d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d2 d1 f) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(d2 d1 f) + NO_GATHER(f d1 d2) +(21 rows) + +COMMIT; +-- jo_fact is not partitioned, but let's try pretending that it is and +-- verifying that the advice does not apply. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Disabled: true + -> Nested Loop + Disabled: true + -> Seq Scan on jo_fact f + -> Index Scan using jo_dim1_pkey on jo_dim1 d1 + Index Cond: (id = f.dim1_id) + Filter: (val1 = 1) + -> Index Scan using jo_dim2_pkey on jo_dim2 d2 + Index Cond: (id = f.dim2_id) + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(f/d1 d1 d2) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(f d1 d2) + NESTED_LOOP_PLAIN(d1 d2) + SEQ_SCAN(f) + INDEX_SCAN(d1 public.jo_dim1_pkey d2 public.jo_dim2_pkey) + NO_GATHER(f d1 d2) +(19 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +-------------------------------------------------------------- + Nested Loop + Disabled: true + Join Filter: ((d1.id = f.dim1_id) AND (d2.id = f.dim2_id)) + -> Nested Loop + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Materialize + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Seq Scan on jo_fact f + Supplied Plan Advice: + JOIN_ORDER(f/d1 (d1 d2)) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(d1 d2 f) + NESTED_LOOP_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d2) + SEQ_SCAN(d1 d2 f) + NO_GATHER(f d1 d2) +(18 rows) + +COMMIT; +-- The unusual formulation of this query is intended to prevent the query +-- planner from reducing the FULL JOIN to some other join type, so that we +-- can test what happens with a join type that cannot be reordered. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Materialize + -> Seq Scan on jo_dim1 d1 + Generated Plan Advice: + JOIN_ORDER(d2 f d1) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(d2 f d1) + NO_GATHER(d1 f d2) +(18 rows) + +-- We should not be able to force the planner to join f to d1 first, because +-- that is not a valid join order, but we should be able to force the planner +-- to make either d2 or f the driving table. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Disabled: true + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Disabled: true + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(f d1 d2) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(d2 f d1) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_PLAIN(d1) + SEQ_SCAN(d2 f d1) + NO_GATHER(d1 f d2) +(21 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((f.dim2_id + 0)) = ((d2.id + 0))) + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Materialize + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(f d2 d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + MERGE_JOIN_PLAIN(d2) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(d1 f d2) +(20 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d2 f d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Materialize + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(d2 f d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d2 f d1) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(d2 f d1) + NO_GATHER(d1 f d2) +(20 rows) + +COMMIT; +-- Two incompatible join orders should conflict. In the second case, +-- the conflict is implicit: if d1 is on the inner side of a join of any +-- type, it cannot also be the driving table. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f) join_order(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((f.dim2_id + 0)) = ((d2.id + 0))) + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Materialize + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(f) /* matched, conflicting */ + JOIN_ORDER(d1) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + MERGE_JOIN_PLAIN(d2) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(d1 f d2) +(21 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d1) hash_join(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +--------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Seq Scan on jo_dim1 d1 + -> Materialize + -> Merge Full Join + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + Supplied Plan Advice: + JOIN_ORDER(d1) /* matched, conflicting */ + HASH_JOIN(d1) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(d1 (d2 f)) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE((f d2)) + SEQ_SCAN(d1 d2 f) + NO_GATHER(d1 f d2) +(21 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/join_strategy.out b/contrib/pg_plan_advice/expected/join_strategy.out new file mode 100644 index 0000000000..0f9db69219 --- /dev/null +++ b/contrib/pg_plan_advice/expected/join_strategy.out @@ -0,0 +1,339 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE join_dim (id serial primary key, dim text) + WITH (autovacuum_enabled = false); +INSERT INTO join_dim (dim) SELECT random()::text FROM generate_series(1,100) g; +VACUUM ANALYZE join_dim; +CREATE TABLE join_fact ( + id int primary key, + dim_id integer not null references join_dim (id) +) WITH (autovacuum_enabled = false); +INSERT INTO join_fact + SELECT g, (g%3)+1 FROM generate_series(1,100000) g; +CREATE INDEX join_fact_dim_id ON join_fact (dim_id); +VACUUM ANALYZE join_fact; +-- We expect a hash join by default. +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + NO_GATHER(f d) +(10 rows) + +-- Try forcing each join method in turn with join_dim as the inner table. +-- All of these should work except for MERGE_JOIN_MATERIALIZE; that will +-- fail, because the planner knows that join_dim (id) is unique, and will +-- refuse to add mark/restore overhead. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Supplied Plan Advice: + HASH_JOIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Disabled: true + Merge Cond: (f.dim_id = d.id) + -> Index Scan using join_fact_dim_id on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Supplied Plan Advice: + MERGE_JOIN_MATERIALIZE(d) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + INDEX_SCAN(f public.join_fact_dim_id d public.join_dim_pkey) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Index Scan using join_fact_dim_id on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Supplied Plan Advice: + MERGE_JOIN_PLAIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + INDEX_SCAN(f public.join_fact_dim_id d public.join_dim_pkey) + NO_GATHER(f d) +(11 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------- + Nested Loop + Join Filter: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Materialize + -> Seq Scan on join_dim d + Supplied Plan Advice: + NESTED_LOOP_MATERIALIZE(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_MATERIALIZE(d) + SEQ_SCAN(f d) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------- + Nested Loop + -> Seq Scan on join_fact f + -> Memoize + Cache Key: f.dim_id + Cache Mode: logical + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + NESTED_LOOP_MEMOIZE(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_MEMOIZE(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + NESTED_LOOP_PLAIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(12 rows) + +COMMIT; +-- Now try forcing each join method in turn with join_fact as the inner +-- table. All of these should work. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------- + Hash Join + Hash Cond: (d.id = f.dim_id) + -> Seq Scan on join_dim d + -> Hash + -> Seq Scan on join_fact f + Supplied Plan Advice: + HASH_JOIN(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + HASH_JOIN(f) + SEQ_SCAN(d f) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using join_dim_pkey on join_dim d + -> Materialize + -> Index Scan using join_fact_dim_id on join_fact f + Supplied Plan Advice: + MERGE_JOIN_MATERIALIZE(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_MATERIALIZE(f) + INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using join_dim_pkey on join_dim d + -> Index Scan using join_fact_dim_id on join_fact f + Supplied Plan Advice: + MERGE_JOIN_PLAIN(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id) + NO_GATHER(f d) +(11 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------- + Nested Loop + Join Filter: (f.dim_id = d.id) + -> Seq Scan on join_dim d + -> Materialize + -> Seq Scan on join_fact f + Supplied Plan Advice: + NESTED_LOOP_MATERIALIZE(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + NESTED_LOOP_MATERIALIZE(f) + SEQ_SCAN(d f) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------------------------- + Nested Loop + -> Seq Scan on join_dim d + -> Memoize + Cache Key: d.id + Cache Mode: logical + -> Index Scan using join_fact_dim_id on join_fact f + Index Cond: (dim_id = d.id) + Supplied Plan Advice: + NESTED_LOOP_MEMOIZE(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + NESTED_LOOP_MEMOIZE(f) + SEQ_SCAN(d) + INDEX_SCAN(f public.join_fact_dim_id) + NO_GATHER(f d) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------------------- + Nested Loop + -> Seq Scan on join_dim d + -> Index Scan using join_fact_dim_id on join_fact f + Index Cond: (dim_id = d.id) + Supplied Plan Advice: + NESTED_LOOP_PLAIN(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + NESTED_LOOP_PLAIN(f) + SEQ_SCAN(d) + INDEX_SCAN(f public.join_fact_dim_id) + NO_GATHER(f d) +(12 rows) + +COMMIT; +-- Non-working cases. 
We can't force a foreign join between these tables, +-- because they aren't foreign tables. We also can't use two different +-- strategies on the same table, nor can we put both tables on the inner +-- side of the same join. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'FOREIGN_JOIN((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + Disabled: true + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + FOREIGN_JOIN((f d)) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(13 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f) NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +----------------------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using join_dim_pkey on join_dim d + -> Index Scan using join_fact_dim_id on join_fact f + Supplied Plan Advice: + NESTED_LOOP_PLAIN(f) /* matched, conflicting, failed */ + NESTED_LOOP_MATERIALIZE(f) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + Disabled: true + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + NESTED_LOOP_PLAIN(f) /* matched, failed */ + NESTED_LOOP_PLAIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(14 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/local_collector.out b/contrib/pg_plan_advice/expected/local_collector.out new file mode 100644 index 0000000000..30c07682ce --- /dev/null +++ b/contrib/pg_plan_advice/expected/local_collector.out @@ -0,0 +1,69 @@ +CREATE EXTENSION pg_plan_advice; +SET debug_parallel_query = off; +-- Try clearing advice before we've collected any. +SELECT pg_clear_collected_local_advice(); + pg_clear_collected_local_advice +--------------------------------- + +(1 row) + +-- Set a small advice collection limit so that we'll exceed it. +SET pg_plan_advice.local_collection_limit = 2; +-- Enable the collector. +SET pg_plan_advice.local_collector = on; +-- Set up a dummy table. +CREATE TABLE dummy_table (a int primary key, b text) + WITH (autovacuum_enabled = false, parallel_workers = 0); +-- Test queries. +SELECT * FROM dummy_table a, dummy_table b; + a | b | a | b +---+---+---+--- +(0 rows) + +SELECT * FROM dummy_table; + a | b +---+--- +(0 rows) + +-- Should return the advice from the second test query. +SET pg_plan_advice.local_collector = off; +SELECT advice FROM pg_get_collected_local_advice() ORDER BY id DESC LIMIT 1; + advice +------------------------ + SEQ_SCAN(dummy_table) + + NO_GATHER(dummy_table) +(1 row) + +-- Now try clearing advice again. 
+SELECT pg_clear_collected_local_advice(); + pg_clear_collected_local_advice +--------------------------------- + +(1 row) + +-- Raise the collection limit so that the collector uses multiple chunks. +SET pg_plan_advice.local_collection_limit = 2000; +SET pg_plan_advice.local_collector = on; +-- Push a bunch of queries through the collector. +DO $$ +BEGIN + FOR x IN 1..2000 LOOP + EXECUTE 'SELECT * FROM dummy_table'; + END LOOP; +END +$$; +-- Check that the collector worked. +SELECT COUNT(*) FROM pg_get_collected_local_advice(); + count +------- + 2000 +(1 row) + +-- And clear one more time, to verify that this doesn't cause a problem +-- even with a larger number of entries. +SELECT pg_clear_collected_local_advice(); + pg_clear_collected_local_advice +--------------------------------- + +(1 row) + diff --git a/contrib/pg_plan_advice/expected/partitionwise.out b/contrib/pg_plan_advice/expected/partitionwise.out new file mode 100644 index 0000000000..2b3d0a8244 --- /dev/null +++ b/contrib/pg_plan_advice/expected/partitionwise.out @@ -0,0 +1,426 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET enable_partitionwise_join = true; +CREATE TABLE pt1 (id integer primary key, dim1 text, val1 int) + PARTITION BY RANGE (id); +CREATE TABLE pt1a PARTITION OF pt1 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1b PARTITION OF pt1 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1c PARTITION OF pt1 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE pt1; +CREATE TABLE pt2 (id integer primary key, dim2 text, val2 int) + PARTITION BY RANGE (id); +CREATE TABLE pt2a PARTITION OF pt2 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2b PARTITION OF pt2 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2c PARTITION OF pt2 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt2 (id, dim2, val2) + SELECT g, 'some other text ' || g, (g % 5) + 1 + FROM generate_series(1,3000,2) g; +VACUUM ANALYZE pt2; +CREATE TABLE pt3 (id integer primary key, dim3 text, val3 int) + PARTITION BY RANGE (id); +CREATE TABLE pt3a PARTITION OF pt3 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3b PARTITION OF pt3 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3c PARTITION OF pt3 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt3 (id, dim3, val3) + SELECT g, 'a third random text ' || g, (g % 7) + 1 + FROM generate_series(1,3000,3) g; +VACUUM ANALYZE pt3; +CREATE TABLE ptmismatch (id integer primary key, dimm text, valm int) + PARTITION BY RANGE (id); +CREATE TABLE ptmismatcha PARTITION OF ptmismatch + FOR VALUES FROM (1) to (1501) + WITH (autovacuum_enabled = false); +CREATE TABLE ptmismatchb PARTITION OF ptmismatch + FOR VALUES FROM (1501) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO ptmismatch (id, dimm, valm) + SELECT g, 'yet another text ' || g, (g % 2) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE ptmismatch; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + 
Append + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_1.id = pt3_1.id) + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Generated Plan Advice: + JOIN_ORDER(pt2/public.pt2a pt3/public.pt3a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt2/public.pt2a pt3/public.pt3a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(47 rows) + +-- Suppress partitionwise join, or do it just partially. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE(pt1 pt2 pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Nested Loop + -> Hash Join + Hash Cond: (pt2.id = pt3.id) + -> Append + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Append + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Append + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2.id) + Filter: (val1 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2.id) + Filter: (val1 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2.id) + Filter: (val1 = 1) + Supplied Plan Advice: + PARTITIONWISE(pt1) /* matched */ + PARTITIONWISE(pt2) /* matched */ + PARTITIONWISE(pt3) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt2 pt3 pt1) + NESTED_LOOP_PLAIN(pt1) + HASH_JOIN(pt3) + SEQ_SCAN(pt2/public.pt2a pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a + pt3/public.pt3b pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE(pt2 pt3 pt1) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(43 rows) + +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN 
+------------------------------------------------------------------------------------- + Hash Join + Hash Cond: (pt1.id = pt3.id) + -> Append + -> Hash Join + Hash Cond: (pt1_1.id = pt2_1.id) + -> Seq Scan on pt1a pt1_1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Hash Join + Hash Cond: (pt1_2.id = pt2_2.id) + -> Seq Scan on pt1b pt1_2 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash Join + Hash Cond: (pt1_3.id = pt2_3.id) + -> Seq Scan on pt1c pt1_3 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Append + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + Supplied Plan Advice: + PARTITIONWISE((pt1 pt2)) /* matched */ + PARTITIONWISE(pt3) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt1/public.pt1a pt2/public.pt2a) + JOIN_ORDER(pt1/public.pt1b pt2/public.pt2b) + JOIN_ORDER(pt1/public.pt1c pt2/public.pt2c) + JOIN_ORDER({pt1 pt2} pt3) + HASH_JOIN(pt2/public.pt2a pt2/public.pt2b pt2/public.pt2c pt3) + SEQ_SCAN(pt1/public.pt1a pt2/public.pt2a pt1/public.pt1b pt2/public.pt2b + pt1/public.pt1c pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b + pt3/public.pt3c) + PARTITIONWISE((pt1 pt2) pt3) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(47 rows) + +COMMIT; +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) (pt1 pt3))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + Disabled: true + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_1.id = pt3_1.id) + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Supplied Plan Advice: + PARTITIONWISE((pt1 pt2)) /* matched, conflicting, failed */ + PARTITIONWISE((pt1 pt3)) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(pt2/public.pt2a pt3/public.pt3a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt2/public.pt2a pt3/public.pt3a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b 
pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(51 rows) + +COMMIT; +-- Can't force a partitionwise join with a mismatched table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 ptmismatch))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, ptmismatch WHERE pt1.id = ptmismatch.id; + QUERY PLAN +--------------------------------------------------------------------------- + Nested Loop + Disabled: true + -> Append + -> Seq Scan on pt1a pt1_1 + -> Seq Scan on pt1b pt1_2 + -> Seq Scan on pt1c pt1_3 + -> Append + -> Index Scan using ptmismatcha_pkey on ptmismatcha ptmismatch_1 + Index Cond: (id = pt1.id) + -> Index Scan using ptmismatchb_pkey on ptmismatchb ptmismatch_2 + Index Cond: (id = pt1.id) + Supplied Plan Advice: + PARTITIONWISE((pt1 ptmismatch)) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(pt1 ptmismatch) + NESTED_LOOP_PLAIN(ptmismatch) + SEQ_SCAN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + INDEX_SCAN(ptmismatch/public.ptmismatcha public.ptmismatcha_pkey + ptmismatch/public.ptmismatchb public.ptmismatchb_pkey) + PARTITIONWISE(pt1 ptmismatch) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c + ptmismatch/public.ptmismatcha ptmismatch/public.ptmismatchb) +(22 rows) + +COMMIT; +-- Force join order for a particular branch of the partitionwise join with +-- and without mentioning the schema name. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + -> Nested Loop + -> Hash Join + Hash Cond: (pt3_1.id = pt2_1.id) + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Hash + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt2/public.pt2a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt3/public.pt3a pt2/public.pt2a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(49 rows) + +SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT 
* FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + -> Nested Loop + -> Hash Join + Hash Cond: (pt3_1.id = pt2_1.id) + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Hash + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt2/public.pt2a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt3/public.pt3a pt2/public.pt2a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(49 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/prepared.out b/contrib/pg_plan_advice/expected/prepared.out new file mode 100644 index 0000000000..07a7c62365 --- /dev/null +++ b/contrib/pg_plan_advice/expected/prepared.out @@ -0,0 +1,67 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE ptab (id integer, val text) WITH (autovacuum_enabled = false); +SET pg_plan_advice.always_store_advice_details = false; +-- Not prepared, so advice should be generated. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM ptab; + QUERY PLAN +------------------------ + Seq Scan on ptab + Generated Plan Advice: + SEQ_SCAN(ptab) + NO_GATHER(ptab) +(4 rows) + +-- Prepared, so advice should not be generated. +PREPARE pt1 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt1; + QUERY PLAN +------------------ + Seq Scan on ptab +(1 row) + +SET pg_plan_advice.always_store_advice_details = true; +-- Prepared, but always_store_advice_details = true, so should show advice. +PREPARE pt2 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2; + QUERY PLAN +------------------------ + Seq Scan on ptab + Generated Plan Advice: + SEQ_SCAN(ptab) + NO_GATHER(ptab) +(4 rows) + +-- Not prepared, so feedback should be generated. +SET pg_plan_advice.always_store_advice_details = false; +SET pg_plan_advice.advice = 'SEQ_SCAN(ptab)'; +EXPLAIN (COSTS OFF) +SELECT * FROM ptab; + QUERY PLAN +-------------------------------- + Seq Scan on ptab + Supplied Plan Advice: + SEQ_SCAN(ptab) /* matched */ +(3 rows) + +-- Prepared, so advice should not be generated. 
+PREPARE pt3 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF) EXECUTE pt1; + QUERY PLAN +------------------ + Seq Scan on ptab +(1 row) + +SET pg_plan_advice.always_store_advice_details = true; +-- Prepared, but always_store_advice_details = true, so should show feedback. +PREPARE pt4 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2; + QUERY PLAN +------------------------ + Seq Scan on ptab + Generated Plan Advice: + SEQ_SCAN(ptab) + NO_GATHER(ptab) +(4 rows) + diff --git a/contrib/pg_plan_advice/expected/scan.out b/contrib/pg_plan_advice/expected/scan.out new file mode 100644 index 0000000000..3f9e13b6d4 --- /dev/null +++ b/contrib/pg_plan_advice/expected/scan.out @@ -0,0 +1,757 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET seq_page_cost = 0.1; +SET random_page_cost = 0.1; +SET cpu_tuple_cost = 0; +SET cpu_index_tuple_cost = 0; +CREATE TABLE scan_table (a int primary key, b text) + WITH (autovacuum_enabled = false); +INSERT INTO scan_table + SELECT g, 'some text ' || g FROM generate_series(1, 100000) g; +CREATE INDEX scan_table_b ON scan_table USING brin (b); +VACUUM ANALYZE scan_table; +-- Sequential scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +------------------------- + Seq Scan on scan_table + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(4 rows) + +-- Index scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(5 rows) + +-- Index-only scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------ + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(5 rows) + +-- Bitmap heap scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + QUERY PLAN +----------------------------------------------- + Bitmap Heap Scan on scan_table + Recheck Cond: (b > 'some text 8'::text) + -> Bitmap Index Scan on scan_table_b + Index Cond: (b > 'some text 8'::text) + Generated Plan Advice: + BITMAP_HEAP_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +-- TID scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + QUERY PLAN +----------------------------------- + Tid Scan on scan_table + TID Cond: (ctid = '(0,1)'::tid) + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(5 rows) + +-- TID range scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + QUERY PLAN +--------------------------------------------------------------- + Tid Range Scan on scan_table + TID Cond: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid)) + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(5 rows) + +-- Try forcing each of our test queries to use the scan type they +-- wanted to use anyway. This should succeed. 
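+-- (Note that TID_SCAN is the advice tag for plain TID scans and TID range
+-- scans alike, so the same tag is reused below to force the ctid range query.)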
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(6 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + QUERY PLAN +----------------------------------------------- + Bitmap Heap Scan on scan_table + Recheck Cond: (b > 'some text 8'::text) + -> Bitmap Index Scan on scan_table_b + Index Cond: (b > 'some text 8'::text) + Supplied Plan Advice: + BITMAP_HEAP_SCAN(scan_table) /* matched */ + Generated Plan Advice: + BITMAP_HEAP_SCAN(scan_table) + NO_GATHER(scan_table) +(9 rows) + +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + QUERY PLAN +-------------------------------------- + Tid Scan on scan_table + TID Cond: (ctid = '(0,1)'::tid) + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched */ + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + QUERY PLAN +--------------------------------------------------------------- + Tid Range Scan on scan_table + TID Cond: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid)) + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched */ + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Try to force a full scan of the table to use some other scan type. All +-- of these will fail. An index scan or bitmap heap scan could potentially +-- generate the correct answer, but the planner does not even consider these +-- possibilities due to the lack of a WHERE clause. 
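+-- (When matched advice cannot be followed, the planner's preferred plan is
+-- still chosen, but it is marked "Disabled: true" and the feedback
+-- annotation reads /* matched, failed */.)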
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +------------------------------------------------------ + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + BITMAP_HEAP_SCAN(scan_table) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +---------------------------------------------- + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Try again to force index use. This should now succeed for the INDEX_SCAN +-- and BITMAP_HEAP_SCAN, but the INDEX_ONLY_SCAN can't be forced because the +-- query fetches columns not included in the index. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a > 0) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on scan_table + Disabled: true + Filter: (a > 0) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(8 rows) + +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; + QUERY PLAN +---------------------------------------------- + Bitmap Heap Scan on scan_table + Recheck Cond: (a > 0) + -> Bitmap Index Scan on scan_table_pkey + Index Cond: (a > 0) + Supplied Plan Advice: + BITMAP_HEAP_SCAN(scan_table) /* matched */ + Generated Plan Advice: + BITMAP_HEAP_SCAN(scan_table) + NO_GATHER(scan_table) +(9 rows) + +COMMIT; +-- We can force a primary key lookup to use a sequential scan, but we +-- can't force it to use an index-only scan (due to the column list) +-- or a TID scan (due to the absence of a TID qual). 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Filter: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- We can forcibly downgrade an index-only scan to an index scan, but we can't +-- force the use of an index that the planner thinks is inapplicable. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_b) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- We can force the use of a sequential scan in place of a bitmap heap scan, +-- but a plain index scan on a BRIN index is not possible. 
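+-- (A BRIN index has no amgettuple support, so the planner never generates a
+-- plain index-scan path for scan_table_b in the first place.)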
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Filter: (b > 'some text 8'::text) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_b) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- We can force the use of a sequential scan rather than a TID scan or +-- TID range scan. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Filter: (ctid = '(0,1)'::tid) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + QUERY PLAN +------------------------------------------------------------- + Seq Scan on scan_table + Filter: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid)) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Test more complex scenarios with index scans. +BEGIN; +-- Should still work if we mention the schema. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +-- But not if we mention the wrong schema. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table cilbup.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table cilbup.scan_table_pkey) /* matched, inapplicable, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +-- It's OK to repeat the same advice. 
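+-- (Each copy is matched independently and reported on its own line of the
+-- feedback.)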
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +-- But it doesn't work if the index target is even notionally different. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +---------------------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched, conflicting */ + INDEX_SCAN(scan_table public.scan_table_pkey) /* matched, conflicting */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- Test assorted incorrect advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(nothing)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------ + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(nothing) /* not matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------ + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(nothing whatsoever) /* not matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table bogus) /* matched, inapplicable, failed */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(nothing whatsoever) /* not matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan 
Advice: + INDEX_ONLY_SCAN(scan_table bogus) /* matched, inapplicable, failed */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Test our ability to refer to multiple instances of the same alias. +BEGIN; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +------------------------------------------------------------------- + Nested Loop Left Join + -> Nested Loop Left Join + -> Function Scan on generate_series g + -> Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = g.g) + -> Index Scan using scan_table_pkey on scan_table s_1 + Index Cond: (a = g.g) + Generated Plan Advice: + JOIN_ORDER(g s s#2) + NESTED_LOOP_PLAIN(s s#2) + INDEX_SCAN(s public.scan_table_pkey s#2 public.scan_table_pkey) + NO_GATHER(g s s#2) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +---------------------------------------------------------- + Nested Loop Left Join + -> Hash Left Join + Hash Cond: (g.g = s.a) + -> Function Scan on generate_series g + -> Hash + -> Seq Scan on scan_table s + -> Index Scan using scan_table_pkey on scan_table s_1 + Index Cond: (a = g.g) + Supplied Plan Advice: + SEQ_SCAN(s) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g s s#2) + NESTED_LOOP_PLAIN(s#2) + HASH_JOIN(s) + SEQ_SCAN(s) + INDEX_SCAN(s#2 public.scan_table_pkey) + NO_GATHER(g s s#2) +(17 rows) + +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +-------------------------------------------------------------- + Hash Left Join + Hash Cond: (g.g = s_1.a) + -> Nested Loop Left Join + -> Function Scan on generate_series g + -> Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = g.g) + -> Hash + -> Seq Scan on scan_table s_1 + Supplied Plan Advice: + SEQ_SCAN(s#2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g s s#2) + NESTED_LOOP_PLAIN(s) + HASH_JOIN(s#2) + SEQ_SCAN(s#2) + INDEX_SCAN(s public.scan_table_pkey) + NO_GATHER(g s s#2) +(17 rows) + +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s) SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +------------------------------------------------ + Hash Left Join + Hash Cond: (g.g = s_1.a) + -> Hash Left Join + Hash Cond: (g.g = s.a) + -> Function Scan on generate_series g + -> Hash + -> Seq Scan on scan_table s + -> Hash + -> Seq Scan on scan_table s_1 + Supplied Plan Advice: + SEQ_SCAN(s) /* matched */ + SEQ_SCAN(s#2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g s s#2) + HASH_JOIN(s s#2) + SEQ_SCAN(s s#2) + NO_GATHER(g s s#2) +(17 rows) + +COMMIT; +-- Test our ability to refer to scans within a subquery. 
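+-- (A relation inside a subquery is written as alias@subquery_alias; when the
+-- subquery itself has no alias, it is referred to as alias@unnamed_subquery.)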
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +-------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_SCAN(s@x public.scan_table_pkey) + NO_GATHER(x s@x) +(5 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +--------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey) + NO_GATHER(unnamed_subquery s@unnamed_subquery) +(5 rows) + +BEGIN; +-- Should not match. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +-------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@x public.scan_table_pkey) + NO_GATHER(x s@x) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +--------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey) + NO_GATHER(unnamed_subquery s@unnamed_subquery) +(7 rows) + +-- Should match first query only. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@x)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +------------------------------- + Seq Scan on scan_table s + Filter: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@x) /* matched */ + Generated Plan Advice: + SEQ_SCAN(s@x) + NO_GATHER(x s@x) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +--------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@x) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey) + NO_GATHER(unnamed_subquery s@unnamed_subquery) +(7 rows) + +-- Should match second query only. 
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@unnamed_subquery)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +-------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@unnamed_subquery) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@x public.scan_table_pkey) + NO_GATHER(x s@x) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +-------------------------------------------------- + Seq Scan on scan_table s + Filter: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@unnamed_subquery) /* matched */ + Generated Plan Advice: + SEQ_SCAN(s@unnamed_subquery) + NO_GATHER(unnamed_subquery s@unnamed_subquery) +(7 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/semijoin.out b/contrib/pg_plan_advice/expected/semijoin.out new file mode 100644 index 0000000000..5551c028a1 --- /dev/null +++ b/contrib/pg_plan_advice/expected/semijoin.out @@ -0,0 +1,377 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE sj_wide ( + id integer primary key, + val1 integer, + padding text storage plain +) WITH (autovacuum_enabled = false); +INSERT INTO sj_wide + SELECT g, g%10+1, repeat(' ', 300) FROM generate_series(1, 1000) g; +CREATE INDEX ON sj_wide (val1); +VACUUM ANALYZE sj_wide; +CREATE TABLE sj_narrow ( + id integer primary key, + val1 integer +) WITH (autovacuum_enabled = false); +INSERT INTO sj_narrow + SELECT g, g%10+1 FROM generate_series(1, 1000) g; +CREATE INDEX ON sj_narrow (val1); +VACUUM ANALYZE sj_narrow; +-- We expect this to make the VALUES list unique and use index lookups to +-- find the rows in sj_wide, so as to avoid a full scan of sj_wide. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +----------------------------------------------------------- + Nested Loop + -> HashAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + -> Index Scan using sj_wide_pkey on sj_wide + Index Cond: (id = "*VALUES*".column1) + Filter: (val1 = "*VALUES*".column2) + Generated Plan Advice: + JOIN_ORDER("*VALUES*" sj_wide) + NESTED_LOOP_PLAIN(sj_wide) + INDEX_SCAN(sj_wide public.sj_wide_pkey) + SEMIJOIN_UNIQUE("*VALUES*") + NO_GATHER(sj_wide "*VALUES*") +(13 rows) + +-- If we ask for a unique semijoin, we should get the same plan as with +-- no advice. If we ask for a non-unique semijoin, we should see a Semi +-- Join operation in the plan tree. 
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+                         QUERY PLAN
+-----------------------------------------------------------
+ Nested Loop
+   ->  HashAggregate
+         Group Key: "*VALUES*".column1, "*VALUES*".column2
+         ->  Values Scan on "*VALUES*"
+   ->  Index Scan using sj_wide_pkey on sj_wide
+         Index Cond: (id = "*VALUES*".column1)
+         Filter: (val1 = "*VALUES*".column2)
+ Supplied Plan Advice:
+   SEMIJOIN_UNIQUE("*VALUES*") /* matched */
+ Generated Plan Advice:
+   JOIN_ORDER("*VALUES*" sj_wide)
+   NESTED_LOOP_PLAIN(sj_wide)
+   INDEX_SCAN(sj_wide public.sj_wide_pkey)
+   SEMIJOIN_UNIQUE("*VALUES*")
+   NO_GATHER(sj_wide "*VALUES*")
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+                                         QUERY PLAN
+------------------------------------------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: ((sj_wide.id = "*VALUES*".column1) AND (sj_wide.val1 = "*VALUES*".column2))
+   ->  Seq Scan on sj_wide
+   ->  Hash
+         ->  Values Scan on "*VALUES*"
+ Supplied Plan Advice:
+   SEMIJOIN_NON_UNIQUE("*VALUES*") /* matched */
+ Generated Plan Advice:
+   JOIN_ORDER(sj_wide "*VALUES*")
+   HASH_JOIN("*VALUES*")
+   SEQ_SCAN(sj_wide)
+   SEMIJOIN_NON_UNIQUE("*VALUES*")
+   NO_GATHER(sj_wide "*VALUES*")
+(13 rows)
+
+COMMIT;
+-- Because this table is narrower than the previous one, a sequential scan
+-- is less expensive, and we choose a straightforward Semi Join plan by
+-- default.  (Note that this is also very sensitive to the length of the IN
+-- list, which affects how many index lookups the alternative plan will need.)
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+                                          QUERY PLAN
+----------------------------------------------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2))
+   ->  Seq Scan on sj_narrow
+   ->  Hash
+         ->  Values Scan on "*VALUES*"
+ Generated Plan Advice:
+   JOIN_ORDER(sj_narrow "*VALUES*")
+   HASH_JOIN("*VALUES*")
+   SEQ_SCAN(sj_narrow)
+   SEMIJOIN_NON_UNIQUE("*VALUES*")
+   NO_GATHER(sj_narrow "*VALUES*")
+(11 rows)
+
+-- Here, we expect advising a unique semijoin to switch to the same plan that
+-- we got with sj_wide, and advising a non-unique semijoin should not change
+-- the plan.
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2)) + -> Seq Scan on sj_narrow + -> Hash + -> HashAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + Supplied Plan Advice: + SEMIJOIN_UNIQUE("*VALUES*") /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow "*VALUES*") + HASH_JOIN("*VALUES*") + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE("*VALUES*") + NO_GATHER(sj_narrow "*VALUES*") +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Hash Semi Join + Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2)) + -> Seq Scan on sj_narrow + -> Hash + -> Values Scan on "*VALUES*" + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE("*VALUES*") /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow "*VALUES*") + HASH_JOIN("*VALUES*") + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE("*VALUES*") + NO_GATHER(sj_narrow "*VALUES*") +(13 rows) + +COMMIT; +-- In the above example, we made the outer side of the join unique, but here, +-- we should make the inner side unique. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(13 rows) + +-- We should be able to force a plan with or without the make-unique strategy, +-- with either side as the driving table. 
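+-- (Below, adding JOIN_ORDER(sj_narrow) flips the join so that sj_narrow
+-- becomes the driving table.)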
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +-------------------------------------------- + Hash Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + Supplied Plan Advice: + SEMIJOIN_UNIQUE(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Hash Semi Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> Seq Scan on sj_narrow + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(13 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Hash Join + Hash Cond: (sj_narrow.val1 = g.g) + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + -> Hash + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_UNIQUE(sj_narrow) /* matched */ + JOIN_ORDER(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + HASH_JOIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(16 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Hash Right Semi Join + Hash Cond: (sj_narrow.val1 = g.g) + -> Seq Scan on sj_narrow + -> Hash + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched */ + JOIN_ORDER(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + HASH_JOIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(14 rows) + +COMMIT; +-- However, mentioning the wrong side of the join should result in an advice +-- failure. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +-------------------------------------------- + Nested Loop + Disabled: true + Join Filter: (g.g = sj_narrow.val1) + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_UNIQUE(g) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + NESTED_LOOP_PLAIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Nested Loop + Disabled: true + Join Filter: (g.g = sj_narrow.val1) + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE(g) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + NESTED_LOOP_PLAIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(15 rows) + +COMMIT; +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +--------------------------------------------------------------------- + Hash Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + Supplied Plan Advice: + SEMIJOIN_UNIQUE(sj_narrow) /* matched, conflicting */ + SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(g sj_narrow) +(16 rows) + +COMMIT; +-- Try applying SEMIJOIN_UNIQUE() to a non-semijoin. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g, sj_narrow s WHERE g = s.val1; + QUERY PLAN +---------------------------------------------------------- + Merge Join + Merge Cond: (s.val1 = g.g) + -> Index Scan using sj_narrow_val1_idx on sj_narrow s + -> Sort + Sort Key: g.g + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_UNIQUE(g) /* matched, inapplicable, failed */ + Generated Plan Advice: + JOIN_ORDER(s g) + MERGE_JOIN_PLAIN(g) + INDEX_SCAN(s public.sj_narrow_val1_idx) + NO_GATHER(g s) +(13 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/syntax.out b/contrib/pg_plan_advice/expected/syntax.out new file mode 100644 index 0000000000..be61402b56 --- /dev/null +++ b/contrib/pg_plan_advice/expected/syntax.out @@ -0,0 +1,192 @@ +LOAD 'pg_plan_advice'; +-- An empty string is allowed. Empty target lists are allowed for most advice +-- tags, but not for JOIN_ORDER. "Supplied Plan Advice" should be omitted in +-- text format when there is no actual advice, but not in non-text format. 
+SET pg_plan_advice.advice = ''; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = 'SEQ_SCAN()'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = 'NESTED_LOOP_PLAIN()'; +EXPLAIN (COSTS OFF, FORMAT JSON) SELECT 1; + QUERY PLAN +-------------------------------- + [ + + { + + "Plan": { + + "Node Type": "Result", + + "Parallel Aware": false,+ + "Async Capable": false, + + "Disabled": false + + }, + + "Supplied Plan Advice": ""+ + } + + ] +(1 row) + +SET pg_plan_advice.advice = 'JOIN_ORDER()'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "JOIN_ORDER()" +DETAIL: Could not parse advice: JOIN_ORDER must have at least one target at or near ")" +-- Test assorted variations in capitalization, whitespace, and which parts of +-- the relation identifier are included. These should all work. +SET pg_plan_advice.advice = 'SEQ_SCAN(x)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +--------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'seq_scan(x@y)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x@y) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'SEQ_scan(x#2)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x#2) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'SEQ_SCAN (x/y)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x/y) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = ' SEQ_SCAN ( x / y . z ) '; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x/y.z) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'SEQ_SCAN("x"#2/"y"."z"@"t")'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x#2/y.z@t) /* not matched */ +(3 rows) + +-- Syntax errors. 
+SET pg_plan_advice.advice = 'SEQUENTIAL_SCAN(x)'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQUENTIAL_SCAN(x)" +DETAIL: Could not parse advice: syntax error at or near "SEQUENTIAL_SCAN" +SET pg_plan_advice.advice = 'SEQ_SCAN'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN" +DETAIL: Could not parse advice: syntax error at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN('; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN(" +DETAIL: Could not parse advice: syntax error at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN("'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("" +DETAIL: Could not parse advice: unterminated quoted identifier at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN("")'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("")" +DETAIL: Could not parse advice: zero-length delimited identifier at or near """ +SET pg_plan_advice.advice = 'SEQ_SCAN("a"'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("a"" +DETAIL: Could not parse advice: syntax error at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN(#'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN(#" +DETAIL: Could not parse advice: syntax error at or near "#" +SET pg_plan_advice.advice = '()'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "()" +DETAIL: Could not parse advice: syntax error at or near "(" +SET pg_plan_advice.advice = '123'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "123" +DETAIL: Could not parse advice: syntax error at or near "123" +-- Tags like SEQ_SCAN and NO_GATHER don't allow sublists at all; other tags, +-- except for JOIN_ORDER, allow at most one level of sublist. Hence, these +-- examples should error out. +SET pg_plan_advice.advice = 'SEQ_SCAN((x))'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN((x))" +DETAIL: Could not parse advice: syntax error at or near "(" +SET pg_plan_advice.advice = 'GATHER(((x)))'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "GATHER(((x)))" +DETAIL: Could not parse advice: syntax error at or near "(" +-- Legal comments. +SET pg_plan_advice.advice = '/**/'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = 'HASH_JOIN(_)/***/'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +---------------------------------- + Result + Supplied Plan Advice: + HASH_JOIN(_) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(/*x*/y)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +---------------------------------- + Result + Supplied Plan Advice: + HASH_JOIN(y) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(y//*x*/z)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------------------------------ + Result + Supplied Plan Advice: + HASH_JOIN(y/z) /* not matched */ +(3 rows) + +-- Unterminated comments. +SET pg_plan_advice.advice = '/*'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "/*" +DETAIL: Could not parse advice: unterminated comment at end of input +SET pg_plan_advice.advice = 'JOIN_ORDER("fOO") /* oops'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "JOIN_ORDER("fOO") /* oops" +DETAIL: Could not parse advice: unterminated comment at end of input +-- Nested comments are not supported, so the first of these is legal and +-- the second is not. 
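+-- ('/*/*/' lexes as the opener /*, a literal /, and the closer */, whereas
+-- in the second string the first */ already closes the comment, leaving a
+-- stray */ behind.)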
+SET pg_plan_advice.advice = '/*/*/'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = '/*/* stuff */*/'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "/*/* stuff */*/" +DETAIL: Could not parse advice: syntax error at or near "*" +-- Foreign join requires multiple relation identifiers. +SET pg_plan_advice.advice = 'FOREIGN_JOIN(a)'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "FOREIGN_JOIN(a)" +DETAIL: Could not parse advice: FOREIGN_JOIN targets must contain more than one relation identifier at or near ")" +SET pg_plan_advice.advice = 'FOREIGN_JOIN((a))'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "FOREIGN_JOIN((a))" +DETAIL: Could not parse advice: FOREIGN_JOIN targets must contain more than one relation identifier at or near ")" diff --git a/contrib/pg_plan_advice/meson.build b/contrib/pg_plan_advice/meson.build new file mode 100644 index 0000000000..f7229dddce --- /dev/null +++ b/contrib/pg_plan_advice/meson.build @@ -0,0 +1,79 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +pg_plan_advice_sources = files( + 'pg_plan_advice.c', + 'pgpa_ast.c', + 'pgpa_collector.c', + 'pgpa_identifier.c', + 'pgpa_join.c', + 'pgpa_output.c', + 'pgpa_planner.c', + 'pgpa_scan.c', + 'pgpa_trove.c', + 'pgpa_walker.c', +) + +pgpa_scanner = custom_target('pgpa_scanner', + input: 'pgpa_scanner.l', + output: 'pgpa_scanner.c', + command: flex_cmd, +) +generated_sources += pgpa_scanner +pg_plan_advice_sources += pgpa_scanner + +pgpa_parser = custom_target('pgpa_parser', + input: 'pgpa_parser.y', + kwargs: bison_kw, +) +generated_sources += pgpa_parser.to_list() +pg_plan_advice_sources += pgpa_parser + +if host_system == 'windows' + pg_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_plan_advice', + '--FILEDESC', 'pg_plan_advice - help the planner get the right plan',]) +endif + +pg_plan_advice_inc = include_directories('.') + +pg_plan_advice = shared_module('pg_plan_advice', + pg_plan_advice_sources, + include_directories: pg_plan_advice_inc, + kwargs: contrib_mod_args, +) +contrib_targets += pg_plan_advice + +install_data( + 'pg_plan_advice--1.0.sql', + 'pg_plan_advice.control', + kwargs: contrib_data_args, +) + +install_headers( + 'pg_plan_advice.h', + install_dir: dir_include_extension / 'pg_plan_advice', +) + +tests += { + 'name': 'pg_plan_advice', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'gather', + 'join_order', + 'join_strategy', + 'local_collector', + 'partitionwise', + 'prepared', + 'scan', + 'semijoin', + 'syntax', + ], + }, + 'tap': { + 'tests': [ + 't/001_regress.pl', + ], + }, +} diff --git a/contrib/pg_plan_advice/pg_plan_advice--1.0.sql b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql new file mode 100644 index 0000000000..450c42040f --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql @@ -0,0 +1,43 @@ +/* contrib/pg_plan_advice/pg_plan_advice--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_plan_advice" to load this file. 
\quit + +CREATE FUNCTION pg_clear_collected_local_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_clear_collected_shared_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_shared_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_local_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_shared_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_shared_advice' +LANGUAGE C STRICT; + +REVOKE ALL ON FUNCTION pg_clear_collected_shared_advice() FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_get_collected_shared_advice() FROM PUBLIC; diff --git a/contrib/pg_plan_advice/pg_plan_advice.c b/contrib/pg_plan_advice/pg_plan_advice.c new file mode 100644 index 0000000000..99b9784399 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.c @@ -0,0 +1,563 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.c + * main entrypoints for generating and applying planner advice + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_ast.h" +#include "pgpa_collector.h" +#include "pgpa_identifier.h" +#include "pgpa_output.h" +#include "pgpa_planner.h" +#include "pgpa_trove.h" +#include "pgpa_walker.h" + +#include "commands/defrem.h" +#include "commands/explain.h" +#include "commands/explain_format.h" +#include "commands/explain_state.h" +#include "funcapi.h" +#include "optimizer/planner.h" +#include "storage/dsm_registry.h" +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +static pgpa_shared_state *pgpa_state = NULL; +static dsa_area *pgpa_dsa_area = NULL; +static List *advisor_hook_list = NIL; + +/* GUC variables */ +char *pg_plan_advice_advice = NULL; +bool pg_plan_advice_always_store_advice_details = false; +static bool pg_plan_advice_always_explain_supplied_advice = true; +bool pg_plan_advice_feedback_warnings = false; +bool pg_plan_advice_local_collector = false; +int pg_plan_advice_local_collection_limit = 0; +bool pg_plan_advice_shared_collector = false; +int pg_plan_advice_shared_collection_limit = 0; +bool pg_plan_advice_trace_mask = false; + +/* Saved hook value */ +static explain_per_plan_hook_type prev_explain_per_plan = NULL; + +/* Other file-level globals */ +static int es_extension_id; +static MemoryContext pgpa_memory_context = NULL; + +static void pgpa_init_shared_state(void *ptr, void *arg); +static void pg_plan_advice_explain_option_handler(ExplainState *es, + DefElem *opt, + ParseState *pstate); +static void pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv); +static bool pg_plan_advice_advice_check_hook(char **newval, void **extra, + GucSource source); +static DefElem *find_defelem_by_defname(List *deflist, char *defname); + +/* + * Initialize this module. 
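+ *
+ * We define our GUCs, reserve the "pg_plan_advice" GUC prefix, register the
+ * PLAN_ADVICE option for EXPLAIN, and install our planner and EXPLAIN hooks.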
+ */ +void +_PG_init(void) +{ + DefineCustomStringVariable("pg_plan_advice.advice", + "advice to apply during query planning", + NULL, + &pg_plan_advice_advice, + NULL, + PGC_USERSET, + 0, + pg_plan_advice_advice_check_hook, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.always_explain_supplied_advice", + "EXPLAIN output includes supplied advice even without EXPLAIN (PLAN_ADVICE)", + NULL, + &pg_plan_advice_always_explain_supplied_advice, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.always_store_advice_details", + "Generate advice strings even when seemingly not required", + "Use this option to see generated advice for prepared queries.", + &pg_plan_advice_always_store_advice_details, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.feedback_warnings", + "Warn when supplied advice does not apply cleanly", + NULL, + &pg_plan_advice_feedback_warnings, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.local_collector", + "Enable the local advice collector.", + NULL, + &pg_plan_advice_local_collector, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_plan_advice.local_collection_limit", + "# of advice entries to retain in per-backend memory", + NULL, + &pg_plan_advice_local_collection_limit, + 0, + 0, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.shared_collector", + "Enable the shared advice collector.", + NULL, + &pg_plan_advice_shared_collector, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_plan_advice.shared_collection_limit", + "# of advice entries to retain in shared memory", + NULL, + &pg_plan_advice_shared_collection_limit, + 0, + 0, INT_MAX, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.trace_mask", + "Emit debugging messages showing the computed strategy mask for each relation", + NULL, + &pg_plan_advice_trace_mask, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + MarkGUCPrefixReserved("pg_plan_advice"); + + /* Get an ID that we can use to cache data in an ExplainState. */ + es_extension_id = GetExplainExtensionId("pg_plan_advice"); + + /* Register the new EXPLAIN options implemented by this module. */ + RegisterExtensionExplainOption("plan_advice", + pg_plan_advice_explain_option_handler); + + /* Install hooks */ + pgpa_planner_install_hooks(); + prev_explain_per_plan = explain_per_plan_hook; + explain_per_plan_hook = pg_plan_advice_explain_per_plan_hook; +} + +/* + * Initialize shared state when first created. + */ +static void +pgpa_init_shared_state(void *ptr, void *arg) +{ + pgpa_shared_state *state = (pgpa_shared_state *) ptr; + + LWLockInitialize(&state->lock, LWLockNewTrancheId("pg_plan_advice_lock")); + state->dsa_tranche = LWLockNewTrancheId("pg_plan_advice_dsa"); + state->area = DSA_HANDLE_INVALID; + state->shared_collector = InvalidDsaPointer; +} + +/* + * Return a pointer to a memory context where long-lived data managed by this + * module can be stored. + */ +MemoryContext +pg_plan_advice_get_mcxt(void) +{ + if (pgpa_memory_context == NULL) + pgpa_memory_context = AllocSetContextCreate(TopMemoryContext, + "pg_plan_advice", + ALLOCSET_DEFAULT_SIZES); + + return pgpa_memory_context; +} + +/* + * Get a pointer to our shared state. + * + * If no shared state exists, create and initialize it. 
If it does exist but + * this backend has not yet accessed it, attach to it. Otherwise, just return + * our cached pointer. + * + * Along the way, make sure the relevant LWLock tranches are registered. + */ +pgpa_shared_state * +pg_plan_advice_attach(void) +{ + if (pgpa_state == NULL) + { + bool found; + + pgpa_state = + GetNamedDSMSegment("pg_plan_advice", sizeof(pgpa_shared_state), + pgpa_init_shared_state, &found, NULL); + } + + return pgpa_state; +} + +/* + * Return a pointer to pg_plan_advice's DSA area, creating it if needed. + */ +dsa_area * +pg_plan_advice_dsa_area(void) +{ + if (pgpa_dsa_area == NULL) + { + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_handle area_handle; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + area_handle = state->area; + if (area_handle == DSA_HANDLE_INVALID) + { + pgpa_dsa_area = dsa_create(state->dsa_tranche); + dsa_pin(pgpa_dsa_area); + state->area = dsa_get_handle(pgpa_dsa_area); + LWLockRelease(&state->lock); + } + else + { + LWLockRelease(&state->lock); + pgpa_dsa_area = dsa_attach(area_handle); + } + + dsa_pin_mapping(pgpa_dsa_area); + + MemoryContextSwitchTo(oldcontext); + } + + return pgpa_dsa_area; +} + +/* + * Was the PLAN_ADVICE option specified and not set to false? + */ +bool +pg_plan_advice_should_explain(ExplainState *es) +{ + bool *plan_advice = NULL; + + if (es != NULL) + plan_advice = GetExplainExtensionState(es, es_extension_id); + return plan_advice != NULL && *plan_advice; +} + +/* + * Get the advice that should be used while planning a particular query. + */ +char * +pg_plan_advice_get_supplied_query_advice(PlannerGlobal *glob, + Query *parse, + const char *query_string, + int cursorOptions, + ExplainState *es) +{ + ListCell *lc; + + /* + * If any advisors are loaded, consult them. The first one that produces a + * non-NULL string wins. + */ + foreach(lc, advisor_hook_list) + { + pg_plan_advice_advisor_hook hook = lfirst(lc); + char *advice_string; + + advice_string = (*hook) (glob, parse, query_string, cursorOptions, es); + if (advice_string != NULL) + return advice_string; + } + + /* Otherwise, just use the value of the GUC. */ + return pg_plan_advice_advice; +} + +/* + * Add an advisor, which can supply advice strings to be used during future + * query planning operations. + * + * The advisor should return NULL if it has no advice string to offer for a + * given query. If multiple advisors are added, they will be consulted in the + * order added until one of them returns a non-NULL value. + */ +void +pg_plan_advice_add_advisor(pg_plan_advice_advisor_hook hook) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + advisor_hook_list = lappend(advisor_hook_list, hook); + MemoryContextSwitchTo(oldcontext); +} + +/* + * Remove an advisor. + */ +void +pg_plan_advice_remove_advisor(pg_plan_advice_advisor_hook hook) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + advisor_hook_list = list_delete_ptr(advisor_hook_list, hook); + MemoryContextSwitchTo(oldcontext); +} + +/* + * Handler for EXPLAIN (PLAN_ADVICE). 
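+ *
+ * Illustrative usage (a sketch, not taken from the regression tests):
+ * EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT ... invokes this handler once
+ * for the PLAN_ADVICE option; the parsed boolean is stashed in the
+ * ExplainState via the extension-state mechanism.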
+ */ +static void +pg_plan_advice_explain_option_handler(ExplainState *es, DefElem *opt, + ParseState *pstate) +{ + bool *plan_advice; + + plan_advice = GetExplainExtensionState(es, es_extension_id); + + if (plan_advice == NULL) + { + plan_advice = palloc0_object(bool); + SetExplainExtensionState(es, es_extension_id, plan_advice); + } + + *plan_advice = defGetBoolean(opt); +} + +/* + * Display a string that is likely to consist of multiple lines in EXPLAIN + * output. + */ +static void +pg_plan_advice_explain_text_multiline(ExplainState *es, char *qlabel, + char *value) +{ + char *s; + + /* For non-text formats, it's best not to add any special handling. */ + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyText(qlabel, value, es); + return; + } + + /* In text format, if there is no data, display nothing. */ + if (*value == '\0') + return; + + /* + * It looks nicest to indent each line of the advice separately, beginning + * on the line below the label. + */ + ExplainIndentText(es); + appendStringInfo(es->str, "%s:\n", qlabel); + es->indent++; + while ((s = strchr(value, '\n')) != NULL) + { + ExplainIndentText(es); + appendBinaryStringInfo(es->str, value, (s - value) + 1); + value = s + 1; + } + + /* Don't interpret a terminal newline as a request for an empty line. */ + if (*value != '\0') + { + ExplainIndentText(es); + appendStringInfo(es->str, "%s\n", value); + } + + es->indent--; +} + +/* + * Add advice feedback to the EXPLAIN output. + */ +static void +pg_plan_advice_explain_feedback(ExplainState *es, List *feedback) +{ + StringInfoData buf; + + initStringInfo(&buf); + foreach_node(DefElem, item, feedback) + { + int flags = defGetInt32(item); + + appendStringInfo(&buf, "%s /* ", item->defname); + pgpa_trove_append_flags(&buf, flags); + appendStringInfo(&buf, " */\n"); + } + + pg_plan_advice_explain_text_multiline(es, "Supplied Plan Advice", + buf.data); +} + +/* + * Add relevant details, if any, to the EXPLAIN output for a single plan. + */ +static void +pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv) +{ + bool should_explain; + DefElem *pgpa_item; + List *pgpa_list; + + if (prev_explain_per_plan) + prev_explain_per_plan(plannedstmt, into, es, queryString, params, + queryEnv); + + /* Should an advice string be part of the EXPLAIN output? */ + should_explain = pg_plan_advice_should_explain(es); + + /* Find any data pgpa_planner_shutdown stashed in the PlannedStmt. */ + pgpa_item = find_defelem_by_defname(plannedstmt->extension_state, + "pg_plan_advice"); + pgpa_list = pgpa_item == NULL ? NULL : (List *) pgpa_item->arg; + + /* + * By default, if there is a record of attempting to apply advice during + * query planning, we always output that information, but the user can set + * pg_plan_advice.always_explain_supplied_advice = false to suppress that + * behavior. If they do, we'll only display it when the PLAN_ADVICE option + * was specified and not set to false. + * + * NB: If we're explaining a query planned beforehand -- i.e. a prepared + * statement -- the application of query advice may not have been + * recorded, and therefore this won't be able to show anything. Use + * pg_plan_advice.always_store_advice_details = true to work around this. 
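+ *
+ * (Illustrative sequence, assuming a prepared statement q: SET
+ * pg_plan_advice.always_store_advice_details = true; PREPARE q AS ...;
+ * EXPLAIN (PLAN_ADVICE) EXECUTE q; should then have details to show.)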
+ */
+ if (pgpa_list != NULL && (pg_plan_advice_always_explain_supplied_advice ||
+ should_explain))
+ {
+ DefElem *feedback;
+
+ feedback = find_defelem_by_defname(pgpa_list, "feedback");
+ if (feedback != NULL)
+ pg_plan_advice_explain_feedback(es, (List *) feedback->arg);
+ }
+
+ /*
+ * If the PLAN_ADVICE option was specified -- and not set to FALSE --
+ * show generated advice.
+ */
+ if (should_explain)
+ {
+ DefElem *advice_string_item;
+ char *advice_string = NULL;
+
+ advice_string_item =
+ find_defelem_by_defname(pgpa_list, "advice_string");
+ if (advice_string_item != NULL)
+ {
+ advice_string = strVal(advice_string_item->arg);
+ pg_plan_advice_explain_text_multiline(es, "Generated Plan Advice",
+ advice_string);
+ }
+ }
+}
+
+/*
+ * Check hook for pg_plan_advice.advice
+ */
+static bool
+pg_plan_advice_advice_check_hook(char **newval, void **extra, GucSource source)
+{
+ MemoryContext oldcontext;
+ MemoryContext tmpcontext;
+ char *error;
+
+ if (*newval == NULL)
+ return true;
+
+ tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "pg_plan_advice.advice",
+ ALLOCSET_DEFAULT_SIZES);
+ oldcontext = MemoryContextSwitchTo(tmpcontext);
+
+ /*
+ * It would be nice to save the parse tree that we construct here for
+ * eventual use when planning with this advice, but *extra can only point
+ * to a single guc_malloc'd chunk, and our parse tree involves an
+ * arbitrary number of memory allocations.
+ */
+ (void) pgpa_parse(*newval, &error);
+
+ if (error != NULL)
+ {
+ GUC_check_errdetail("Could not parse advice: %s", error);
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(tmpcontext);
+ return false;
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(tmpcontext);
+
+ return true;
+}
+
+/*
+ * Search a list of DefElem objects for a given defname.
+ */
+static DefElem *
+find_defelem_by_defname(List *deflist, char *defname)
+{
+ foreach_node(DefElem, item, deflist)
+ {
+ if (strcmp(item->defname, defname) == 0)
+ return item;
+ }
+
+ return NULL;
+}
diff --git a/contrib/pg_plan_advice/pg_plan_advice.control b/contrib/pg_plan_advice/pg_plan_advice.control
new file mode 100644
index 0000000000..aa6fdc9e7b
--- /dev/null
+++ b/contrib/pg_plan_advice/pg_plan_advice.control
@@ -0,0 +1,5 @@
+# pg_plan_advice extension
+comment = 'help the planner get the right plan'
+default_version = '1.0'
+module_pathname = '$libdir/pg_plan_advice'
+relocatable = true
diff --git a/contrib/pg_plan_advice/pg_plan_advice.h b/contrib/pg_plan_advice/pg_plan_advice.h
new file mode 100644
index 0000000000..21f66092fa
--- /dev/null
+++ b/contrib/pg_plan_advice/pg_plan_advice.h
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_plan_advice.h
+ * main header file for pg_plan_advice contrib module
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pg_plan_advice.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_PLAN_ADVICE_H
+#define PG_PLAN_ADVICE_H
+
+#include "commands/explain_state.h"
+#include "nodes/pathnodes.h"
+#include "nodes/plannodes.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+
+typedef struct pgpa_shared_state
+{
+ LWLock lock;
+ int dsa_tranche;
+ dsa_handle area;
+ dsa_pointer shared_collector;
+} pgpa_shared_state;
+
+/* Hook for other plugins to supply advice strings */
+typedef char *(*pg_plan_advice_advisor_hook) (PlannerGlobal *glob,
+ Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ ExplainState *es);
+
+/* GUC variables */
+extern char *pg_plan_advice_advice;
+extern bool pg_plan_advice_always_store_advice_details;
+extern bool pg_plan_advice_feedback_warnings;
+extern bool pg_plan_advice_local_collector;
+extern int pg_plan_advice_local_collection_limit;
+extern bool pg_plan_advice_shared_collector;
+extern int pg_plan_advice_shared_collection_limit;
+extern bool pg_plan_advice_trace_mask;
+
+/* Function prototypes (for use by pg_plan_advice itself) */
+extern MemoryContext pg_plan_advice_get_mcxt(void);
+extern pgpa_shared_state *pg_plan_advice_attach(void);
+extern dsa_area *pg_plan_advice_dsa_area(void);
+extern bool pg_plan_advice_should_explain(ExplainState *es);
+extern char *pg_plan_advice_get_supplied_query_advice(PlannerGlobal *glob,
+ Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ ExplainState *es);
+
+/* Function prototypes (for use by other plugins) */
+extern PGDLLEXPORT void pg_plan_advice_add_advisor(pg_plan_advice_advisor_hook hook);
+extern PGDLLEXPORT void pg_plan_advice_remove_advisor(pg_plan_advice_advisor_hook hook);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_ast.c b/contrib/pg_plan_advice/pgpa_ast.c
new file mode 100644
index 0000000000..85bd74859d
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_ast.c
@@ -0,0 +1,351 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_ast.c
+ * additional supporting code related to plan advice parsing
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_ast.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgpa_ast.h"
+
+#include "funcapi.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+
+static bool pgpa_identifiers_cover_target(int nrids, pgpa_identifier *rids,
+ pgpa_advice_target *target,
+ bool *rids_used);
+
+/*
+ * Get a C string that corresponds to the specified advice tag.
+ */
+char *
+pgpa_cstring_advice_tag(pgpa_advice_tag_type advice_tag)
+{
+ switch (advice_tag)
+ {
+ case PGPA_TAG_BITMAP_HEAP_SCAN:
+ return "BITMAP_HEAP_SCAN";
+ case PGPA_TAG_FOREIGN_JOIN:
+ return "FOREIGN_JOIN";
+ case PGPA_TAG_GATHER:
+ return "GATHER";
+ case PGPA_TAG_GATHER_MERGE:
+ return "GATHER_MERGE";
+ case PGPA_TAG_HASH_JOIN:
+ return "HASH_JOIN";
+ case PGPA_TAG_INDEX_ONLY_SCAN:
+ return "INDEX_ONLY_SCAN";
+ case PGPA_TAG_INDEX_SCAN:
+ return "INDEX_SCAN";
+ case PGPA_TAG_JOIN_ORDER:
+ return "JOIN_ORDER";
+ case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
+ return "MERGE_JOIN_MATERIALIZE";
+ case PGPA_TAG_MERGE_JOIN_PLAIN:
+ return "MERGE_JOIN_PLAIN";
+ case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
+ return "NESTED_LOOP_MATERIALIZE";
+ case PGPA_TAG_NESTED_LOOP_MEMOIZE:
+ return "NESTED_LOOP_MEMOIZE";
+ case PGPA_TAG_NESTED_LOOP_PLAIN:
+ return "NESTED_LOOP_PLAIN";
+ case PGPA_TAG_NO_GATHER:
+ return "NO_GATHER";
+ case PGPA_TAG_PARTITIONWISE:
+ return "PARTITIONWISE";
+ case PGPA_TAG_SEMIJOIN_NON_UNIQUE:
+ return "SEMIJOIN_NON_UNIQUE";
+ case PGPA_TAG_SEMIJOIN_UNIQUE:
+ return "SEMIJOIN_UNIQUE";
+ case PGPA_TAG_SEQ_SCAN:
+ return "SEQ_SCAN";
+ case PGPA_TAG_TID_SCAN:
+ return "TID_SCAN";
+ }
+
+ pg_unreachable();
+ return NULL;
+}
+
+/*
+ * Convert an advice tag, formatted as a string that has already been
+ * downcased as appropriate, to a pgpa_advice_tag_type.
+ *
+ * If we succeed, set *fail = false and return the result; if we fail,
+ * set *fail = true and return an arbitrary value.
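+ *
+ * For example (illustrative): pgpa_parse_advice_tag("seq_scan", &fail)
+ * returns PGPA_TAG_SEQ_SCAN with *fail = false, while an unrecognized tag
+ * such as "fast_scan" sets *fail = true.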
+ */ +pgpa_advice_tag_type +pgpa_parse_advice_tag(const char *tag, bool *fail) +{ + *fail = false; + + switch (tag[0]) + { + case 'b': + if (strcmp(tag, "bitmap_heap_scan") == 0) + return PGPA_TAG_BITMAP_HEAP_SCAN; + break; + case 'f': + if (strcmp(tag, "foreign_join") == 0) + return PGPA_TAG_FOREIGN_JOIN; + break; + case 'g': + if (strcmp(tag, "gather") == 0) + return PGPA_TAG_GATHER; + if (strcmp(tag, "gather_merge") == 0) + return PGPA_TAG_GATHER_MERGE; + break; + case 'h': + if (strcmp(tag, "hash_join") == 0) + return PGPA_TAG_HASH_JOIN; + break; + case 'i': + if (strcmp(tag, "index_scan") == 0) + return PGPA_TAG_INDEX_SCAN; + if (strcmp(tag, "index_only_scan") == 0) + return PGPA_TAG_INDEX_ONLY_SCAN; + break; + case 'j': + if (strcmp(tag, "join_order") == 0) + return PGPA_TAG_JOIN_ORDER; + break; + case 'm': + if (strcmp(tag, "merge_join_materialize") == 0) + return PGPA_TAG_MERGE_JOIN_MATERIALIZE; + if (strcmp(tag, "merge_join_plain") == 0) + return PGPA_TAG_MERGE_JOIN_PLAIN; + break; + case 'n': + if (strcmp(tag, "nested_loop_materialize") == 0) + return PGPA_TAG_NESTED_LOOP_MATERIALIZE; + if (strcmp(tag, "nested_loop_memoize") == 0) + return PGPA_TAG_NESTED_LOOP_MEMOIZE; + if (strcmp(tag, "nested_loop_plain") == 0) + return PGPA_TAG_NESTED_LOOP_PLAIN; + if (strcmp(tag, "no_gather") == 0) + return PGPA_TAG_NO_GATHER; + break; + case 'p': + if (strcmp(tag, "partitionwise") == 0) + return PGPA_TAG_PARTITIONWISE; + break; + case 's': + if (strcmp(tag, "semijoin_non_unique") == 0) + return PGPA_TAG_SEMIJOIN_NON_UNIQUE; + if (strcmp(tag, "semijoin_unique") == 0) + return PGPA_TAG_SEMIJOIN_UNIQUE; + if (strcmp(tag, "seq_scan") == 0) + return PGPA_TAG_SEQ_SCAN; + break; + case 't': + if (strcmp(tag, "tid_scan") == 0) + return PGPA_TAG_TID_SCAN; + break; + } + + /* didn't work out */ + *fail = true; + + /* return an arbitrary value to unwind the call stack */ + return PGPA_TAG_SEQ_SCAN; +} + +/* + * Format a pgpa_advice_target as a string and append result to a StringInfo. + */ +void +pgpa_format_advice_target(StringInfo str, pgpa_advice_target *target) +{ + if (target->ttype != PGPA_TARGET_IDENTIFIER) + { + bool first = true; + char *delims; + + if (target->ttype == PGPA_TARGET_UNORDERED_LIST) + delims = "{}"; + else + delims = "()"; + + appendStringInfoChar(str, delims[0]); + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + if (first) + first = false; + else + appendStringInfoChar(str, ' '); + pgpa_format_advice_target(str, child_target); + } + appendStringInfoChar(str, delims[1]); + } + else + { + const char *rt_identifier; + + rt_identifier = pgpa_identifier_string(&target->rid); + appendStringInfoString(str, rt_identifier); + } +} + +/* + * Format a pgpa_index_target as a string and append result to a StringInfo. + */ +void +pgpa_format_index_target(StringInfo str, pgpa_index_target *itarget) +{ + if (itarget->indnamespace != NULL) + appendStringInfo(str, "%s.", + quote_identifier(itarget->indnamespace)); + appendStringInfoString(str, quote_identifier(itarget->indname)); +} + +/* + * Determine whether two pgpa_index_target objects are exactly identical. 
+ */
+bool
+pgpa_index_targets_equal(pgpa_index_target *i1, pgpa_index_target *i2)
+{
+ /* indnamespace can be NULL, and two NULL values are equal */
+ if ((i1->indnamespace != NULL || i2->indnamespace != NULL) &&
+ (i1->indnamespace == NULL || i2->indnamespace == NULL ||
+ strcmp(i1->indnamespace, i2->indnamespace) != 0))
+ return false;
+ if (strcmp(i1->indname, i2->indname) != 0)
+ return false;
+
+ return true;
+}
+
+/*
+ * Check whether an identifier matches any part of an advice target.
+ */
+bool
+pgpa_identifier_matches_target(pgpa_identifier *rid, pgpa_advice_target *target)
+{
+ /* For non-identifiers, check all descendants. */
+ if (target->ttype != PGPA_TARGET_IDENTIFIER)
+ {
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ if (pgpa_identifier_matches_target(rid, child_target))
+ return true;
+ }
+ return false;
+ }
+
+ /* Straightforward comparisons of alias name and occurrence number. */
+ if (strcmp(rid->alias_name, target->rid.alias_name) != 0)
+ return false;
+ if (rid->occurrence != target->rid.occurrence)
+ return false;
+
+ /*
+ * If a relation identifier mentions a partition name, it should also
+ * specify a partition schema. But the target may leave the schema NULL to
+ * match anything.
+ */
+ Assert(rid->partnsp != NULL || rid->partrel == NULL);
+ if (rid->partnsp != NULL && target->rid.partnsp != NULL &&
+ strcmp(rid->partnsp, target->rid.partnsp) != 0)
+ return false;
+
+ /*
+ * These fields can be NULL on either side, but NULL only matches another
+ * NULL.
+ */
+ if (!strings_equal_or_both_null(rid->partrel, target->rid.partrel))
+ return false;
+ if (!strings_equal_or_both_null(rid->plan_name, target->rid.plan_name))
+ return false;
+
+ return true;
+}
+
+/*
+ * Match identifiers to advice targets and return an enum value indicating
+ * the relationship between the set of keys and the set of targets.
+ *
+ * See the comments for pgpa_itm_type.
+ */
+pgpa_itm_type
+pgpa_identifiers_match_target(int nrids, pgpa_identifier *rids,
+ pgpa_advice_target *target)
+{
+ bool all_rids_used = true;
+ bool any_rids_used = false;
+ bool all_targets_used;
+ bool *rids_used = palloc0_array(bool, nrids);
+
+ all_targets_used =
+ pgpa_identifiers_cover_target(nrids, rids, target, rids_used);
+
+ for (int i = 0; i < nrids; ++i)
+ {
+ if (rids_used[i])
+ any_rids_used = true;
+ else
+ all_rids_used = false;
+ }
+
+ if (all_rids_used)
+ {
+ if (all_targets_used)
+ return PGPA_ITM_EQUAL;
+ else
+ return PGPA_ITM_KEYS_ARE_SUBSET;
+ }
+ else
+ {
+ if (all_targets_used)
+ return PGPA_ITM_TARGETS_ARE_SUBSET;
+ else if (any_rids_used)
+ return PGPA_ITM_INTERSECTING;
+ else
+ return PGPA_ITM_DISJOINT;
+ }
+}
+
+/*
+ * Returns true if every target or sub-target is matched by at least one
+ * identifier, and otherwise false.
+ *
+ * Also sets rids_used[i] = true for each identifier that matches at least one
+ * target.
+ */
+static bool
+pgpa_identifiers_cover_target(int nrids, pgpa_identifier *rids,
+ pgpa_advice_target *target, bool *rids_used)
+{
+ bool result = false;
+
+ if (target->ttype != PGPA_TARGET_IDENTIFIER)
+ {
+ result = true;
+
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ if (!pgpa_identifiers_cover_target(nrids, rids, child_target,
+ rids_used))
+ result = false;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nrids; ++i)
+ {
+ if (pgpa_identifier_matches_target(&rids[i], target))
+ {
+ rids_used[i] = true;
+ result = true;
+ }
+ }
+ }
+
+ return result;
+}
diff --git a/contrib/pg_plan_advice/pgpa_ast.h b/contrib/pg_plan_advice/pgpa_ast.h
new file mode 100644
index 0000000000..5d3f8d58a7
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_ast.h
@@ -0,0 +1,185 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_ast.h
+ * abstract syntax trees for plan advice, plus parser/scanner support
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_ast.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_AST_H
+#define PGPA_AST_H
+
+#include "pgpa_identifier.h"
+
+#include "nodes/pg_list.h"
+
+/*
+ * Advice items generally take the form SOME_TAG(item [...]), where an item
+ * can take various forms. The simplest case is a relation identifier, but
+ * some tags allow sublists, and JOIN_ORDER() allows both ordered and unordered
+ * sublists.
+ */
+typedef enum
+{
+ PGPA_TARGET_IDENTIFIER, /* relation identifier */
+ PGPA_TARGET_ORDERED_LIST, /* (item ...) */
+ PGPA_TARGET_UNORDERED_LIST /* {item ...} */
+} pgpa_target_type;
+
+/*
+ * An index specification.
+ */
+typedef struct pgpa_index_target
+{
+ /* Index schema and name */
+ char *indnamespace;
+ char *indname;
+} pgpa_index_target;
+
+/*
+ * A single item about which advice is being given, which could be either
+ * a relation identifier that we want to break out into its constituent fields,
+ * or a sublist of some kind.
+ */
+typedef struct pgpa_advice_target
+{
+ pgpa_target_type ttype;
+
+ /*
+ * This field is meaningful when ttype is PGPA_TARGET_IDENTIFIER.
+ *
+ * All identifiers must have an alias name and an occurrence number; the
+ * remaining fields can be NULL. Note that it's possible to specify a
+ * partition name without a partition schema, but not the reverse.
+ */
+ pgpa_identifier rid;
+
+ /*
+ * This field is set when ttype is PGPA_TARGET_IDENTIFIER and the advice
+ * tag is PGPA_TAG_INDEX_SCAN or PGPA_TAG_INDEX_ONLY_SCAN.
+ */
+ pgpa_index_target *itarget;
+
+ /*
+ * When the ttype is PGPA_TARGET_ORDERED_LIST or
+ * PGPA_TARGET_UNORDERED_LIST, this field contains a list of additional
+ * pgpa_advice_target objects. Otherwise, it is unused.
+ */
+ List *children;
+} pgpa_advice_target;
+
+/*
+ * These are all the kinds of advice that we know how to parse. If a keyword
+ * is found at the top level, it must be in this list.
+ *
+ * If you change anything here, also update pgpa_parse_advice_tag and
+ * pgpa_cstring_advice_tag.
+ */
+typedef enum pgpa_advice_tag_type
+{
+ PGPA_TAG_BITMAP_HEAP_SCAN,
+ PGPA_TAG_FOREIGN_JOIN,
+ PGPA_TAG_GATHER,
+ PGPA_TAG_GATHER_MERGE,
+ PGPA_TAG_HASH_JOIN,
+ PGPA_TAG_INDEX_ONLY_SCAN,
+ PGPA_TAG_INDEX_SCAN,
+ PGPA_TAG_JOIN_ORDER,
+ PGPA_TAG_MERGE_JOIN_MATERIALIZE,
+ PGPA_TAG_MERGE_JOIN_PLAIN,
+ PGPA_TAG_NESTED_LOOP_MATERIALIZE,
+ PGPA_TAG_NESTED_LOOP_MEMOIZE,
+ PGPA_TAG_NESTED_LOOP_PLAIN,
+ PGPA_TAG_NO_GATHER,
+ PGPA_TAG_PARTITIONWISE,
+ PGPA_TAG_SEMIJOIN_NON_UNIQUE,
+ PGPA_TAG_SEMIJOIN_UNIQUE,
+ PGPA_TAG_SEQ_SCAN,
+ PGPA_TAG_TID_SCAN
+} pgpa_advice_tag_type;
+
+/*
+ * An item of advice, meaning a tag and the list of all targets to which
+ * it is being applied.
+ *
+ * "targets" is a list of pgpa_advice_target objects.
+ *
+ * The List returned from pgpa_yyparse is a list of pgpa_advice_item objects.
+ */
+typedef struct pgpa_advice_item
+{
+ pgpa_advice_tag_type tag;
+ List *targets;
+} pgpa_advice_item;
+
+/*
+ * Result of comparing an array of pgpa_identifier objects to a
+ * pgpa_advice_target.
+ *
+ * PGPA_ITM_EQUAL means all targets are matched by some identifier, and
+ * all identifiers were matched to a target.
+ *
+ * PGPA_ITM_KEYS_ARE_SUBSET means that all identifiers matched to a target,
+ * but there were leftover targets. Generally, this means that the advice is
+ * looking to apply to all of the rels we have plus some additional ones that
+ * we don't have.
+ *
+ * PGPA_ITM_TARGETS_ARE_SUBSET means that all targets are matched by an
+ * identifier, but there were leftover identifiers. Generally, this means
+ * that the advice is looking to apply to some but not all of the rels we have.
+ *
+ * PGPA_ITM_INTERSECTING means that some identifiers and targets were matched,
+ * but neither all identifiers nor all targets could be matched to items in
+ * the other set.
+ *
+ * PGPA_ITM_DISJOINT means that no matches between identifiers and targets were
+ * found.
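+ *
+ * A few illustrative cases: identifiers {a, b} against target (a b c)
+ * yield PGPA_ITM_KEYS_ARE_SUBSET; identifiers {a, b, c} against target
+ * (a b) yield PGPA_ITM_TARGETS_ARE_SUBSET; {a} against (b c) is
+ * PGPA_ITM_DISJOINT.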
+ */ +typedef enum +{ + PGPA_ITM_EQUAL, + PGPA_ITM_KEYS_ARE_SUBSET, + PGPA_ITM_TARGETS_ARE_SUBSET, + PGPA_ITM_INTERSECTING, + PGPA_ITM_DISJOINT +} pgpa_itm_type; + +/* for pgpa_scanner.l and pgpa_parser.y */ +union YYSTYPE; +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void *yyscan_t; +#endif + +/* in pgpa_scanner.l */ +extern int pgpa_yylex(union YYSTYPE *yylval_param, List **result, + char **parse_error_msg_p, yyscan_t yyscanner); +extern void pgpa_yyerror(List **result, char **parse_error_msg_p, + yyscan_t yyscanner, + const char *message); +extern void pgpa_scanner_init(const char *str, yyscan_t *yyscannerp); +extern void pgpa_scanner_finish(yyscan_t yyscanner); + +/* in pgpa_parser.y */ +extern int pgpa_yyparse(List **result, char **parse_error_msg_p, + yyscan_t yyscanner); +extern List *pgpa_parse(const char *advice_string, char **error_p); + +/* in pgpa_ast.c */ +extern char *pgpa_cstring_advice_tag(pgpa_advice_tag_type advice_tag); +extern bool pgpa_identifier_matches_target(pgpa_identifier *rid, + pgpa_advice_target *target); +extern pgpa_itm_type pgpa_identifiers_match_target(int nrids, + pgpa_identifier *rids, + pgpa_advice_target *target); +extern bool pgpa_index_targets_equal(pgpa_index_target *i1, + pgpa_index_target *i2); +extern pgpa_advice_tag_type pgpa_parse_advice_tag(const char *tag, bool *fail); +extern void pgpa_format_advice_target(StringInfo str, + pgpa_advice_target *target); +extern void pgpa_format_index_target(StringInfo str, + pgpa_index_target *itarget); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_collector.c b/contrib/pg_plan_advice/pgpa_collector.c new file mode 100644 index 0000000000..a0b0d7e159 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_collector.c @@ -0,0 +1,639 @@ +/*------------------------------------------------------------------------- + * + * pgpa_collector.c + * collect advice into backend-local or shared memory + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_collector.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_collector.h" + +#include "datatype/timestamp.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/timestamp.h" + +PG_FUNCTION_INFO_V1(pg_clear_collected_local_advice); +PG_FUNCTION_INFO_V1(pg_clear_collected_shared_advice); +PG_FUNCTION_INFO_V1(pg_get_collected_local_advice); +PG_FUNCTION_INFO_V1(pg_get_collected_shared_advice); + +#define ADVICE_CHUNK_SIZE 1024 +#define ADVICE_CHUNK_ARRAY_SIZE 64 + +#define PG_GET_ADVICE_COLUMNS 7 + +/* + * Advice extracted from one query plan, together with the query string + * and various other identifying details. + */ +typedef struct pgpa_collected_advice +{ + Oid userid; /* user OID */ + Oid dbid; /* database OID */ + uint64 queryid; /* query identifier */ + TimestampTz timestamp; /* query timestamp */ + int advice_offset; /* start of advice in textual data */ + char textual_data[FLEXIBLE_ARRAY_MEMBER]; +} pgpa_collected_advice; + +/* + * A bunch of pointers to pgpa_collected_advice objects, stored in + * backend-local memory. + */ +typedef struct pgpa_local_advice_chunk +{ + pgpa_collected_advice *entries[ADVICE_CHUNK_SIZE]; +} pgpa_local_advice_chunk; + +/* + * Information about all of the pgpa_collected_advice objects that we're + * storing in local memory. 
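+ *
+ * (An illustrative sketch of the chunked layout described below: with
+ * ADVICE_CHUNK_SIZE = 1024 and base_id = 0, the entry with id 2500 lives
+ * at chunks[2]->entries[452].)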
+ * + * We assign consecutive IDs, starting from 0, to each pgpa_collected_advice + * object that we store. The actual storage is an array of chunks, which + * helps keep memcpy() overhead low when we start discarding older data. + */ +typedef struct pgpa_local_advice +{ + uint64 next_id; + uint64 oldest_id; + uint64 base_id; + int chunk_array_allocated_size; + pgpa_local_advice_chunk **chunks; +} pgpa_local_advice; + +/* + * Just like pgpa_local_advice_chunk, but stored in a dynamic shared area, + * so we must use dsa_pointer instead of native pointers. + */ +typedef struct pgpa_shared_advice_chunk +{ + dsa_pointer entries[ADVICE_CHUNK_SIZE]; +} pgpa_shared_advice_chunk; + +/* + * Just like pgpa_local_advice, but stored in a dynamic shared area, so + * we must use dsa_pointer instead of native pointers. + */ +typedef struct pgpa_shared_advice +{ + uint64 next_id; + uint64 oldest_id; + uint64 base_id; + int chunk_array_allocated_size; + dsa_pointer chunks; +} pgpa_shared_advice; + +/* Pointers to local and shared collectors */ +static pgpa_local_advice *local_collector = NULL; +static pgpa_shared_advice *shared_collector = NULL; + +/* Static functions */ +static pgpa_collected_advice *pgpa_make_collected_advice(Oid userid, + Oid dbid, + uint64 queryId, + TimestampTz timestamp, + const char *query_string, + const char *advice_string, + dsa_area *area, + dsa_pointer *result); +static void pgpa_store_local_advice(pgpa_collected_advice *ca); +static void pgpa_trim_local_advice(int limit); +static void pgpa_store_shared_advice(dsa_pointer ca_pointer); +static void pgpa_trim_shared_advice(dsa_area *area, int limit); + +/* Helper function to extract the query string from pgpa_collected_advice */ +static inline const char * +query_string(pgpa_collected_advice *ca) +{ + return ca->textual_data; +} + +/* Helper function to extract the advice string from pgpa_collected_advice */ +static inline const char * +advice_string(pgpa_collected_advice *ca) +{ + return ca->textual_data + ca->advice_offset; +} + +/* + * Store collected query advice into the local or shared advice collector, + * as appropriate. + */ +void +pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string) +{ + Oid userid = GetUserId(); + Oid dbid = MyDatabaseId; + TimestampTz now = GetCurrentTimestamp(); + + if (pg_plan_advice_local_collector && + pg_plan_advice_local_collection_limit > 0) + { + pgpa_collected_advice *ca; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + ca = pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, + NULL, NULL); + pgpa_store_local_advice(ca); + MemoryContextSwitchTo(oldcontext); + } + + if (pg_plan_advice_shared_collector && + pg_plan_advice_shared_collection_limit > 0) + { + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer ca_pointer = InvalidDsaPointer; /* placate compiler */ + + pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, area, + &ca_pointer); + pgpa_store_shared_advice(ca_pointer); + } +} + +/* + * Allocate and fill a new pgpa_collected_advice object. + * + * If area != NULL, it is used to allocate the new object, and the resulting + * dsa_pointer is returned via *result. + * + * If area == NULL, the new object is allocated in the current memory context, + * and result is not examined or modified. 
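+ *
+ * Layout sketch (illustrative): for query "SELECT 1" and advice
+ * "SEQ_SCAN(t)", textual_data holds both NUL-terminated strings back to
+ * back, and advice_offset is strlen("SELECT 1") + 1 = 9.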
+ */
+static pgpa_collected_advice *
+pgpa_make_collected_advice(Oid userid, Oid dbid, uint64 queryId,
+ TimestampTz timestamp,
+ const char *query_string,
+ const char *advice_string,
+ dsa_area *area, dsa_pointer *result)
+{
+ size_t query_string_length = strlen(query_string) + 1;
+ size_t advice_string_length = strlen(advice_string) + 1;
+ size_t total_length;
+ pgpa_collected_advice *ca;
+
+ total_length = offsetof(pgpa_collected_advice, textual_data) +
+ query_string_length + advice_string_length;
+
+ if (area == NULL)
+ ca = palloc(total_length);
+ else
+ {
+ *result = dsa_allocate(area, total_length);
+ ca = dsa_get_address(area, *result);
+ }
+
+ ca->userid = userid;
+ ca->dbid = dbid;
+ ca->queryid = queryId;
+ ca->timestamp = timestamp;
+ ca->advice_offset = query_string_length;
+
+ memcpy(ca->textual_data, query_string, query_string_length);
+ memcpy(&ca->textual_data[ca->advice_offset],
+ advice_string, advice_string_length);
+
+ return ca;
+}
+
+/*
+ * Add a pgpa_collected_advice object to our backend-local advice collection.
+ *
+ * Caller is responsible for switching to the appropriate memory context;
+ * the provided object should have been allocated in that same context.
+ */
+static void
+pgpa_store_local_advice(pgpa_collected_advice *ca)
+{
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_local_advice *la = local_collector;
+
+ /* If the local advice collector isn't initialized yet, do that now. */
+ if (la == NULL)
+ {
+ la = palloc0(sizeof(pgpa_local_advice));
+ la->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE;
+ la->chunks = palloc0_array(pgpa_local_advice_chunk *,
+ la->chunk_array_allocated_size);
+ local_collector = la;
+ }
+
+ /* Compute chunk and offset at which to store this advice. */
+ chunk_number = (la->next_id - la->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (la->next_id - la->base_id) % ADVICE_CHUNK_SIZE;
+
+ /* Extend chunk array, if needed. */
+ if (chunk_number >= la->chunk_array_allocated_size)
+ {
+ int new_size;
+
+ new_size = la->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE;
+ la->chunks = repalloc0_array(la->chunks,
+ pgpa_local_advice_chunk *,
+ la->chunk_array_allocated_size,
+ new_size);
+ la->chunk_array_allocated_size = new_size;
+ }
+
+ /* Allocate new chunk, if needed. */
+ if (la->chunks[chunk_number] == NULL)
+ la->chunks[chunk_number] = palloc0_object(pgpa_local_advice_chunk);
+
+ /* Save pointer and bump next-id counter. */
+ Assert(la->chunks[chunk_number]->entries[chunk_offset] == NULL);
+ la->chunks[chunk_number]->entries[chunk_offset] = ca;
+ ++la->next_id;
+
+ /* If we've exceeded the storage limit, discard old data. */
+ pgpa_trim_local_advice(pg_plan_advice_local_collection_limit);
+}
+
+/*
+ * Add a pgpa_collected_advice object to the shared advice collection.
+ *
+ * 'ca_pointer' should have been allocated from the pg_plan_advice DSA area
+ * and should point to an object of type pgpa_collected_advice.
+ */
+static void
+pgpa_store_shared_advice(dsa_pointer ca_pointer)
+{
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_area *area = pg_plan_advice_dsa_area();
+ pgpa_shared_advice *sa = shared_collector;
+ dsa_pointer *chunk_array;
+ pgpa_shared_advice_chunk *chunk;
+
+ /* Lock the shared state. */
+ LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+ /*
+ * If we're not attached to the shared advice collector yet, fix that now.
+ * If we're the first ones to attach, we may need to create the object.
+ */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + state->shared_collector = + dsa_allocate0(area, sizeof(pgpa_shared_advice)); + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* + * It's possible that some other backend may have succeeded in creating + * the main collector object but failed to allocate an initial chunk + * array, so we must be prepared to allocate the chunk array here whether + * or not we created the collector object. + */ + if (shared_collector->chunk_array_allocated_size == 0) + { + sa->chunks = + dsa_allocate0(area, + sizeof(dsa_pointer) * ADVICE_CHUNK_ARRAY_SIZE); + sa->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (sa->next_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->next_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + /* Get the address of the chunk array and, if needed, extend it. */ + if (chunk_number >= sa->chunk_array_allocated_size) + { + int new_size; + dsa_pointer new_chunks; + + /* + * DSA can't enlarge an existing allocation, so we must make a new + * allocation and copy data over. + */ + new_size = sa->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + new_chunks = dsa_allocate0(area, sizeof(dsa_pointer) * new_size); + chunk_array = dsa_get_address(area, new_chunks); + memcpy(chunk_array, dsa_get_address(area, sa->chunks), + sizeof(dsa_pointer) * sa->chunk_array_allocated_size); + dsa_free(area, sa->chunks); + sa->chunks = new_chunks; + sa->chunk_array_allocated_size = new_size; + } + else + chunk_array = dsa_get_address(area, sa->chunks); + + /* Get the address of the desired chunk, allocating it if needed. */ + if (chunk_array[chunk_number] == InvalidDsaPointer) + chunk_array[chunk_number] = + dsa_allocate0(area, sizeof(pgpa_shared_advice_chunk)); + chunk = dsa_get_address(area, chunk_array[chunk_number]); + + /* Save pointer and bump next-id counter. */ + Assert(chunk->entries[chunk_offset] == InvalidDsaPointer); + chunk->entries[chunk_offset] = ca_pointer; + ++sa->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_shared_advice(area, pg_plan_advice_shared_collection_limit); + + /* Release lock on shared state. */ + LWLockRelease(&state->lock); +} + +/* + * Discard collected advice stored in backend-local memory in excess of the + * specified limit. + */ +static void +pgpa_trim_local_advice(int limit) +{ + pgpa_local_advice *la = local_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = la->next_id - la->oldest_id; + if (current_count <= limit) + return; + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + + chunk_number = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->oldest_id - la->base_id) % ADVICE_CHUNK_SIZE; + + Assert(la->chunks[chunk_number]->entries[chunk_offset] != NULL); + pfree(la->chunks[chunk_number]->entries[chunk_offset]); + la->chunks[chunk_number]->entries[chunk_offset] = NULL; + ++la->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. 
*/
+ trim_chunk_count = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE;
+ for (uint64 n = 0; n < trim_chunk_count; ++n)
+ pfree(la->chunks[n]);
+
+ /* Slide remaining chunk pointers back toward the base of the array. */
+ total_chunk_count = (la->next_id - la->base_id +
+ ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE;
+ remaining_chunk_count = total_chunk_count - trim_chunk_count;
+ if (remaining_chunk_count > 0)
+ memmove(&la->chunks[0], &la->chunks[trim_chunk_count],
+ sizeof(pgpa_local_advice_chunk *) * remaining_chunk_count);
+
+ /* Don't leave stale pointers around. */
+ memset(&la->chunks[remaining_chunk_count], 0,
+ sizeof(pgpa_local_advice_chunk *)
+ * (total_chunk_count - remaining_chunk_count));
+
+ /* Adjust base ID value accordingly. */
+ la->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE;
+}
+
+/*
+ * Discard collected advice stored in shared memory in excess of the
+ * specified limit.
+ */
+static void
+pgpa_trim_shared_advice(dsa_area *area, int limit)
+{
+ pgpa_shared_advice *sa = shared_collector;
+ uint64 current_count;
+ uint64 trim_count;
+ uint64 total_chunk_count;
+ uint64 trim_chunk_count;
+ uint64 remaining_chunk_count;
+ dsa_pointer *chunk_array;
+
+ /* If we haven't yet reached the limit, there's nothing to do. */
+ current_count = sa->next_id - sa->oldest_id;
+ if (current_count <= limit)
+ return;
+
+ /* Get a pointer to the chunk array. */
+ chunk_array = dsa_get_address(area, sa->chunks);
+
+ /* Free enough entries to get us back down to the limit. */
+ trim_count = current_count - limit;
+ while (trim_count > 0)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_shared_advice_chunk *chunk;
+
+ chunk_number = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (sa->oldest_id - sa->base_id) % ADVICE_CHUNK_SIZE;
+
+ chunk = dsa_get_address(area, chunk_array[chunk_number]);
+ Assert(chunk->entries[chunk_offset] != InvalidDsaPointer);
+ dsa_free(area, chunk->entries[chunk_offset]);
+ chunk->entries[chunk_offset] = InvalidDsaPointer;
+ ++sa->oldest_id;
+ --trim_count;
+ }
+
+ /* Free any chunks that are now entirely unused. */
+ trim_chunk_count = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ for (uint64 n = 0; n < trim_chunk_count; ++n)
+ dsa_free(area, chunk_array[n]);
+
+ /* Slide remaining chunk pointers back toward the base of the array. */
+ total_chunk_count = (sa->next_id - sa->base_id +
+ ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE;
+ remaining_chunk_count = total_chunk_count - trim_chunk_count;
+ if (remaining_chunk_count > 0)
+ memmove(&chunk_array[0], &chunk_array[trim_chunk_count],
+ sizeof(dsa_pointer) * remaining_chunk_count);
+
+ /* Don't leave stale pointers around. */
+ memset(&chunk_array[remaining_chunk_count], 0,
+ sizeof(dsa_pointer)
+ * (total_chunk_count - remaining_chunk_count));
+
+ /* Adjust base ID value accordingly. */
+ sa->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE;
+}
+
+/*
+ * SQL-callable function to discard advice collected in backend-local memory
+ */
+Datum
+pg_clear_collected_local_advice(PG_FUNCTION_ARGS)
+{
+ if (local_collector != NULL)
+ pgpa_trim_local_advice(0);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable function to discard advice collected in shared memory
+ */
+Datum
+pg_clear_collected_shared_advice(PG_FUNCTION_ARGS)
+{
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_area *area = pg_plan_advice_dsa_area();
+
+ LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+ /*
+ * If we're not attached to the shared advice collector yet, fix that now;
+ * but if the collector doesn't even exist, we can return without doing
+ * anything else.
+ */
+ if (shared_collector == NULL)
+ {
+ if (state->shared_collector == InvalidDsaPointer)
+ {
+ LWLockRelease(&state->lock);
+ return (Datum) 0;
+ }
+ shared_collector = dsa_get_address(area, state->shared_collector);
+ }
+
+ /* Do the real work */
+ pgpa_trim_shared_advice(area, 0);
+
+ LWLockRelease(&state->lock);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable SRF to return advice collected in backend-local memory
+ */
+Datum
+pg_get_collected_local_advice(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ pgpa_local_advice *la = local_collector;
+ Oid userid = GetUserId();
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ if (la == NULL)
+ return (Datum) 0;
+
+ /* Loop over all entries. */
+ for (uint64 id = la->oldest_id; id < la->next_id; ++id)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_collected_advice *ca;
+ Datum values[PG_GET_ADVICE_COLUMNS];
+ bool nulls[PG_GET_ADVICE_COLUMNS] = {0};
+
+ chunk_number = (id - la->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (id - la->base_id) % ADVICE_CHUNK_SIZE;
+
+ ca = la->chunks[chunk_number]->entries[chunk_offset];
+
+ if (!member_can_set_role(userid, ca->userid))
+ continue;
+
+ values[0] = UInt64GetDatum(id);
+ values[1] = ObjectIdGetDatum(ca->userid);
+ values[2] = ObjectIdGetDatum(ca->dbid);
+ values[3] = UInt64GetDatum(ca->queryid);
+ values[4] = TimestampGetDatum(ca->timestamp);
+ values[5] = CStringGetTextDatum(query_string(ca));
+ values[6] = CStringGetTextDatum(advice_string(ca));
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+ return (Datum) 0;
+}
+
+/*
+ * SQL-callable SRF to return advice collected in shared memory
+ */
+Datum
+pg_get_collected_shared_advice(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_area *area = pg_plan_advice_dsa_area();
+ dsa_pointer *chunk_array;
+ pgpa_shared_advice *sa = shared_collector;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ /* Lock the shared state. */
+ LWLockAcquire(&state->lock, LW_SHARED);
+
+ /*
+ * If we're not attached to the shared advice collector yet, fix that now;
+ * but if the collector doesn't even exist, we can return without doing
+ * anything else.
+ */
+ if (sa == NULL)
+ {
+ if (state->shared_collector == InvalidDsaPointer)
+ {
+ LWLockRelease(&state->lock);
+ return (Datum) 0;
+ }
+ shared_collector = sa = dsa_get_address(area, state->shared_collector);
+ }
+
+ /* Get a pointer to the chunk array. */
+ chunk_array = dsa_get_address(area, sa->chunks);
+
+ /* Loop over all entries. */
+ for (uint64 id = sa->oldest_id; id < sa->next_id; ++id)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_shared_advice_chunk *chunk;
+ pgpa_collected_advice *ca;
+ Datum values[PG_GET_ADVICE_COLUMNS];
+ bool nulls[PG_GET_ADVICE_COLUMNS] = {0};
+
+ chunk_number = (id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (id - sa->base_id) % ADVICE_CHUNK_SIZE;
+
+ chunk = dsa_get_address(area, chunk_array[chunk_number]);
+ ca = dsa_get_address(area, chunk->entries[chunk_offset]);
+
+ values[0] = UInt64GetDatum(id);
+ values[1] = ObjectIdGetDatum(ca->userid);
+ values[2] = ObjectIdGetDatum(ca->dbid);
+ values[3] = UInt64GetDatum(ca->queryid);
+ values[4] = TimestampGetDatum(ca->timestamp);
+ values[5] = CStringGetTextDatum(query_string(ca));
+ values[6] = CStringGetTextDatum(advice_string(ca));
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+ /* Release lock on shared state. */
+ LWLockRelease(&state->lock);
+
+ return (Datum) 0;
+}
diff --git a/contrib/pg_plan_advice/pgpa_collector.h b/contrib/pg_plan_advice/pgpa_collector.h
new file mode 100644
index 0000000000..b6e746a06d
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_collector.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_collector.h
+ * collect advice into backend-local or shared memory
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_collector.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_COLLECTOR_H
+#define PGPA_COLLECTOR_H
+
+extern void pgpa_collect_advice(uint64 queryId, const char *query_string,
+ const char *advice_string);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_identifier.c b/contrib/pg_plan_advice/pgpa_identifier.c
new file mode 100644
index 0000000000..51b4b0c60a
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_identifier.c
@@ -0,0 +1,476 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_identifier.c
+ * create appropriate identifiers for range table entries
+ *
+ * The goal of this module is to be able to produce identifiers for range
+ * table entries that are unique, understandable to human beings, and
+ * able to be reconstructed during future planning cycles. As an
+ * exception, we do not care about, or want to produce, identifiers for
+ * RTE_JOIN entries. This is because (1) we would end up with a ton of
+ * RTEs with unhelpful names like unnamed_join_17; (2) not all joins have
+ * RTEs; and (3) we intend to refer to joins by their constituent members
+ * rather than by reference to the join RTE.
+ *
+ * In general, we construct identifiers of the following form:
+ *
+ * alias_name#occurrence_number/child_table_name@subquery_name
+ *
+ * However, occurrence_number is omitted when it is the first occurrence
+ * within the same subquery, child_table_name is omitted for relations that
+ * are not child tables, and subquery_name is omitted for the topmost
+ * query level. Whenever an item is omitted, the preceding punctuation mark
+ * is also omitted. Identifier-style escaping is applied to alias_name and
+ * subquery_name. Whenever we include child_table_name, we always use a
+ * schema-qualified name, but users writing their own plan advice are not
+ * required to do so. Identifier-style escaping is applied to the schema
+ * and to the relation names separately.
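+ *
+ * For example (an illustrative identifier, not drawn from a real plan):
+ * the second reference to alias "t", scanning child table public.t_p1
+ * inside a subplan named "s1", would be written t#2/public.t_p1@s1.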
+ * + * The upshot of all of these rules is that in simple cases, the relation + * identifier is textually identical to the alias name, making life easier + * for users. However, even in complex cases, every relation identifier + * for a given query will be unique (or at least we hope so: if not, this + * code is buggy and the identifier format might need to be rethought). + * + * A key goal of this system is that we want to be able to reconstruct the + * same identifiers during a future planning cycle for the same query, so + * that if a certain behavior is specified for a certain identifier, we can + * properly identify the RTI for which that behavior is mandated. In order + * for this to work, subquery names must be unique and known before the + * subquery is planned, and the remainder of the identifier must not depend + * on any part of the query outside of the current subquery level. In + * particular, occurrence_number must be calculated relative to the range + * table for the relevant subquery, not the final flattened range table. + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_identifier.h" + +#include "parser/parsetree.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +static Index *pgpa_create_top_rti_map(Index rtable_length, List *rtable, + List *appinfos); +static int pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti); + +/* + * Create a range table identifier from scratch. + * + * This function leaves the caller to do all the heavy lifting, so it's + * generally better to use one of the functions below instead. + * + * See the file header comments for more details on the format of an + * identifier. + */ +const char * +pgpa_identifier_string(const pgpa_identifier *rid) +{ + const char *result; + + Assert(rid->alias_name != NULL); + result = quote_identifier(rid->alias_name); + + Assert(rid->occurrence >= 0); + if (rid->occurrence > 1) + result = psprintf("%s#%d", result, rid->occurrence); + + if (rid->partrel != NULL) + { + if (rid->partnsp == NULL) + result = psprintf("%s/%s", result, + quote_identifier(rid->partrel)); + else + result = psprintf("%s/%s.%s", result, + quote_identifier(rid->partnsp), + quote_identifier(rid->partrel)); + } + + if (rid->plan_name != NULL) + result = psprintf("%s@%s", result, quote_identifier(rid->plan_name)); + + return result; +} + +/* + * Compute a relation identifier for a particular RTI. + * + * The caller provides root and rti, and gets the necessary details back via + * the remaining parameters. + */ +void +pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, + pgpa_identifier *rid) +{ + Index top_rti = rti; + int occurrence = 1; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + + /* + * If this is a child RTE, find the topmost parent that is still of type + * RTE_RELATION. We do this because we identify children of partitioned + * tables by the name of the child table, but subqueries can also have + * child rels and we don't care about those here. 
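+ *
+ * (E.g., illustratively: for a partition public.t_p1 of partitioned table
+ * t, this walk stops at t's own RTE, so the identifier is built from
+ * alias "t" plus the child table's schema-qualified name.)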
+ */
+ for (;;)
+ {
+ AppendRelInfo *appinfo;
+ RangeTblEntry *parent_rte;
+
+ /* append_rel_array can be NULL if there are no children */
+ if (root->append_rel_array == NULL ||
+ (appinfo = root->append_rel_array[top_rti]) == NULL)
+ break;
+
+ parent_rte = planner_rt_fetch(appinfo->parent_relid, root);
+ if (parent_rte->rtekind != RTE_RELATION)
+ break;
+
+ top_rti = appinfo->parent_relid;
+ }
+
+ /* Get the range table entries for the RTI and top RTI. */
+ rte = planner_rt_fetch(rti, root);
+ top_rte = planner_rt_fetch(top_rti, root);
+ Assert(rte->rtekind != RTE_JOIN);
+ Assert(top_rte->rtekind != RTE_JOIN);
+
+ /* Work out the correct occurrence number. */
+ for (Index prior_rti = 1; prior_rti < top_rti; ++prior_rti)
+ {
+ RangeTblEntry *prior_rte;
+ AppendRelInfo *appinfo;
+
+ /*
+ * If this is a child rel of a parent that is a relation, skip it.
+ *
+ * Such range table entries are disambiguated by mentioning the schema
+ * and name of the table, not by counting them as separate occurrences
+ * of the same table.
+ *
+ * NB: append_rel_array can be NULL if there are no children
+ */
+ if (root->append_rel_array != NULL &&
+ (appinfo = root->append_rel_array[prior_rti]) != NULL)
+ {
+ RangeTblEntry *parent_rte;
+
+ parent_rte = planner_rt_fetch(appinfo->parent_relid, root);
+ if (parent_rte->rtekind == RTE_RELATION)
+ continue;
+ }
+
+ /* Skip NULL entries and joins. */
+ prior_rte = planner_rt_fetch(prior_rti, root);
+ if (prior_rte == NULL || prior_rte->rtekind == RTE_JOIN)
+ continue;
+
+ /* Skip if the alias name differs. */
+ if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0)
+ continue;
+
+ /* Looks like a true duplicate. */
+ ++occurrence;
+ }
+
+ /* If this is a child table, get the schema and relation names. */
+ if (rti != top_rti)
+ {
+ partnsp = get_namespace_name_or_temp(get_rel_namespace(rte->relid));
+ partrel = get_rel_name(rte->relid);
+ }
+
+ /* OK, we have all the answers we need. Return them to the caller. */
+ rid->alias_name = top_rte->eref->aliasname;
+ rid->occurrence = occurrence;
+ rid->partnsp = partnsp;
+ rid->partrel = partrel;
+ rid->plan_name = root->plan_name;
+}
+
+/*
+ * Compute a relation identifier for a set of RTIs, except for any RTE_JOIN
+ * RTIs that may be present.
+ *
+ * RTE_JOIN entries are excluded because they cannot be mentioned by plan
+ * advice.
+ *
+ * The caller is responsible for making sure that the rids array is large
+ * enough to store the results.
+ *
+ * The return value is the number of identifiers computed.
+ */
+int
+pgpa_compute_identifiers_by_relids(PlannerInfo *root, Bitmapset *relids,
+ pgpa_identifier *rids)
+{
+ int count = 0;
+ int rti = -1;
+
+ while ((rti = bms_next_member(relids, rti)) >= 0)
+ {
+ RangeTblEntry *rte = planner_rt_fetch(rti, root);
+
+ if (rte->rtekind == RTE_JOIN)
+ continue;
+ pgpa_compute_identifier_by_rti(root, rti, &rids[count++]);
+ }
+
+ Assert(count > 0);
+ return count;
+}
+
+/*
+ * Create an array of range table identifiers for all the non-NULL,
+ * non-RTE_JOIN entries in the PlannedStmt's range table.
+ */
+pgpa_identifier *
+pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt)
+{
+ Index rtable_length = list_length(pstmt->rtable);
+ pgpa_identifier *result = palloc0_array(pgpa_identifier, rtable_length);
+ Index *top_rti_map;
+ int rtinfoindex = 0;
+ SubPlanRTInfo *rtinfo = NULL;
+ SubPlanRTInfo *nextrtinfo = NULL;
+
+ /*
+ * Account for relations added by inheritance expansion of partitioned
+ * tables.
+ */
+	 */
+	top_rti_map = pgpa_create_top_rti_map(rtable_length, pstmt->rtable,
+										  pstmt->appendRelations);
+
+	/*
+	 * When we begin iterating, we're processing the portion of the range
+	 * table that originated from the top-level PlannerInfo, so rtinfo is
+	 * NULL. Later, rtinfo will be the SubPlanRTInfo for the subquery whose
+	 * portion of the range table we are processing. nextrtinfo is always the
+	 * SubPlanRTInfo that follows the current one, if any, so when we're
+	 * processing the top-level query's portion of the range table, the next
+	 * SubPlanRTInfo is the very first one.
+	 */
+	if (pstmt->subrtinfos != NULL)
+		nextrtinfo = linitial(pstmt->subrtinfos);
+
+	/* Main loop over the range table. */
+	for (Index rti = 1; rti <= rtable_length; rti++)
+	{
+		const char *plan_name;
+		Index		top_rti;
+		RangeTblEntry *rte;
+		RangeTblEntry *top_rte;
+		char	   *partnsp = NULL;
+		char	   *partrel = NULL;
+		int			occurrence;
+		pgpa_identifier *rid;
+
+		/*
+		 * Advance to the next SubPlanRTInfo, if it's time to do that.
+		 *
+		 * This loop probably shouldn't ever iterate more than once, because
+		 * that would imply that a subquery was planned but added nothing to
+		 * the range table; but let's be defensive and assume it can happen.
+		 */
+		while (nextrtinfo != NULL && rti > nextrtinfo->rtoffset)
+		{
+			rtinfo = nextrtinfo;
+			if (++rtinfoindex >= list_length(pstmt->subrtinfos))
+				nextrtinfo = NULL;
+			else
+				nextrtinfo = list_nth(pstmt->subrtinfos, rtinfoindex);
+		}
+
+		/* Fetch the range table entry, if any. */
+		rte = rt_fetch(rti, pstmt->rtable);
+
+		/*
+		 * We can't and don't need to identify null entries, and we don't want
+		 * to identify join entries.
+		 */
+		if (rte == NULL || rte->rtekind == RTE_JOIN)
+			continue;
+
+		/*
+		 * If this is not a relation added by partitioned table expansion,
+		 * then the top RTI/RTE are just the same as this RTI/RTE. Otherwise,
+		 * we need the information for the top RTI/RTE, and must also fetch
+		 * the partition schema and name.
+		 */
+		top_rti = top_rti_map[rti - 1];
+		if (rti == top_rti)
+			top_rte = rte;
+		else
+		{
+			top_rte = rt_fetch(top_rti, pstmt->rtable);
+			partnsp =
+				get_namespace_name_or_temp(get_rel_namespace(rte->relid));
+			partrel = get_rel_name(rte->relid);
+		}
+
+		/* Compute the correct occurrence number. */
+		occurrence = pgpa_occurrence_number(pstmt->rtable, top_rti_map,
+											rtinfo, top_rti);
+
+		/* Get the name of the current plan (NULL for toplevel query). */
+		plan_name = rtinfo == NULL ? NULL : rtinfo->plan_name;
+
+		/* Save all the details we've derived. */
+		rid = &result[rti - 1];
+		rid->alias_name = top_rte->eref->aliasname;
+		rid->occurrence = occurrence;
+		rid->partnsp = partnsp;
+		rid->partrel = partrel;
+		rid->plan_name = plan_name;
+	}
+
+	return result;
+}
+
+/*
+ * Search for a pgpa_identifier in the array of identifiers computed for the
+ * range table. If exactly one match is found, return the matching RTI; else
+ * return 0.
+ */
+Index
+pgpa_compute_rti_from_identifier(int rtable_length,
+								 pgpa_identifier *rt_identifiers,
+								 pgpa_identifier *rid)
+{
+	Index		result = 0;
+
+	for (Index rti = 1; rti <= rtable_length; ++rti)
+	{
+		pgpa_identifier *rti_rid = &rt_identifiers[rti - 1];
+
+		/* If there's no identifier for this RTI, skip it. */
+		if (rti_rid->alias_name == NULL)
+			continue;
+
+		/*
+		 * If it matches, return this RTI. As usual, an omitted partition
+		 * schema matches anything, but partition and plan names must either
+		 * match exactly or be omitted on both sides.
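+		 *
+		 * For example (names illustrative), advice written as foo/foo_p1
+		 * matches a computed identifier foo/public.foo_p1, because the
+		 * omitted partition schema acts as a wildcard; but foo@sub1 does not
+		 * match plain foo, because plan names must match exactly or be
+		 * absent on both sides.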
+		 */
+		if (strcmp(rid->alias_name, rti_rid->alias_name) == 0 &&
+			rid->occurrence == rti_rid->occurrence &&
+			(rid->partnsp == NULL || rti_rid->partnsp == NULL ||
+			 strcmp(rid->partnsp, rti_rid->partnsp) == 0) &&
+			strings_equal_or_both_null(rid->partrel, rti_rid->partrel) &&
+			strings_equal_or_both_null(rid->plan_name, rti_rid->plan_name))
+		{
+			if (result != 0)
+			{
+				/* Multiple matches were found. */
+				return 0;
+			}
+			result = rti;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * Build a mapping from each RTI to the RTI whose alias_name will be used to
+ * construct the range table identifier.
+ *
+ * For child relations, this is the topmost parent that is still of type
+ * RTE_RELATION. For other relations, it's just the original RTI.
+ *
+ * Since we're eventually going to need this information for every RTI in
+ * the range table, it's best to compute all the answers in a single pass over
+ * the AppendRelInfo list. Otherwise, we might end up searching through that
+ * list repeatedly for entries of interest.
+ *
+ * Note that the returned array uses zero-based indexing, while RTIs use
+ * 1-based indexing, so subtract 1 from the RTI before looking it up in the
+ * array.
+ */
+static Index *
+pgpa_create_top_rti_map(Index rtable_length, List *rtable, List *appinfos)
+{
+	Index	   *top_rti_map = palloc0_array(Index, rtable_length);
+
+	/* Initially, make every RTI point to itself. */
+	for (Index rti = 1; rti <= rtable_length; ++rti)
+		top_rti_map[rti - 1] = rti;
+
+	/* Update the map for each AppendRelInfo object. */
+	foreach_node(AppendRelInfo, appinfo, appinfos)
+	{
+		Index		parent_rti = appinfo->parent_relid;
+		RangeTblEntry *parent_rte = rt_fetch(parent_rti, rtable);
+
+		/* If the parent is not RTE_RELATION, ignore this entry. */
+		if (parent_rte->rtekind != RTE_RELATION)
+			continue;
+
+		/*
+		 * Map the child to wherever we mapped the parent. Parents always
+		 * precede their children in the AppendRelInfo list, so this should
+		 * work out.
+		 */
+		top_rti_map[appinfo->child_relid - 1] = top_rti_map[parent_rti - 1];
+	}
+
+	return top_rti_map;
+}
+
+/*
+ * Find the occurrence number of a certain relation within a certain subquery.
+ *
+ * The same alias name can occur multiple times within a subquery, but we want
+ * to disambiguate by giving different occurrences different integer indexes.
+ * However, child tables are disambiguated by including the table name rather
+ * than by incrementing the occurrence number; and joins are not named and so
+ * shouldn't increment the occurrence number either.
+ */
+static int
+pgpa_occurrence_number(List *rtable, Index *top_rti_map,
+					   SubPlanRTInfo *rtinfo, Index rti)
+{
+	Index		rtoffset = (rtinfo == NULL) ? 0 : rtinfo->rtoffset;
+	int			occurrence = 1;
+	RangeTblEntry *rte = rt_fetch(rti, rtable);
+
+	for (Index prior_rti = rtoffset + 1; prior_rti < rti; ++prior_rti)
+	{
+		RangeTblEntry *prior_rte;
+
+		/*
+		 * If this is a child rel of a parent that is a relation, skip it.
+		 *
+		 * Such range table entries are disambiguated by mentioning the schema
+		 * and name of the table, not by counting them as separate occurrences
+		 * of the same table.
+		 */
+		if (top_rti_map[prior_rti - 1] != prior_rti)
+			continue;
+
+		/* Skip joins. */
+		prior_rte = rt_fetch(prior_rti, rtable);
+		if (prior_rte->rtekind == RTE_JOIN)
+			continue;
+
+		/* Skip if the alias name differs. */
+		if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0)
+			continue;
+
+		/* Looks like a true duplicate.
*/ + ++occurrence; + } + + return occurrence; +} diff --git a/contrib/pg_plan_advice/pgpa_identifier.h b/contrib/pg_plan_advice/pgpa_identifier.h new file mode 100644 index 0000000000..b000d2b708 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.h + * create appropriate identifiers for range table entries + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PGPA_IDENTIFIER_H +#define PGPA_IDENTIFIER_H + +#include "nodes/pathnodes.h" +#include "nodes/plannodes.h" + +typedef struct pgpa_identifier +{ + const char *alias_name; + int occurrence; + const char *partnsp; + const char *partrel; + const char *plan_name; +} pgpa_identifier; + +/* Convenience function for comparing possibly-NULL strings. */ +static inline bool +strings_equal_or_both_null(const char *a, const char *b) +{ + if (a == b) + return true; + else if (a == NULL || b == NULL) + return false; + else + return strcmp(a, b) == 0; +} + +extern const char *pgpa_identifier_string(const pgpa_identifier *rid); +extern void pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, + pgpa_identifier *rid); +extern int pgpa_compute_identifiers_by_relids(PlannerInfo *root, + Bitmapset *relids, + pgpa_identifier *rids); +extern pgpa_identifier *pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt); + +extern Index pgpa_compute_rti_from_identifier(int rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_identifier *rid); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c new file mode 100644 index 0000000000..ec8e1a666e --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.c @@ -0,0 +1,629 @@ +/*------------------------------------------------------------------------- + * + * pgpa_join.c + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/pathnodes.h" +#include "nodes/print.h" +#include "parser/parsetree.h" + +/* + * Temporary object used when unrolling a join tree. + */ +struct pgpa_join_unroller +{ + unsigned nallocated; + unsigned nused; + Plan *outer_subplan; + ElidedNode *outer_elided_node; + bool outer_beneath_any_gather; + pgpa_join_strategy *strategy; + Plan **inner_subplans; + ElidedNode **inner_elided_nodes; + pgpa_join_unroller **inner_unrollers; + bool *inner_beneath_any_gather; +}; + +static pgpa_join_strategy pgpa_decompose_join(pgpa_plan_walker_context *walker, + Plan *plan, + Plan **realouter, + Plan **realinner, + ElidedNode **elidedrealouter, + ElidedNode **elidedrealinner, + bool *found_any_outer_gather, + bool *found_any_inner_gather); +static ElidedNode *pgpa_descend_node(PlannedStmt *pstmt, Plan **plan); +static ElidedNode *pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan, + bool *found_any_gather); +static bool pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node); + +static bool is_result_node_with_child(Plan *plan); +static bool is_sorting_plan(Plan *plan); + +/* + * Create an initially-empty object for unrolling joins. 
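+ *
+ * In sketch form, the expected lifecycle looks like this (variable names
+ * are illustrative, not taken from any particular caller):
+ *
+ *		unroller = pgpa_create_join_unroller();
+ *		... one or more calls to pgpa_unroll_join(..., unroller, ...) ...
+ *		ujoin = pgpa_build_unrolled_join(walker, unroller);
+ *		pgpa_destroy_join_unroller(unroller);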
+ *
+ * This function creates a helper object that can later be used to create a
+ * pgpa_unrolled_join, after first calling pgpa_unroll_join one or more times.
+ */
+pgpa_join_unroller *
+pgpa_create_join_unroller(void)
+{
+	pgpa_join_unroller *join_unroller;
+
+	join_unroller = palloc0_object(pgpa_join_unroller);
+	join_unroller->nallocated = 4;
+	join_unroller->strategy =
+		palloc_array(pgpa_join_strategy, join_unroller->nallocated);
+	join_unroller->inner_subplans =
+		palloc_array(Plan *, join_unroller->nallocated);
+	join_unroller->inner_elided_nodes =
+		palloc_array(ElidedNode *, join_unroller->nallocated);
+	join_unroller->inner_unrollers =
+		palloc_array(pgpa_join_unroller *, join_unroller->nallocated);
+	join_unroller->inner_beneath_any_gather =
+		palloc_array(bool, join_unroller->nallocated);
+
+	return join_unroller;
+}
+
+/*
+ * Unroll one level of an unrollable join tree.
+ *
+ * Our basic goal here is to unroll join trees as they occur in the Plan
+ * tree into a simpler and more regular structure that we can more easily
+ * use for further processing. Unrolling is outer-deep, so if the plan tree
+ * has Join1(Join2(A,B),Join3(C,D)), the same join unroller object should be
+ * used for Join1 and Join2, but a different one will be needed for Join3,
+ * since that involves a join within the *inner* side of another join.
+ *
+ * pgpa_plan_walker creates a "top level" join unroller object when it
+ * encounters a join in a portion of the plan tree in which no join unroller
+ * is already active. From there, this function is responsible for determining
+ * to what portion of the plan tree that join unroller applies, and for
+ * creating any subordinate join unroller objects that are needed as a result
+ * of non-outer-deep join trees. We do this by returning the join unroller
+ * objects that should be used for further traversal of the outer and inner
+ * subtrees of the current plan node via *outer_join_unroller and
+ * *inner_join_unroller, respectively.
+ */
+void
+pgpa_unroll_join(pgpa_plan_walker_context *walker, Plan *plan,
+				 bool beneath_any_gather,
+				 pgpa_join_unroller *join_unroller,
+				 pgpa_join_unroller **outer_join_unroller,
+				 pgpa_join_unroller **inner_join_unroller)
+{
+	pgpa_join_strategy strategy;
+	Plan	   *realinner,
+			   *realouter;
+	ElidedNode *elidedinner,
+			   *elidedouter;
+	int			n;
+	bool		found_any_outer_gather = false;
+	bool		found_any_inner_gather = false;
+
+	Assert(join_unroller != NULL);
+
+	/*
+	 * We need to pass the join_unroller object down through certain types of
+	 * plan nodes -- anything that's considered part of the join strategy, and
+	 * any other nodes that can occur in a join tree despite not being scans
+	 * or joins.
+	 *
+	 * This includes:
+	 *
+	 * (1) Materialize, Memoize, and Hash nodes, which are part of the join
+	 * strategy,
+	 *
+	 * (2) Gather and Gather Merge nodes, which can occur at any point in the
+	 * join tree where the planner decided to initiate parallelism,
+	 *
+	 * (3) Sort and IncrementalSort nodes, which can occur beneath MergeJoin
+	 * or GatherMerge,
+	 *
+	 * (4) Agg and Unique nodes, which can occur when we decide to make the
+	 * nullable side of a semijoin unique and then join the result, and
+	 *
+	 * (5) Result nodes with children, which can be added either to project
+	 * or to enforce a one-time filter (but Result nodes without children are
+	 * degenerate scans or joins).
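+	 *
+	 * For instance (plan shape illustrative), if the walker reaches the
+	 * Memoize node while unrolling NestLoop(SeqScan a, Memoize(IndexScan b)),
+	 * we simply hand the same unroller back via *outer_join_unroller rather
+	 * than ending the unrolled join there.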
+ */ + if (IsA(plan, Material) || IsA(plan, Memoize) || IsA(plan, Hash) + || IsA(plan, Gather) || IsA(plan, GatherMerge) + || is_sorting_plan(plan) || IsA(plan, Agg) || IsA(plan, Unique) + || is_result_node_with_child(plan)) + { + *outer_join_unroller = join_unroller; + return; + } + + /* + * Since we've already handled nodes that require pass-through treatment, + * this should be an unrollable join. + */ + strategy = pgpa_decompose_join(walker, plan, + &realouter, &realinner, + &elidedouter, &elidedinner, + &found_any_outer_gather, + &found_any_inner_gather); + + /* If our workspace is full, expand it. */ + if (join_unroller->nused >= join_unroller->nallocated) + { + join_unroller->nallocated *= 2; + join_unroller->strategy = + repalloc_array(join_unroller->strategy, + pgpa_join_strategy, + join_unroller->nallocated); + join_unroller->inner_subplans = + repalloc_array(join_unroller->inner_subplans, + Plan *, + join_unroller->nallocated); + join_unroller->inner_elided_nodes = + repalloc_array(join_unroller->inner_elided_nodes, + ElidedNode *, + join_unroller->nallocated); + join_unroller->inner_beneath_any_gather = + repalloc_array(join_unroller->inner_beneath_any_gather, + bool, + join_unroller->nallocated); + join_unroller->inner_unrollers = + repalloc_array(join_unroller->inner_unrollers, + pgpa_join_unroller *, + join_unroller->nallocated); + } + + /* + * Since we're flattening outer-deep join trees, it follows that if the + * outer side is still an unrollable join, it should be unrolled into this + * same object. Otherwise, we've reached the limit of what we can unroll + * into this object and must remember the outer side as the final outer + * subplan. + */ + if (elidedouter == NULL && pgpa_is_join(realouter)) + *outer_join_unroller = join_unroller; + else + { + join_unroller->outer_subplan = realouter; + join_unroller->outer_elided_node = elidedouter; + join_unroller->outer_beneath_any_gather = + beneath_any_gather || found_any_outer_gather; + } + + /* + * Store the inner subplan. If it's an unrollable join, it needs to be + * flattened in turn, but into a new unroller object, not this one. + */ + n = join_unroller->nused++; + join_unroller->strategy[n] = strategy; + join_unroller->inner_subplans[n] = realinner; + join_unroller->inner_elided_nodes[n] = elidedinner; + join_unroller->inner_beneath_any_gather[n] = + beneath_any_gather || found_any_inner_gather; + if (elidedinner == NULL && pgpa_is_join(realinner)) + *inner_join_unroller = pgpa_create_join_unroller(); + else + *inner_join_unroller = NULL; + join_unroller->inner_unrollers[n] = *inner_join_unroller; +} + +/* + * Use the data we've accumulated in a pgpa_join_unroller object to construct + * a pgpa_unrolled_join. + */ +pgpa_unrolled_join * +pgpa_build_unrolled_join(pgpa_plan_walker_context *walker, + pgpa_join_unroller *join_unroller) +{ + pgpa_unrolled_join *ujoin; + int i; + + /* + * We shouldn't have gone even so far as to create a join unroller unless + * we found at least one unrollable join. + */ + Assert(join_unroller->nused > 0); + + /* Allocate result structures. */ + ujoin = palloc0_object(pgpa_unrolled_join); + ujoin->ninner = join_unroller->nused; + ujoin->strategy = palloc0_array(pgpa_join_strategy, join_unroller->nused); + ujoin->inner = palloc0_array(pgpa_join_member, join_unroller->nused); + + /* Handle the outermost join. 
 */
+	ujoin->outer.plan = join_unroller->outer_subplan;
+	ujoin->outer.elided_node = join_unroller->outer_elided_node;
+	ujoin->outer.scan =
+		pgpa_build_scan(walker, ujoin->outer.plan,
+						ujoin->outer.elided_node,
+						join_unroller->outer_beneath_any_gather,
+						true);
+
+	/*
+	 * We want the joins from the deepest part of the plan tree to appear
+	 * first in the result object, but the join unroller adds them in exactly
+	 * the reverse of that order, so we need to flip the order of the arrays
+	 * when constructing the final result.
+	 */
+	for (i = 0; i < join_unroller->nused; ++i)
+	{
+		int			k = join_unroller->nused - i - 1;
+
+		/* Copy strategy, Plan, and ElidedNode. */
+		ujoin->strategy[i] = join_unroller->strategy[k];
+		ujoin->inner[i].plan = join_unroller->inner_subplans[k];
+		ujoin->inner[i].elided_node = join_unroller->inner_elided_nodes[k];
+
+		/*
+		 * Fill in remaining details, using either the nested join unroller,
+		 * or by deriving them from the plan and elided nodes.
+		 */
+		if (join_unroller->inner_unrollers[k] != NULL)
+			ujoin->inner[i].unrolled_join =
+				pgpa_build_unrolled_join(walker,
+										 join_unroller->inner_unrollers[k]);
+		else
+			ujoin->inner[i].scan =
+				pgpa_build_scan(walker, ujoin->inner[i].plan,
+								ujoin->inner[i].elided_node,
+								join_unroller->inner_beneath_any_gather[k],
+								true);
+	}
+
+	return ujoin;
+}
+
+/*
+ * Free memory allocated for pgpa_join_unroller.
+ */
+void
+pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller)
+{
+	pfree(join_unroller->strategy);
+	pfree(join_unroller->inner_subplans);
+	pfree(join_unroller->inner_elided_nodes);
+	pfree(join_unroller->inner_unrollers);
+	pfree(join_unroller->inner_beneath_any_gather);
+	pfree(join_unroller);
+}
+
+/*
+ * Identify the join strategy used by a join and the "real" inner and outer
+ * plans.
+ *
+ * For example, a Hash Join always has a Hash node on the inner side, but
+ * for all intents and purposes the real inner input is the Hash node's child,
+ * not the Hash node itself.
+ *
+ * Likewise, a Merge Join may have a Sort node on the inner or outer side; if
+ * it does, the real input to the join is the Sort node's child, not the
+ * Sort node itself.
+ *
+ * In addition, with a Merge Join or a Nested Loop, the join planning code
+ * may add additional nodes such as Materialize or Memoize. We regard these
+ * as an aspect of the join strategy. As in the previous cases, the true input
+ * to the join is the underlying node.
+ *
+ * However, if any involved child node previously had a now-elided node stacked
+ * on top, then we can't "look through" that node -- indeed, what's going to be
+ * relevant for our purposes is the ElidedNode on top of that plan node, rather
+ * than the plan node itself.
+ *
+ * If there are multiple elided nodes, we want the one that would have been
+ * uppermost in the plan tree prior to setrefs processing; we expect to find
+ * that one last in the list of elided nodes.
+ *
+ * On return *realouter and *realinner will have been set to the real outer
+ * and real inner plans that we identified, and *elidedrealouter and
+ * *elidedrealinner to the last of any corresponding elided nodes.
+ * Additionally, *found_any_outer_gather and *found_any_inner_gather will
+ * be set to true if we looked through a Gather or Gather Merge node on
+ * that side of the join, and false otherwise.
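+ *
+ * As a concrete illustration (plan shape hypothetical): given
+ * MergeJoin(Sort(SeqScan a), Material(Sort(SeqScan b))), we report
+ * JSTRAT_MERGE_JOIN_MATERIALIZE and set *realouter and *realinner to the
+ * scans of "a" and "b" respectively.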
+ */ +static pgpa_join_strategy +pgpa_decompose_join(pgpa_plan_walker_context *walker, Plan *plan, + Plan **realouter, Plan **realinner, + ElidedNode **elidedrealouter, ElidedNode **elidedrealinner, + bool *found_any_outer_gather, bool *found_any_inner_gather) +{ + PlannedStmt *pstmt = walker->pstmt; + JoinType jointype = ((Join *) plan)->jointype; + Plan *outerplan = plan->lefttree; + Plan *innerplan = plan->righttree; + ElidedNode *elidedouter; + ElidedNode *elidedinner; + pgpa_join_strategy strategy; + bool uniqueouter; + bool uniqueinner; + + elidedouter = pgpa_last_elided_node(pstmt, outerplan); + elidedinner = pgpa_last_elided_node(pstmt, innerplan); + *found_any_outer_gather = false; + *found_any_inner_gather = false; + + switch (nodeTag(plan)) + { + case T_MergeJoin: + + /* + * The planner may have chosen to place a Material node on the + * inner side of the MergeJoin; if this is present, we record it + * as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_MERGE_JOIN_MATERIALIZE; + } + else + strategy = JSTRAT_MERGE_JOIN_PLAIN; + + /* + * For a MergeJoin, either the outer or the inner subplan, or + * both, may have needed to be sorted; we must disregard any Sort + * or IncrementalSort node to find the real inner or outer + * subplan. + */ + if (elidedouter == NULL && is_sorting_plan(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_sorting_plan(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + break; + + case T_NestLoop: + + /* + * The planner may have chosen to place a Material or Memoize node + * on the inner side of the NestLoop; if this is present, we + * record it as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MATERIALIZE; + } + else if (elidedinner == NULL && IsA(innerplan, Memoize)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MEMOIZE; + } + else + strategy = JSTRAT_NESTED_LOOP_PLAIN; + break; + + case T_HashJoin: + + /* + * The inner subplan of a HashJoin is always a Hash node; the real + * inner subplan is the Hash node's child. + */ + Assert(IsA(innerplan, Hash)); + Assert(elidedinner == NULL); + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_HASH_JOIN; + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan)); + } + + /* + * The planner may have decided to implement a semijoin by first making + * the nullable side of the plan unique, and then performing a normal join + * against the result. Therefore, we might need to descend through a + * unique node on either side of the plan. + */ + uniqueouter = pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter); + uniqueinner = pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner); + + /* + * The planner may have decided to parallelize part of the join tree, so + * we could find a Gather or Gather Merge node here. Note that, if + * present, this will appear below nodes we considered as part of the join + * strategy, but we could find another uniqueness-enforcing node below the + * Gather or Gather Merge, if present. 
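+	 *
+	 * For example (shape hypothetical), the inner side of a hash join might
+	 * look like Hash -> Gather -> Unique -> ..., in which case we descend
+	 * through both the Gather and the Unique node here.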
+	 */
+	if (elidedouter == NULL)
+	{
+		elidedouter = pgpa_descend_any_gather(pstmt, &outerplan,
+											  found_any_outer_gather);
+		if (found_any_outer_gather &&
+			pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter))
+			uniqueouter = true;
+	}
+	if (elidedinner == NULL)
+	{
+		elidedinner = pgpa_descend_any_gather(pstmt, &innerplan,
+											  found_any_inner_gather);
+		if (found_any_inner_gather &&
+			pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner))
+			uniqueinner = true;
+	}
+
+	/*
+	 * It's possible that a Result node has been inserted either to project a
+	 * target list or to implement a one-time filter. If so, we can descend
+	 * through it. Note that a Result node without a child would be a
+	 * degenerate scan or join, and not something we could descend through.
+	 *
+	 * XXX. I suspect it's possible for this to happen above the Gather or
+	 * Gather Merge node, too, but apparently we have no test case for that
+	 * scenario.
+	 */
+	if (elidedouter == NULL && is_result_node_with_child(outerplan))
+		elidedouter = pgpa_descend_node(pstmt, &outerplan);
+	if (elidedinner == NULL && is_result_node_with_child(innerplan))
+		elidedinner = pgpa_descend_node(pstmt, &innerplan);
+
+	/*
+	 * If this is a semijoin that was converted to an inner join by making one
+	 * side or the other unique, make a note that the inner or outer subplan,
+	 * as appropriate, should be treated as a query plan feature when the main
+	 * tree traversal reaches it.
+	 *
+	 * Conversely, if the planner could have made one side of the join unique
+	 * and thereby converted it to an inner join, and chose not to do so, that
+	 * is also worth noting.
+	 *
+	 * NB: This code could appear slightly higher up in this function, but
+	 * none of the nodes through which we just descended should have
+	 * associated RTIs.
+	 *
+	 * NB: This seems like a somewhat hacky way of passing information up to
+	 * the main tree walk, but I don't currently have a better idea.
+	 */
+	if (uniqueouter)
+		pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, outerplan);
+	else if (jointype == JOIN_RIGHT_SEMI)
+		pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, outerplan);
+	if (uniqueinner)
+		pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, innerplan);
+	else if (jointype == JOIN_SEMI)
+		pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, innerplan);
+
+	/* Set output parameters. */
+	*realouter = outerplan;
+	*realinner = innerplan;
+	*elidedrealouter = elidedouter;
+	*elidedrealinner = elidedinner;
+	return strategy;
+}
+
+/*
+ * Descend through a Plan node in a join tree that the caller has determined
+ * to be irrelevant.
+ *
+ * Updates *plan, and returns the last of any elided nodes pertaining to the
+ * new plan node.
+ */
+static ElidedNode *
+pgpa_descend_node(PlannedStmt *pstmt, Plan **plan)
+{
+	*plan = (*plan)->lefttree;
+	return pgpa_last_elided_node(pstmt, *plan);
+}
+
+/*
+ * Descend through a Gather or Gather Merge node, if present, and any Sort
+ * or IncrementalSort node occurring under a Gather Merge.
+ *
+ * Caller should have verified that there is no ElidedNode pertaining to
+ * the initial value of *plan.
+ *
+ * Updates *plan, and returns the last of any elided nodes pertaining to the
+ * new plan node. Sets *found_any_gather = true if either Gather or
+ * Gather Merge was found, and otherwise leaves it unchanged.
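+ *
+ * For example (shape hypothetical), given GatherMerge -> Sort -> SeqScan,
+ * *plan ends up pointing at the SeqScan, assuming no ElidedNode is found
+ * along the way.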
+ */
+static ElidedNode *
+pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan,
+						bool *found_any_gather)
+{
+	if (IsA(*plan, Gather))
+	{
+		*found_any_gather = true;
+		return pgpa_descend_node(pstmt, plan);
+	}
+
+	if (IsA(*plan, GatherMerge))
+	{
+		ElidedNode *elided = pgpa_descend_node(pstmt, plan);
+
+		if (elided == NULL && is_sorting_plan(*plan))
+			elided = pgpa_descend_node(pstmt, plan);
+
+		*found_any_gather = true;
+		return elided;
+	}
+
+	return NULL;
+}
+
+/*
+ * If *plan is an Agg or Unique node, we want to descend through it, unless
+ * it has a corresponding elided node. If its immediate child is a Sort or
+ * IncrementalSort, we also want to descend through that, unless it has a
+ * corresponding elided node.
+ *
+ * On entry, *elided_node must be the last of any elided nodes corresponding
+ * to *plan; on exit, this will still be true, but *plan may have been updated.
+ *
+ * The reason we don't want to descend through elided nodes is that a single
+ * join tree can't cross through any sort of elided node: subqueries are
+ * planned separately, and planning inside an Append or MergeAppend is
+ * separate from planning outside of it.
+ *
+ * The return value is true if we descend through a node that we believe is
+ * making one side of a semijoin unique, and otherwise false.
+ */
+static bool
+pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan,
+						ElidedNode **elided_node)
+{
+	bool		descend = false;
+	bool		sjunique = false;
+
+	if (*elided_node != NULL)
+		return sjunique;
+
+	if (IsA(*plan, Unique))
+	{
+		descend = true;
+		sjunique = true;
+	}
+	else if (IsA(*plan, Agg))
+	{
+		/*
+		 * If this is a simple Agg node, then assume it's here to implement
+		 * semijoin uniqueness. Otherwise, assume it's completing an eager
+		 * aggregation or partitionwise aggregation operation that began at a
+		 * higher level of the plan tree.
+		 *
+		 * XXX. I suspect this logic does not cover all cases: couldn't SJ
+		 * uniqueness be implemented in two steps with an intermediate Gather?
+		 */
+		descend = true;
+		sjunique = (((Agg *) *plan)->aggsplit == AGGSPLIT_SIMPLE);
+	}
+
+	if (descend)
+	{
+		*elided_node = pgpa_descend_node(pstmt, plan);
+
+		if (*elided_node == NULL && is_sorting_plan(*plan))
+			*elided_node = pgpa_descend_node(pstmt, plan);
+	}
+
+	return sjunique;
+}
+
+/*
+ * Is this a Result node that has a child?
+ */
+static bool
+is_result_node_with_child(Plan *plan)
+{
+	return IsA(plan, Result) && plan->lefttree != NULL;
+}
+
+/*
+ * Is this a Plan node whose purpose is to put the data in a certain order?
+ */
+static bool
+is_sorting_plan(Plan *plan)
+{
+	return IsA(plan, Sort) || IsA(plan, IncrementalSort);
+}
diff --git a/contrib/pg_plan_advice/pgpa_join.h b/contrib/pg_plan_advice/pgpa_join.h
new file mode 100644
index 0000000000..4dc72986a7
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_join.h
@@ -0,0 +1,105 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_join.h
+ *	  analysis of joins in Plan trees
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_join.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_JOIN_H
+#define PGPA_JOIN_H
+
+#include "nodes/plannodes.h"
+
+typedef struct pgpa_plan_walker_context pgpa_plan_walker_context;
+typedef struct pgpa_join_unroller pgpa_join_unroller;
+typedef struct pgpa_unrolled_join pgpa_unrolled_join;
+
+/*
+ * Although there are three main join strategies, we try to classify things
+ * more precisely here: merge joins have the option of using materialization
+ * on the inner side, and nested loops can use either materialization or
+ * memoization.
+ */
+typedef enum
+{
+	JSTRAT_MERGE_JOIN_PLAIN = 0,
+	JSTRAT_MERGE_JOIN_MATERIALIZE,
+	JSTRAT_NESTED_LOOP_PLAIN,
+	JSTRAT_NESTED_LOOP_MATERIALIZE,
+	JSTRAT_NESTED_LOOP_MEMOIZE,
+	JSTRAT_HASH_JOIN
+	/* update NUM_PGPA_JOIN_STRATEGY if you add anything here */
+} pgpa_join_strategy;

+#define NUM_PGPA_JOIN_STRATEGY ((int) JSTRAT_HASH_JOIN + 1)
+
+/*
+ * In an outer-deep join tree, every member of an unrolled join will be a scan,
+ * but join trees with other shapes can contain unrolled joins.
+ *
+ * The plan node we store here will be the inner or outer child of the join
+ * node, as appropriate, except that we look through subnodes that we regard as
+ * part of the join method itself. For instance, for a Nested Loop that
+ * materializes the inner input, we'll store the child of the Materialize node,
+ * not the Materialize node itself.
+ *
+ * If setrefs processing elided one or more nodes from the plan tree, then
+ * we'll store details about the topmost of those in elided_node; otherwise,
+ * it will be NULL.
+ *
+ * Exactly one of scan and unrolled_join will be non-NULL.
+ */
+typedef struct
+{
+	Plan	   *plan;
+	ElidedNode *elided_node;
+	struct pgpa_scan *scan;
+	pgpa_unrolled_join *unrolled_join;
+} pgpa_join_member;
+
+/*
+ * We convert outer-deep join trees to a flat structure; that is, ((A JOIN B)
+ * JOIN C) JOIN D gets converted to outer = A, inner = <B, C, D>. When joins
+ * aren't outer-deep, substructure is required, e.g. (A JOIN B) JOIN (C JOIN D)
+ * is represented as outer = A, inner = <B, X>, where X is a pgpa_unrolled_join
+ * covering C-D.
+ */
+struct pgpa_unrolled_join
+{
+	/* Outermost member; must not itself be an unrolled join. */
+	pgpa_join_member outer;
+
+	/* Number of inner members. Length of the strategy and inner arrays. */
+	unsigned	ninner;
+
+	/* Array of strategies, one per non-outermost member. */
+	pgpa_join_strategy *strategy;
+
+	/* Array of members, excluding the outermost. Deepest first. */
+	pgpa_join_member *inner;
+};
+
+/*
+ * Does this plan node inherit from Join?
+ */
+static inline bool
+pgpa_is_join(Plan *plan)
+{
+	return IsA(plan, NestLoop) || IsA(plan, MergeJoin) || IsA(plan, HashJoin);
+}
+
+extern pgpa_join_unroller *pgpa_create_join_unroller(void);
+extern void pgpa_unroll_join(pgpa_plan_walker_context *walker,
+							 Plan *plan, bool beneath_any_gather,
+							 pgpa_join_unroller *join_unroller,
+							 pgpa_join_unroller **outer_join_unroller,
+							 pgpa_join_unroller **inner_join_unroller);
+extern pgpa_unrolled_join *pgpa_build_unrolled_join(pgpa_plan_walker_context *walker,
+													pgpa_join_unroller *join_unroller);
+extern void pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_output.c b/contrib/pg_plan_advice/pgpa_output.c
new file mode 100644
index 0000000000..67647acdf5
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_output.c
@@ -0,0 +1,571 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_output.c
+ *	  produce textual output from the results of a plan tree walk
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_output.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgpa_output.h"
+#include "pgpa_scan.h"
+
+#include "nodes/parsenodes.h"
+#include "parser/parsetree.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+/*
+ * Context object for textual advice generation.
+ *
+ * rid_strings is an array of identifier strings, one per range table entry,
+ * derived from the caller-provided range table identifiers. See the comments
+ * at the top of pgpa_identifier.c for more details.
+ *
+ * buf is the caller-provided output buffer.
+ *
+ * wrap_column is the column at which we try to wrap lines, so that we don't
+ * create output that is too wide. See pgpa_maybe_linebreak() and comments in
+ * pgpa_output_advice.
+ */
+typedef struct pgpa_output_context
+{
+	const char **rid_strings;
+	StringInfo	buf;
+	int			wrap_column;
+} pgpa_output_context;
+
+static void pgpa_output_unrolled_join(pgpa_output_context *context,
+									  pgpa_unrolled_join *join);
+static void pgpa_output_join_member(pgpa_output_context *context,
+									pgpa_join_member *member);
+static void pgpa_output_scan_strategy(pgpa_output_context *context,
+									  pgpa_scan_strategy strategy,
+									  List *scans);
+static void pgpa_output_relation_name(pgpa_output_context *context, Oid relid);
+static void pgpa_output_query_feature(pgpa_output_context *context,
+									  pgpa_qf_type type,
+									  List *query_features);
+static void pgpa_output_simple_strategy(pgpa_output_context *context,
+										char *strategy,
+										List *relid_sets);
+static void pgpa_output_no_gather(pgpa_output_context *context,
+								  Bitmapset *relids);
+static void pgpa_output_relations(pgpa_output_context *context, StringInfo buf,
+								  Bitmapset *relids);
+
+static char *pgpa_cstring_join_strategy(pgpa_join_strategy strategy);
+static char *pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy);
+static char *pgpa_cstring_query_feature_type(pgpa_qf_type type);
+
+static void pgpa_maybe_linebreak(StringInfo buf, int wrap_column);
+
+/*
+ * Append query advice to the provided buffer.
+ *
+ * Before calling this function, 'walker' must be used to iterate over the
+ * main plan tree and all subplans from the PlannedStmt.
+ *
+ * 'rt_identifiers' is a table of unique identifiers, one for each RTI.
+ * See pgpa_create_identifiers_for_planned_stmt().
+ *
+ * Results will be appended to 'buf'.
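+ *
+ * The result is a newline-separated sequence of advice items, along the
+ * lines of (contents hypothetical):
+ *
+ *		JOIN_ORDER(a b)
+ *		SEQ_SCAN(a)
+ *		HASH_JOIN(b)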
+ */
+void
+pgpa_output_advice(StringInfo buf, pgpa_plan_walker_context *walker,
+				   pgpa_identifier *rt_identifiers)
+{
+	Index		rtable_length = list_length(walker->pstmt->rtable);
+	ListCell   *lc;
+	pgpa_output_context context;
+
+	/* Basic initialization. */
+	memset(&context, 0, sizeof(pgpa_output_context));
+	context.buf = buf;
+
+	/*
+	 * Convert identifiers to string form. Note that the loop variable here is
+	 * not an RTI, because RTIs are 1-based. Some RTIs will have no
+	 * identifier, either because the rtekind is RTE_JOIN or because that
+	 * portion of the query didn't make it into the final plan.
+	 */
+	context.rid_strings = palloc0_array(const char *, rtable_length);
+	for (int i = 0; i < rtable_length; ++i)
+		if (rt_identifiers[i].alias_name != NULL)
+			context.rid_strings[i] = pgpa_identifier_string(&rt_identifiers[i]);
+
+	/*
+	 * If the user chooses to use EXPLAIN (PLAN_ADVICE) in an 80-column window
+	 * from a psql client with default settings, psql will add one space to
+	 * the left of the output and EXPLAIN will add two more to the left of the
+	 * advice. Thus, lines of more than 77 characters will wrap. We set the
+	 * wrap limit to 76 here so that the output won't reach all the way to the
+	 * very last column of the terminal.
+	 *
+	 * Of course, this is a fairly arbitrary set of assumptions, and one could
+	 * well make an argument for a different wrap limit, or for a configurable
+	 * one.
+	 */
+	context.wrap_column = 76;
+
+	/*
+	 * Each piece of JOIN_ORDER() advice fully describes the join order for
+	 * a single unrolled join. Merging is not permitted, because that would
+	 * change the meaning, e.g. SEQ_SCAN(a b c d) means simply that sequential
+	 * scans should be used for all of those relations, and is thus equivalent
+	 * to SEQ_SCAN(a b) SEQ_SCAN(c d), but JOIN_ORDER(a b c d) means that "a"
+	 * is the driving table which is then joined to "b" then "c" then "d",
+	 * which is totally different from JOIN_ORDER(a b) and JOIN_ORDER(c d).
+	 */
+	foreach(lc, walker->toplevel_unrolled_joins)
+	{
+		pgpa_unrolled_join *ujoin = lfirst(lc);
+
+		if (buf->len > 0)
+			appendStringInfoChar(buf, '\n');
+		appendStringInfo(context.buf, "JOIN_ORDER(");
+		pgpa_output_unrolled_join(&context, ujoin);
+		appendStringInfoChar(context.buf, ')');
+		pgpa_maybe_linebreak(context.buf, context.wrap_column);
+	}
+
+	/* Emit join strategy advice. */
+	for (int s = 0; s < NUM_PGPA_JOIN_STRATEGY; ++s)
+	{
+		char	   *strategy = pgpa_cstring_join_strategy(s);
+
+		pgpa_output_simple_strategy(&context,
+									strategy,
+									walker->join_strategies[s]);
+	}
+
+	/*
+	 * Emit scan strategy advice (but not for ordinary scans, which are
+	 * definitionally uninteresting).
+	 */
+	for (int c = 0; c < NUM_PGPA_SCAN_STRATEGY; ++c)
+		if (c != PGPA_SCAN_ORDINARY)
+			pgpa_output_scan_strategy(&context, c, walker->scans[c]);
+
+	/* Emit query feature advice. */
+	for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t)
+		pgpa_output_query_feature(&context, t, walker->query_features[t]);
+
+	/* Emit NO_GATHER advice. */
+	pgpa_output_no_gather(&context, walker->no_gather_scans);
+}
+
+/*
+ * Output the members of an unrolled join, first the outermost member, and
+ * then the inner members one by one, as part of JOIN_ORDER() advice.
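+ *
+ * For example (names hypothetical), an unrolled join whose outer member is
+ * "a" and whose inner members are "b" and a nested unrolled join over "c"
+ * and "d" is rendered as: a b (c d)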
+ */ +static void +pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join) +{ + pgpa_output_join_member(context, &join->outer); + + for (int k = 0; k < join->ninner; ++k) + { + pgpa_join_member *member = &join->inner[k]; + + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_join_member(context, member); + } +} + +/* + * Output a single member of an unrolled join as part of JOIN_ORDER() advice. + */ +static void +pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member) +{ + if (member->unrolled_join != NULL) + { + appendStringInfoChar(context->buf, '('); + pgpa_output_unrolled_join(context, member->unrolled_join); + appendStringInfoChar(context->buf, ')'); + } + else + { + pgpa_scan *scan = member->scan; + + Assert(scan != NULL); + if (bms_membership(scan->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, scan->relids); + else + { + appendStringInfoChar(context->buf, '{'); + pgpa_output_relations(context, context->buf, scan->relids); + appendStringInfoChar(context->buf, '}'); + } + } +} + +/* + * Output advice for a List of pgpa_scan objects. + * + * All the scans must use the strategy specified by the "strategy" argument. + */ +static void +pgpa_output_scan_strategy(pgpa_output_context *context, + pgpa_scan_strategy strategy, + List *scans) +{ + bool first = true; + + if (scans == NIL) + return; + + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfo(context->buf, "%s(", + pgpa_cstring_scan_strategy(strategy)); + + foreach_ptr(pgpa_scan, scan, scans) + { + Plan *plan = scan->plan; + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + } + + /* Output the relation identifiers. */ + if (bms_membership(scan->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, scan->relids); + else + { + appendStringInfoChar(context->buf, '('); + pgpa_output_relations(context, context->buf, scan->relids); + appendStringInfoChar(context->buf, ')'); + } + + /* For index or index-only scans, output index information. */ + if (strategy == PGPA_SCAN_INDEX) + { + Assert(IsA(plan, IndexScan)); + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_relation_name(context, ((IndexScan *) plan)->indexid); + } + else if (strategy == PGPA_SCAN_INDEX_ONLY) + { + Assert(IsA(plan, IndexOnlyScan)); + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_relation_name(context, + ((IndexOnlyScan *) plan)->indexid); + } + } + + appendStringInfoChar(context->buf, ')'); + pgpa_maybe_linebreak(context->buf, context->wrap_column); +} + +/* + * Output a schema-qualified relation name. + */ +static void +pgpa_output_relation_name(pgpa_output_context *context, Oid relid) +{ + Oid nspoid = get_rel_namespace(relid); + char *relnamespace = get_namespace_name_or_temp(nspoid); + char *relname = get_rel_name(relid); + + appendStringInfoString(context->buf, quote_identifier(relnamespace)); + appendStringInfoChar(context->buf, '.'); + appendStringInfoString(context->buf, quote_identifier(relname)); +} + +/* + * Output advice for a List of pgpa_query_feature objects. + * + * All features must be of the type specified by the "type" argument. 
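+ *
+ * For example (contents hypothetical), a list of GATHER features could be
+ * rendered as GATHER(a (b c)), where the parenthesized sub-list is a single
+ * feature whose relid set has more than one member.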
+ */ +static void +pgpa_output_query_feature(pgpa_output_context *context, pgpa_qf_type type, + List *query_features) +{ + bool first = true; + + if (query_features == NIL) + return; + + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfo(context->buf, "%s(", + pgpa_cstring_query_feature_type(type)); + + foreach_ptr(pgpa_query_feature, qf, query_features) + { + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + } + + if (bms_membership(qf->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, qf->relids); + else + { + appendStringInfoChar(context->buf, '('); + pgpa_output_relations(context, context->buf, qf->relids); + appendStringInfoChar(context->buf, ')'); + } + } + + appendStringInfoChar(context->buf, ')'); + pgpa_maybe_linebreak(context->buf, context->wrap_column); +} + +/* + * Output "simple" advice for a List of Bitmapset objects each of which + * contains one or more RTIs. + * + * By simple, we just mean that the advice emitted follows the most + * straightforward pattern: the strategy name, followed by a list of items + * separated by spaces and surrounded by parentheses. Individual items in + * the list are a single relation identifier for a Bitmapset that contains + * just one member, or a sub-list again separated by spaces and surrounded + * by parentheses for a Bitmapset with multiple members. Bitmapsets with + * no members probably shouldn't occur here, but if they do they'll be + * rendered as an empty sub-list. + */ +static void +pgpa_output_simple_strategy(pgpa_output_context *context, char *strategy, + List *relid_sets) +{ + bool first = true; + + if (relid_sets == NIL) + return; + + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfo(context->buf, "%s(", strategy); + + foreach_node(Bitmapset, relids, relid_sets) + { + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + } + + if (bms_membership(relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, relids); + else + { + appendStringInfoChar(context->buf, '('); + pgpa_output_relations(context, context->buf, relids); + appendStringInfoChar(context->buf, ')'); + } + } + + appendStringInfoChar(context->buf, ')'); + pgpa_maybe_linebreak(context->buf, context->wrap_column); +} + +/* + * Output NO_GATHER advice for all relations not appearing beneath any + * Gather or Gather Merge node. + */ +static void +pgpa_output_no_gather(pgpa_output_context *context, Bitmapset *relids) +{ + if (relids == NULL) + return; + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfoString(context->buf, "NO_GATHER("); + pgpa_output_relations(context, context->buf, relids); + appendStringInfoChar(context->buf, ')'); +} + +/* + * Output the identifiers for each RTI in the provided set. + * + * Identifiers are separated by spaces, and a line break is possible after + * each one. 
+ */
+static void
+pgpa_output_relations(pgpa_output_context *context, StringInfo buf,
+					  Bitmapset *relids)
+{
+	int			rti = -1;
+	bool		first = true;
+
+	while ((rti = bms_next_member(relids, rti)) >= 0)
+	{
+		const char *rid_string = context->rid_strings[rti - 1];
+
+		if (rid_string == NULL)
+			elog(ERROR, "no identifier for RTI %d", rti);
+
+		if (first)
+		{
+			first = false;
+			appendStringInfoString(buf, rid_string);
+		}
+		else
+		{
+			pgpa_maybe_linebreak(buf, context->wrap_column);
+			appendStringInfo(buf, " %s", rid_string);
+		}
+	}
+}
+
+/*
+ * Get a C string that corresponds to the specified join strategy.
+ */
+static char *
+pgpa_cstring_join_strategy(pgpa_join_strategy strategy)
+{
+	switch (strategy)
+	{
+		case JSTRAT_MERGE_JOIN_PLAIN:
+			return "MERGE_JOIN_PLAIN";
+		case JSTRAT_MERGE_JOIN_MATERIALIZE:
+			return "MERGE_JOIN_MATERIALIZE";
+		case JSTRAT_NESTED_LOOP_PLAIN:
+			return "NESTED_LOOP_PLAIN";
+		case JSTRAT_NESTED_LOOP_MATERIALIZE:
+			return "NESTED_LOOP_MATERIALIZE";
+		case JSTRAT_NESTED_LOOP_MEMOIZE:
+			return "NESTED_LOOP_MEMOIZE";
+		case JSTRAT_HASH_JOIN:
+			return "HASH_JOIN";
+	}
+
+	pg_unreachable();
+	return NULL;
+}
+
+/*
+ * Get a C string that corresponds to the specified scan strategy.
+ */
+static char *
+pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy)
+{
+	switch (strategy)
+	{
+		case PGPA_SCAN_ORDINARY:
+			return "ORDINARY_SCAN";
+		case PGPA_SCAN_SEQ:
+			return "SEQ_SCAN";
+		case PGPA_SCAN_BITMAP_HEAP:
+			return "BITMAP_HEAP_SCAN";
+		case PGPA_SCAN_FOREIGN:
+			return "FOREIGN_JOIN";
+		case PGPA_SCAN_INDEX:
+			return "INDEX_SCAN";
+		case PGPA_SCAN_INDEX_ONLY:
+			return "INDEX_ONLY_SCAN";
+		case PGPA_SCAN_PARTITIONWISE:
+			return "PARTITIONWISE";
+		case PGPA_SCAN_TID:
+			return "TID_SCAN";
+	}
+
+	pg_unreachable();
+	return NULL;
+}
+
+/*
+ * Get a C string that corresponds to the specified query feature type.
+ */
+static char *
+pgpa_cstring_query_feature_type(pgpa_qf_type type)
+{
+	switch (type)
+	{
+		case PGPAQF_GATHER:
+			return "GATHER";
+		case PGPAQF_GATHER_MERGE:
+			return "GATHER_MERGE";
+		case PGPAQF_SEMIJOIN_NON_UNIQUE:
+			return "SEMIJOIN_NON_UNIQUE";
+		case PGPAQF_SEMIJOIN_UNIQUE:
+			return "SEMIJOIN_UNIQUE";
+	}
+
+	pg_unreachable();
+	return NULL;
+}
+
+/*
+ * Insert a line break into the StringInfoData, if needed.
+ *
+ * If wrap_column is zero or negative, this does nothing. Otherwise, we
+ * consider inserting a newline. We only insert a newline if the length of
+ * the last line in the buffer exceeds wrap_column, and not if we'd be
+ * inserting a newline at or before the beginning of the current line.
+ *
+ * The position at which the newline is inserted is simply wherever the
+ * buffer ended the last time this function was called. In other words,
+ * the caller is expected to call this function every time we reach a good
+ * place for a line break.
+ */
+static void
+pgpa_maybe_linebreak(StringInfo buf, int wrap_column)
+{
+	char	   *trailing_nl;
+	int			line_start;
+	int			save_cursor;
+
+	/* If line wrapping is disabled, exit quickly. */
+	if (wrap_column <= 0)
+		return;
+
+	/*
+	 * Set line_start to the byte offset within buf->data of the first
+	 * character of the current line, where the current line means the last
+	 * one in the buffer. Note that line_start could be the offset of the
+	 * trailing '\0' if the last character in the buffer is a line break.
+	 */
+	trailing_nl = strrchr(buf->data, '\n');
+	if (trailing_nl == NULL)
+		line_start = 0;
+	else
+		line_start = (trailing_nl - buf->data) + 1;
+
+	/*
+	 * Remember that the current end of the buffer is a potential location to
+	 * insert a line break on a future call to this function.
+	 */
+	save_cursor = buf->cursor;
+	buf->cursor = buf->len;
+
+	/* If we haven't passed the wrap column, we don't need a newline. */
+	if (buf->len - line_start <= wrap_column)
+		return;
+
+	/*
+	 * It only makes sense to insert a newline at a position later than the
+	 * beginning of the current line.
+	 */
+	if (save_cursor <= line_start)
+		return;
+
+	/* Insert a newline at the previous cursor location. */
+	enlargeStringInfo(buf, 1);
+	memmove(&buf->data[save_cursor] + 1, &buf->data[save_cursor],
+			buf->len - save_cursor);
+	++buf->cursor;
+	buf->data[++buf->len] = '\0';
+	buf->data[save_cursor] = '\n';
+}
diff --git a/contrib/pg_plan_advice/pgpa_output.h b/contrib/pg_plan_advice/pgpa_output.h
new file mode 100644
index 0000000000..47496d76f5
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_output.h
@@ -0,0 +1,22 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_output.h
+ *	  produce textual output from the results of a plan tree walk
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_output.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_OUTPUT_H
+#define PGPA_OUTPUT_H
+
+#include "pgpa_identifier.h"
+#include "pgpa_walker.h"
+
+extern void pgpa_output_advice(StringInfo buf,
+							   pgpa_plan_walker_context *walker,
+							   pgpa_identifier *rt_identifiers);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_parser.y b/contrib/pg_plan_advice/pgpa_parser.y
new file mode 100644
index 0000000000..4c3a3ed6db
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_parser.y
@@ -0,0 +1,301 @@
+%{
+/*
+ * Parser for plan advice
+ *
+ * Copyright (c) 2000-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_parser.y
+ */
+
+#include "postgres.h"
+
+#include
+#include
+
+#include "fmgr.h"
+#include "nodes/miscnodes.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+
+#include "pgpa_ast.h"
+#include "pgpa_parser.h"
+
+/*
+ * Bison doesn't allocate anything that needs to live across parser calls,
+ * so we can easily have it use palloc instead of malloc. This prevents
+ * memory leaks if we error out during parsing.
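+ *
+ * For orientation, the grammar below accepts advice strings of the same
+ * general shape that pgpa_output.c emits, e.g. (input hypothetical):
+ *
+ *		SEQ_SCAN(a b#2) JOIN_ORDER(a (b {c d}))
+ *
+ * where b#2 denotes the second occurrence of the alias "b".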
+ */
+#define YYMALLOC palloc
+#define YYFREE pfree
+%}
+
+/* BISON Declarations */
+%parse-param {List **result}
+%parse-param {char **parse_error_msg_p}
+%parse-param {yyscan_t yyscanner}
+%lex-param {List **result}
+%lex-param {char **parse_error_msg_p}
+%lex-param {yyscan_t yyscanner}
+%pure-parser
+%expect 0
+%name-prefix="pgpa_yy"
+
+%union
+{
+	char	   *str;
+	int			integer;
+	List	   *list;
+	pgpa_advice_item *item;
+	pgpa_advice_target *target;
+	pgpa_index_target *itarget;
+}
+%token <str> TOK_IDENT TOK_TAG_JOIN_ORDER TOK_TAG_INDEX
+%token <str> TOK_TAG_SIMPLE TOK_TAG_GENERIC
+%token <integer> TOK_INTEGER
+
+%type <integer> opt_ri_occurrence
+%type <item> advice_item
+%type <list> advice_item_list generic_target_list
+%type <list> index_target_list join_order_target_list
+%type <list> opt_partition simple_target_list
+%type <str> identifier opt_plan_name
+%type <target> generic_sublist join_order_sublist
+%type <target> relation_identifier
+%type <itarget> index_name
+
+%start parse_toplevel
+
+/* Grammar follows */
+%%
+
+parse_toplevel: advice_item_list
+				{
+					(void) yynerrs;	/* suppress compiler warning */
+					*result = $1;
+				}
+		;
+
+advice_item_list: advice_item_list advice_item
+				{ $$ = lappend($1, $2); }
+			|
+				{ $$ = NIL; }
+		;
+
+advice_item: TOK_TAG_JOIN_ORDER '(' join_order_target_list ')'
+				{
+					$$ = palloc0_object(pgpa_advice_item);
+					$$->tag = PGPA_TAG_JOIN_ORDER;
+					$$->targets = $3;
+					if ($3 == NIL)
+						pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+									 "JOIN_ORDER must have at least one target");
+				}
+			| TOK_TAG_INDEX '(' index_target_list ')'
+				{
+					$$ = palloc0_object(pgpa_advice_item);
+					if (strcmp($1, "index_only_scan") == 0)
+						$$->tag = PGPA_TAG_INDEX_ONLY_SCAN;
+					else if (strcmp($1, "index_scan") == 0)
+						$$->tag = PGPA_TAG_INDEX_SCAN;
+					else
+						elog(ERROR, "tag parsing failed: %s", $1);
+					$$->targets = $3;
+				}
+			| TOK_TAG_SIMPLE '(' simple_target_list ')'
+				{
+					$$ = palloc0_object(pgpa_advice_item);
+					if (strcmp($1, "bitmap_heap_scan") == 0)
+						$$->tag = PGPA_TAG_BITMAP_HEAP_SCAN;
+					else if (strcmp($1, "no_gather") == 0)
+						$$->tag = PGPA_TAG_NO_GATHER;
+					else if (strcmp($1, "seq_scan") == 0)
+						$$->tag = PGPA_TAG_SEQ_SCAN;
+					else if (strcmp($1, "tid_scan") == 0)
+						$$->tag = PGPA_TAG_TID_SCAN;
+					else
+						elog(ERROR, "tag parsing failed: %s", $1);
+					$$->targets = $3;
+				}
+			| TOK_TAG_GENERIC '(' generic_target_list ')'
+				{
+					bool		fail;
+
+					$$ = palloc0_object(pgpa_advice_item);
+					$$->tag = pgpa_parse_advice_tag($1, &fail);
+					if (fail)
+					{
+						pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+									 "unrecognized advice tag");
+					}
+
+					if ($$->tag == PGPA_TAG_FOREIGN_JOIN)
+					{
+						foreach_ptr(pgpa_advice_target, target, $3)
+						{
+							if (target->ttype == PGPA_TARGET_IDENTIFIER ||
+								list_length(target->children) == 1)
+								pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+											 "FOREIGN_JOIN targets must contain more than one relation identifier");
+						}
+					}
+
+					$$->targets = $3;
+				}
+		;
+
+relation_identifier: identifier opt_ri_occurrence opt_partition opt_plan_name
+				{
+					$$ = palloc0_object(pgpa_advice_target);
+					$$->ttype = PGPA_TARGET_IDENTIFIER;
+					$$->rid.alias_name = $1;
+					$$->rid.occurrence = $2;
+					if (list_length($3) == 2)
+					{
+						$$->rid.partnsp = linitial($3);
+						$$->rid.partrel = lsecond($3);
+					}
+					else if ($3 != NIL)
+						$$->rid.partrel = linitial($3);
+					$$->rid.plan_name = $4;
+				}
+		;
+
+index_name: identifier
+				{
+					$$ = palloc0_object(pgpa_index_target);
+					$$->indname = $1;
+				}
+			| identifier '.'
identifier
+				{
+					$$ = palloc0_object(pgpa_index_target);
+					$$->indnamespace = $1;
+					$$->indname = $3;
+				}
+		;
+
+opt_ri_occurrence:
+			'#' TOK_INTEGER
+				{
+					if ($2 <= 0)
+						pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+									 "only positive occurrence numbers are permitted");
+					$$ = $2;
+				}
+			|
+				{
+					/* The default occurrence number is 1. */
+					$$ = 1;
+				}
+		;
+
+identifier:	TOK_IDENT
+			| TOK_TAG_JOIN_ORDER
+			| TOK_TAG_INDEX
+			| TOK_TAG_SIMPLE
+			| TOK_TAG_GENERIC
+		;
+
+/*
+ * When generating advice, we always schema-qualify the partition name, but
+ * when parsing advice, we accept a specification that lacks one.
+ */
+opt_partition:
+			'/' TOK_IDENT '.' TOK_IDENT
+				{ $$ = list_make2($2, $4); }
+			| '/' TOK_IDENT
+				{ $$ = list_make1($2); }
+			|
+				{ $$ = NIL; }
+		;
+
+opt_plan_name:
+			'@' TOK_IDENT
+				{ $$ = $2; }
+			|
+				{ $$ = NULL; }
+		;
+
+generic_target_list: generic_target_list relation_identifier
+				{ $$ = lappend($1, $2); }
+			| generic_target_list generic_sublist
+				{ $$ = lappend($1, $2); }
+			|
+				{ $$ = NIL; }
+		;
+
+generic_sublist: '(' simple_target_list ')'
+				{
+					$$ = palloc0_object(pgpa_advice_target);
+					$$->ttype = PGPA_TARGET_ORDERED_LIST;
+					$$->children = $2;
+				}
+		;
+
+index_target_list:
+			index_target_list relation_identifier index_name
+				{
+					$2->itarget = $3;
+					$$ = lappend($1, $2);
+				}
+			|
+				{ $$ = NIL; }
+		;
+
+join_order_target_list: join_order_target_list relation_identifier
+				{ $$ = lappend($1, $2); }
+			| join_order_target_list join_order_sublist
+				{ $$ = lappend($1, $2); }
+			|
+				{ $$ = NIL; }
+		;
+
+join_order_sublist:
+			'(' join_order_target_list ')'
+				{
+					$$ = palloc0_object(pgpa_advice_target);
+					$$->ttype = PGPA_TARGET_ORDERED_LIST;
+					$$->children = $2;
+				}
+			| '{' simple_target_list '}'
+				{
+					$$ = palloc0_object(pgpa_advice_target);
+					$$->ttype = PGPA_TARGET_UNORDERED_LIST;
+					$$->children = $2;
+				}
+		;
+
+simple_target_list: simple_target_list relation_identifier
+				{ $$ = lappend($1, $2); }
+			|
+				{ $$ = NIL; }
+		;
+
+%%
+
+/*
+ * Parse an advice_string and return the resulting list of pgpa_advice_item
+ * objects. If a parse error occurs, instead return NULL.
+ *
+ * If the return value is NULL, *error_p will be set to the error message;
+ * otherwise, *error_p will be set to NULL.
+ */
+List *
+pgpa_parse(const char *advice_string, char **error_p)
+{
+	yyscan_t	scanner;
+	List	   *result;
+	char	   *error = NULL;
+
+	pgpa_scanner_init(advice_string, &scanner);
+	pgpa_yyparse(&result, &error, scanner);
+	pgpa_scanner_finish(scanner);
+
+	if (error != NULL)
+	{
+		*error_p = error;
+		return NULL;
+	}
+
+	*error_p = NULL;
+	return result;
+}
diff --git a/contrib/pg_plan_advice/pgpa_planner.c b/contrib/pg_plan_advice/pgpa_planner.c
new file mode 100644
index 0000000000..3fc9127a99
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_planner.c
@@ -0,0 +1,2166 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_planner.c
+ *	  Use planner hooks to observe and modify planner behavior
+ *
+ * All interaction with the core planner happens here. Much of it has to
+ * do with enforcing supplied advice, but we also need these hooks to
+ * generate advice strings (though the heavy lifting in that case is
+ * mostly done by pgpa_walker.c).
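+ *
+ * In rough outline (a sketch of the flow, not a contract): the planner
+ * setup hook runs once per planning cycle, the build_simple_rel and
+ * join-related hooks fire while the planner constructs base and join
+ * relations, and the planner shutdown hook runs once a PlannedStmt is
+ * available.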
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_planner.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pg_plan_advice.h"
+#include "pgpa_collector.h"
+#include "pgpa_identifier.h"
+#include "pgpa_output.h"
+#include "pgpa_planner.h"
+#include "pgpa_trove.h"
+#include "pgpa_walker.h"
+
+#include "commands/defrem.h"
+#include "common/hashfn_unstable.h"
+#include "nodes/makefuncs.h"
+#include "optimizer/extendplan.h"
+#include "optimizer/pathnode.h"
+#include "optimizer/paths.h"
+#include "optimizer/plancat.h"
+#include "optimizer/planner.h"
+#include "parser/parsetree.h"
+#include "utils/lsyscache.h"
+
+#ifdef USE_ASSERT_CHECKING
+
+/*
+ * When assertions are enabled, we try generating relation identifiers during
+ * planning, saving them in a hash table, and then cross-checking them against
+ * the ones generated after planning is complete.
+ */
+typedef struct pgpa_ri_checker_key
+{
+	char	   *plan_name;
+	Index		rti;
+} pgpa_ri_checker_key;
+
+typedef struct pgpa_ri_checker
+{
+	pgpa_ri_checker_key key;
+	uint32		status;
+	const char *rid_string;
+} pgpa_ri_checker;
+
+static uint32 pgpa_ri_checker_hash_key(pgpa_ri_checker_key key);
+
+static inline bool
+pgpa_ri_checker_compare_key(pgpa_ri_checker_key a, pgpa_ri_checker_key b)
+{
+	if (a.rti != b.rti)
+		return false;
+	if (a.plan_name == NULL)
+		return (b.plan_name == NULL);
+	if (b.plan_name == NULL)
+		return false;
+	return strcmp(a.plan_name, b.plan_name) == 0;
+}
+
+#define SH_PREFIX		pgpa_ri_check
+#define SH_ELEMENT_TYPE	pgpa_ri_checker
+#define SH_KEY_TYPE		pgpa_ri_checker_key
+#define SH_KEY			key
+#define SH_HASH_KEY(tb, key) pgpa_ri_checker_hash_key(key)
+#define SH_EQUAL(tb, a, b)	pgpa_ri_checker_compare_key(a, b)
+#define SH_SCOPE		static inline
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+#endif
+
+typedef enum pgpa_jo_outcome
+{
+	PGPA_JO_PERMITTED,			/* permit this join order */
+	PGPA_JO_DENIED,				/* deny this join order */
+	PGPA_JO_INDIFFERENT			/* do neither */
+} pgpa_jo_outcome;
+
+typedef struct pgpa_planner_state
+{
+	ExplainState *explain_state;
+	bool		generate_advice_feedback;
+	bool		generate_advice_string;
+	pgpa_trove *trove;
+	MemoryContext trove_cxt;
+	List	   *sj_unique_rels;
+
+#ifdef USE_ASSERT_CHECKING
+	pgpa_ri_check_hash *ri_check_hash;
+#endif
+} pgpa_planner_state;
+
+typedef struct pgpa_join_state
+{
+	/* Most-recently-considered outer rel. */
+	RelOptInfo *outerrel;
+
+	/* Most-recently-considered inner rel. */
+	RelOptInfo *innerrel;
+
+	/*
+	 * Array of relation identifiers for all members of this joinrel, with
+	 * outerrel identifiers before innerrel identifiers.
+	 */
+	pgpa_identifier *rids;
+
+	/* Number of outer rel identifiers. */
+	int			outer_count;
+
+	/* Number of inner rel identifiers. */
+	int			inner_count;
+
+	/*
+	 * Trove lookup results.
+	 *
+	 * join_entries and rel_entries are arrays of entries, and join_indexes
+	 * and rel_indexes are the integer offsets within those arrays of entries
+	 * potentially relevant to us.  The "join" fields correspond to a lookup
+	 * using PGPA_TROVE_LOOKUP_JOIN and the "rel" fields to a lookup using
+	 * PGPA_TROVE_LOOKUP_REL.
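+	 *
+	 * For example (illustrative sketch; handle_entry is hypothetical), the
+	 * relevant join entries can be visited with:
+	 *
+	 *		i = -1;
+	 *		while ((i = bms_next_member(pjs->join_indexes, i)) >= 0)
+	 *			handle_entry(&pjs->join_entries[i]);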
+	 */
+	pgpa_trove_entry *join_entries;
+	Bitmapset  *join_indexes;
+	pgpa_trove_entry *rel_entries;
+	Bitmapset  *rel_indexes;
+} pgpa_join_state;
+
+/* Saved hook values */
+static build_simple_rel_hook_type prev_build_simple_rel = NULL;
+static join_path_setup_hook_type prev_join_path_setup = NULL;
+static joinrel_setup_hook_type prev_joinrel_setup = NULL;
+static planner_setup_hook_type prev_planner_setup = NULL;
+static planner_shutdown_hook_type prev_planner_shutdown = NULL;
+
+/* Other global variables */
+static int	planner_extension_id = -1;
+
+/* Function prototypes. */
+static void pgpa_planner_setup(PlannerGlobal *glob, Query *parse,
+							   const char *query_string,
+							   int cursorOptions,
+							   double *tuple_fraction,
+							   ExplainState *es);
+static void pgpa_planner_shutdown(PlannerGlobal *glob, Query *parse,
+								  const char *query_string, PlannedStmt *pstmt);
+static void pgpa_build_simple_rel(PlannerInfo *root,
+								  RelOptInfo *rel,
+								  RangeTblEntry *rte);
+static void pgpa_joinrel_setup(PlannerInfo *root,
+							   RelOptInfo *joinrel,
+							   RelOptInfo *outerrel,
+							   RelOptInfo *innerrel,
+							   SpecialJoinInfo *sjinfo,
+							   List *restrictlist);
+static void pgpa_join_path_setup(PlannerInfo *root,
+								 RelOptInfo *joinrel,
+								 RelOptInfo *outerrel,
+								 RelOptInfo *innerrel,
+								 JoinType jointype,
+								 JoinPathExtraData *extra);
+static pgpa_join_state *pgpa_get_join_state(PlannerInfo *root,
+											RelOptInfo *joinrel,
+											RelOptInfo *outerrel,
+											RelOptInfo *innerrel);
+static void pgpa_planner_apply_joinrel_advice(uint64 *pgs_mask_p,
+											  char *plan_name,
+											  pgpa_join_state *pjs);
+static void pgpa_planner_apply_join_path_advice(JoinType jointype,
+												uint64 *pgs_mask_p,
+												char *plan_name,
+												pgpa_join_state *pjs);
+static void pgpa_planner_apply_scan_advice(RelOptInfo *rel,
+										   pgpa_trove_entry *scan_entries,
+										   Bitmapset *scan_indexes,
+										   pgpa_trove_entry *rel_entries,
+										   Bitmapset *rel_indexes);
+static uint64 pgpa_join_strategy_mask_from_advice_tag(pgpa_advice_tag_type tag);
+static pgpa_jo_outcome pgpa_join_order_permits_join(int outer_count,
+													int inner_count,
+													pgpa_identifier *rids,
+													pgpa_trove_entry *entry);
+static bool pgpa_join_method_permits_join(int outer_count, int inner_count,
+										  pgpa_identifier *rids,
+										  pgpa_trove_entry *entry,
+										  bool *restrict_method);
+static bool pgpa_opaque_join_permits_join(int outer_count, int inner_count,
+										  pgpa_identifier *rids,
+										  pgpa_trove_entry *entry,
+										  bool *restrict_method);
+static bool pgpa_semijoin_permits_join(int outer_count, int inner_count,
+									   pgpa_identifier *rids,
+									   pgpa_trove_entry *entry,
+									   bool outer_side_nullable,
+									   bool *restrict_method);
+
+static List *pgpa_planner_append_feedback(List *list, pgpa_trove *trove,
+										  pgpa_trove_lookup_type type,
+										  pgpa_identifier *rt_identifiers,
+										  pgpa_plan_walker_context *walker);
+static void pgpa_planner_feedback_warning(List *feedback);
+
+static inline void pgpa_ri_checker_save(pgpa_planner_state *pps,
+										PlannerInfo *root,
+										RelOptInfo *rel);
+static void pgpa_ri_checker_validate(pgpa_planner_state *pps,
+									 PlannedStmt *pstmt);
+
+static char *pgpa_bms_to_cstring(Bitmapset *bms);
+static const char *pgpa_jointype_to_cstring(JoinType jointype);
+
+/*
+ * Install planner-related hooks.
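+ *
+ * Each hook saves the previous hook pointer and chains to it, so that we
+ * can coexist with other extensions that use the same hooks.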
+ */
+void
+pgpa_planner_install_hooks(void)
+{
+	planner_extension_id = GetPlannerExtensionId("pg_plan_advice");
+	prev_planner_setup = planner_setup_hook;
+	planner_setup_hook = pgpa_planner_setup;
+	prev_planner_shutdown = planner_shutdown_hook;
+	planner_shutdown_hook = pgpa_planner_shutdown;
+	prev_build_simple_rel = build_simple_rel_hook;
+	build_simple_rel_hook = pgpa_build_simple_rel;
+	prev_joinrel_setup = joinrel_setup_hook;
+	joinrel_setup_hook = pgpa_joinrel_setup;
+	prev_join_path_setup = join_path_setup_hook;
+	join_path_setup_hook = pgpa_join_path_setup;
+}
+
+/*
+ * Carry out whatever setup work we need to do before planning.
+ */
+static void
+pgpa_planner_setup(PlannerGlobal *glob, Query *parse, const char *query_string,
+				   int cursorOptions, double *tuple_fraction,
+				   ExplainState *es)
+{
+	pgpa_trove *trove = NULL;
+	pgpa_planner_state *pps;
+	char	   *supplied_advice;
+	bool		generate_advice_feedback = false;
+	bool		generate_advice_string = false;
+	bool		needs_pps = false;
+
+	/*
+	 * Decide whether we need to generate an advice string.  We must do this
+	 * if the user has told us to do it categorically, or if at least one
+	 * collector is enabled, or if the user has requested it using the EXPLAIN
+	 * (PLAN_ADVICE) option.
+	 */
+	generate_advice_string = (pg_plan_advice_always_store_advice_details ||
+							  pg_plan_advice_local_collector ||
+							  pg_plan_advice_shared_collector ||
+							  pg_plan_advice_should_explain(es));
+	if (generate_advice_string)
+		needs_pps = true;
+
+	/*
+	 * If any advice was provided, build a trove of advice for use during
+	 * planning.
+	 */
+	supplied_advice = pg_plan_advice_get_supplied_query_advice(glob, parse,
+															   query_string,
+															   cursorOptions,
+															   es);
+	if (supplied_advice != NULL && supplied_advice[0] != '\0')
+	{
+		List	   *advice_items;
+		char	   *error;
+
+		/*
+		 * If the supplied advice string comes from pg_plan_advice.advice,
+		 * parsing shouldn't fail here, because we must have previously parsed
+		 * successfully in pg_plan_advice_advice_check_hook.  However, it
+		 * might also come from a hook registered via
+		 * pg_plan_advice_add_advisor, and we can't be sure whether that's
+		 * valid.  (Plus, having an error check here seems like a good idea
+		 * anyway, just for safety.)
+		 */
+		advice_items = pgpa_parse(supplied_advice, &error);
+		if (error)
+			ereport(WARNING,
+					errmsg("could not parse supplied advice: %s", error));
+
+		/*
+		 * It's possible that the advice string was non-empty but contained no
+		 * actual advice, e.g. it was all whitespace.
+		 */
+		if (advice_items != NIL)
+		{
+			trove = pgpa_build_trove(advice_items);
+			needs_pps = true;
+
+			/*
+			 * If we know that we're running under EXPLAIN, or if the user has
+			 * told us to always do the work, generate advice feedback.
+			 */
+			if (es != NULL || pg_plan_advice_feedback_warnings ||
+				pg_plan_advice_always_store_advice_details)
+				generate_advice_feedback = true;
+		}
+	}
+
+#ifdef USE_ASSERT_CHECKING
+
+	/*
+	 * If asserts are enabled, always build a private state object for
+	 * cross-checks.
+	 */
+	needs_pps = true;
+#endif
+
+	/*
+	 * We only create and initialize a private state object if it's needed for
+	 * some purpose.  That could be (1) recording that we will need to
+	 * generate an advice string, (2) storing a trove of supplied advice, or
+	 * (3) facilitating debugging cross-checks when asserts are enabled.
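+	 *
+	 * (If none of those apply, we set no extension state at all; the other
+	 * hooks treat a NULL lookup result as meaning there is nothing to do.)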
+ */ + if (needs_pps) + { + pps = palloc0_object(pgpa_planner_state); + pps->explain_state = es; + pps->generate_advice_feedback = generate_advice_feedback; + pps->generate_advice_string = generate_advice_string; + pps->trove = trove; +#ifdef USE_ASSERT_CHECKING + pps->ri_check_hash = + pgpa_ri_check_create(CurrentMemoryContext, 1024, NULL); +#endif + SetPlannerGlobalExtensionState(glob, planner_extension_id, pps); + } +} + +/* + * Carry out whatever work we want to do after planning is complete. + */ +static void +pgpa_planner_shutdown(PlannerGlobal *glob, Query *parse, + const char *query_string, PlannedStmt *pstmt) +{ + pgpa_planner_state *pps; + pgpa_trove *trove = NULL; + pgpa_plan_walker_context walker = {0}; /* placate compiler */ + bool generate_advice_feedback = false; + bool generate_advice_string = false; + List *pgpa_items = NIL; + pgpa_identifier *rt_identifiers = NULL; + + /* Fetch our private state, set up by pgpa_planner_setup(). */ + pps = GetPlannerGlobalExtensionState(glob, planner_extension_id); + if (pps != NULL) + { + trove = pps->trove; + generate_advice_feedback = pps->generate_advice_feedback; + generate_advice_string = pps->generate_advice_string; + } + + /* + * If we're trying to generate an advice string or if we're trying to + * provide advice feedback, then we will need to create range table + * identifiers. + */ + if (generate_advice_string || generate_advice_feedback) + { + pgpa_plan_walker(&walker, pstmt, pps->sj_unique_rels); + rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt); + } + + /* Generate the advice string, if we need to do so. */ + if (generate_advice_string) + { + char *advice_string; + StringInfoData buf; + + /* Generate a textual advice string. */ + initStringInfo(&buf); + pgpa_output_advice(&buf, &walker, rt_identifiers); + advice_string = buf.data; + + /* If the advice string is empty, don't bother collecting it. */ + if (advice_string[0] != '\0') + pgpa_collect_advice(pstmt->queryId, query_string, advice_string); + + /* Save the advice string in the final plan. */ + pgpa_items = lappend(pgpa_items, + makeDefElem("advice_string", + (Node *) makeString(advice_string), + -1)); + } + + /* + * If we're trying to provide advice feedback, then we will need to + * analyze how successful the advice was. + */ + if (generate_advice_feedback) + { + List *feedback = NIL; + + /* + * Inject a Node-tree representation of all the trove-entry flags into + * the PlannedStmt. + */ + feedback = pgpa_planner_append_feedback(feedback, + trove, + PGPA_TROVE_LOOKUP_SCAN, + rt_identifiers, &walker); + feedback = pgpa_planner_append_feedback(feedback, + trove, + PGPA_TROVE_LOOKUP_JOIN, + rt_identifiers, &walker); + feedback = pgpa_planner_append_feedback(feedback, + trove, + PGPA_TROVE_LOOKUP_REL, + rt_identifiers, &walker); + + pgpa_items = lappend(pgpa_items, makeDefElem("feedback", + (Node *) feedback, -1)); + + /* If we were asked to generate feedback warnings, do so. */ + if (pg_plan_advice_feedback_warnings) + pgpa_planner_feedback_warning(feedback); + } + + /* Push whatever data we're saving into the PlannedStmt. */ + if (pgpa_items != NIL) + pstmt->extension_state = + lappend(pstmt->extension_state, + makeDefElem("pg_plan_advice", (Node *) pgpa_items, -1)); + + /* + * If assertions are enabled, cross-check the generated range table + * identifiers. + */ + if (pps != NULL) + pgpa_ri_checker_validate(pps, pstmt); +} + +/* + * Hook function for build_simple_rel(). 
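+ * (reached once as each base relation's RelOptInfo is constructed)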
+ *
+ * We can apply scan advice at this point, and we also use this as an
+ * opportunity to do range-table identifier cross-checking in assert-enabled
+ * builds.
+ */
+static void
+pgpa_build_simple_rel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
+{
+	pgpa_planner_state *pps;
+
+	/* Fetch our private state, set up by pgpa_planner_setup(). */
+	pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id);
+
+	/* Save details needed for range table identifier cross-checking. */
+	if (pps != NULL)
+		pgpa_ri_checker_save(pps, root, rel);
+
+	/* If query advice was provided, search for relevant entries. */
+	if (pps != NULL && pps->trove != NULL)
+	{
+		pgpa_identifier rid;
+		pgpa_trove_result tresult_scan;
+		pgpa_trove_result tresult_rel;
+
+		/* Search for scan advice and general rel advice. */
+		pgpa_compute_identifier_by_rti(root, rel->relid, &rid);
+		pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_SCAN, 1, &rid,
+						  &tresult_scan);
+		pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_REL, 1, &rid,
+						  &tresult_rel);
+
+		/* If relevant entries were found, apply them. */
+		if (tresult_scan.indexes != NULL || tresult_rel.indexes != NULL)
+		{
+			uint64		original_mask = rel->pgs_mask;
+
+			pgpa_planner_apply_scan_advice(rel,
+										   tresult_scan.entries,
+										   tresult_scan.indexes,
+										   tresult_rel.entries,
+										   tresult_rel.indexes);
+
+			/* Emit debugging message, if enabled. */
+			if (pg_plan_advice_trace_mask && original_mask != rel->pgs_mask)
+				ereport(WARNING,
+						(errmsg("strategy mask for RTI %u changed from 0x%" PRIx64 " to 0x%" PRIx64,
+								rel->relid, original_mask, rel->pgs_mask)));
+		}
+	}
+
+	/* Pass call to previous hook. */
+	if (prev_build_simple_rel)
+		(*prev_build_simple_rel) (root, rel, rte);
+}
+
+/*
+ * Enforce any provided advice that is relevant to any method of implementing
+ * this join.
+ *
+ * Although we're passed the outerrel and innerrel here, those are just
+ * whatever values happened to prompt the creation of this joinrel; they
+ * shouldn't really influence our choice of what advice to apply.
+ */
+static void
+pgpa_joinrel_setup(PlannerInfo *root, RelOptInfo *joinrel,
+				   RelOptInfo *outerrel, RelOptInfo *innerrel,
+				   SpecialJoinInfo *sjinfo, List *restrictlist)
+{
+	pgpa_join_state *pjs;
+
+	Assert(bms_membership(joinrel->relids) == BMS_MULTIPLE);
+
+	/* Get our private state information for this join. */
+	pjs = pgpa_get_join_state(root, joinrel, outerrel, innerrel);
+
+	/* If there is relevant advice, call a helper function to apply it. */
+	if (pjs != NULL)
+	{
+		uint64		original_mask = joinrel->pgs_mask;
+
+		pgpa_planner_apply_joinrel_advice(&joinrel->pgs_mask,
+										  root->plan_name,
+										  pjs);
+
+		/* Emit debugging message, if enabled. */
+		if (pg_plan_advice_trace_mask && original_mask != joinrel->pgs_mask)
+			ereport(WARNING,
+					(errmsg("strategy mask for join on RTIs %s changed from 0x%" PRIx64 " to 0x%" PRIx64,
+							pgpa_bms_to_cstring(joinrel->relids),
+							original_mask,
+							joinrel->pgs_mask)));
+	}
+
+	/* Pass call to previous hook. */
+	if (prev_joinrel_setup)
+		(*prev_joinrel_setup) (root, joinrel, outerrel, innerrel,
+							   sjinfo, restrictlist);
+}
+
+/*
+ * Enforce any provided advice that is relevant to this particular method of
+ * implementing this particular join.
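+ *
+ * For example (illustrative), HASH_JOIN((x y)) is enforced at this level,
+ * since it constrains a particular pairing of outer and inner rels, whereas
+ * advice such as GATHER((x y)), which applies to the joinrel as a whole, is
+ * handled in pgpa_joinrel_setup().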
+ */ +static void +pgpa_join_path_setup(PlannerInfo *root, RelOptInfo *joinrel, + RelOptInfo *outerrel, RelOptInfo *innerrel, + JoinType jointype, JoinPathExtraData *extra) +{ + pgpa_join_state *pjs; + + Assert(bms_membership(joinrel->relids) == BMS_MULTIPLE); + + /* + * If we're considering implementing a semijoin by making one side unique, + * make a note of it in the pgpa_planner_state. See comments for + * pgpa_sj_unique_rel for why we do this. + */ + if (jointype == JOIN_UNIQUE_OUTER || jointype == JOIN_UNIQUE_INNER) + { + pgpa_planner_state *pps; + RelOptInfo *uniquerel; + + uniquerel = jointype == JOIN_UNIQUE_OUTER ? outerrel : innerrel; + pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id); + if (pps != NULL && + (pps->generate_advice_string || pps->generate_advice_feedback)) + { + bool found = false; + + /* Avoid adding duplicates. */ + foreach_ptr(pgpa_sj_unique_rel, ur, pps->sj_unique_rels) + { + /* + * We should always use the same pointer for the same plan + * name, so we need not use strcmp() here. + */ + if (root->plan_name == ur->plan_name && + bms_equal(uniquerel->relids, ur->relids)) + { + found = true; + break; + } + } + + /* If not a duplicate, append to the list. */ + if (!found) + { + pgpa_sj_unique_rel *ur = palloc_object(pgpa_sj_unique_rel); + + ur->plan_name = root->plan_name; + ur->relids = uniquerel->relids; + pps->sj_unique_rels = lappend(pps->sj_unique_rels, ur); + } + } + } + + /* Get our private state information for this join. */ + pjs = pgpa_get_join_state(root, joinrel, outerrel, innerrel); + + /* If there is relevant advice, call a helper function to apply it. */ + if (pjs != NULL) + { + uint64 original_mask = extra->pgs_mask; + + pgpa_planner_apply_join_path_advice(jointype, + &extra->pgs_mask, + root->plan_name, + pjs); + + /* Emit debugging message, if enabled. */ + if (pg_plan_advice_trace_mask && original_mask != extra->pgs_mask) + ereport(WARNING, + (errmsg("strategy mask for %s join on %s with outer %s and inner %s changed from 0x%" PRIx64 " to 0x%" PRIx64, + pgpa_jointype_to_cstring(jointype), + pgpa_bms_to_cstring(joinrel->relids), + pgpa_bms_to_cstring(outerrel->relids), + pgpa_bms_to_cstring(innerrel->relids), + original_mask, + extra->pgs_mask))); + } + + /* Pass call to previous hook. */ + if (prev_join_path_setup) + (*prev_join_path_setup) (root, joinrel, outerrel, innerrel, + jointype, extra); +} + +/* + * Search for advice pertaining to a proposed join. + */ +static pgpa_join_state * +pgpa_get_join_state(PlannerInfo *root, RelOptInfo *joinrel, + RelOptInfo *outerrel, RelOptInfo *innerrel) +{ + pgpa_planner_state *pps; + pgpa_join_state *pjs; + bool new_pjs = false; + + /* Fetch our private state, set up by pgpa_planner_setup(). */ + pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id); + if (pps == NULL || pps->trove == NULL) + { + /* No advice applies to this query, hence none to this joinrel. */ + return NULL; + } + + /* + * See whether we've previously associated a pgpa_join_state with this + * joinrel. If we have not, we need to try to construct one. If we have, + * then there are two cases: (a) if innerrel and outerrel are unchanged, + * we can simply use it, and (b) if they have changed, we need to rejigger + * the array of identifiers but can still skip the trove lookup. 
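+	 *
+	 * (Case (b) is common, because we are reached once for each candidate
+	 * outerrel/innerrel split of the same joinrel.)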
+ */ + pjs = GetRelOptInfoExtensionState(joinrel, planner_extension_id); + if (pjs != NULL) + { + if (pjs->join_indexes == NULL && pjs->rel_indexes == NULL) + { + /* + * If there's no potentially relevant advice, then the presence of + * this pgpa_join_state acts like a negative cache entry: it tells + * us not to bother searching the trove for advice, because we + * will not find any. + */ + return NULL; + } + + if (pjs->outerrel == outerrel && pjs->innerrel == innerrel) + { + /* No updates required, so just return. */ + /* XXX. Does this need to do something different under GEQO? */ + return pjs; + } + } + + /* + * If there's no pgpa_join_state yet, we need to allocate one. Trove keys + * will not get built for RTE_JOIN RTEs, so the array may end up being + * larger than needed. It's not worth trying to compute a perfectly + * accurate count here. + */ + if (pjs == NULL) + { + int pessimistic_count = bms_num_members(joinrel->relids); + + pjs = palloc0_object(pgpa_join_state); + pjs->rids = palloc_array(pgpa_identifier, pessimistic_count); + new_pjs = true; + } + + /* + * Either we just allocated a new pgpa_join_state, or the existing one + * needs reconfiguring for a new innerrel and outerrel. The required array + * size can't change, so we can overwrite the existing one. + */ + pjs->outerrel = outerrel; + pjs->innerrel = innerrel; + pjs->outer_count = + pgpa_compute_identifiers_by_relids(root, outerrel->relids, pjs->rids); + pjs->inner_count = + pgpa_compute_identifiers_by_relids(root, innerrel->relids, + pjs->rids + pjs->outer_count); + + /* + * If we allocated a new pgpa_join_state, search our trove of advice for + * relevant entries. The trove lookup will return the same results for + * every outerrel/innerrel combination, so we don't need to repeat that + * work every time. + */ + if (new_pjs) + { + pgpa_trove_result tresult; + + /* Find join entries. */ + pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_JOIN, + pjs->outer_count + pjs->inner_count, + pjs->rids, &tresult); + pjs->join_entries = tresult.entries; + pjs->join_indexes = tresult.indexes; + + /* Find rel entries. */ + pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_REL, + pjs->outer_count + pjs->inner_count, + pjs->rids, &tresult); + pjs->rel_entries = tresult.entries; + pjs->rel_indexes = tresult.indexes; + + /* Now that the new pgpa_join_state is fully valid, save a pointer. */ + SetRelOptInfoExtensionState(joinrel, planner_extension_id, pjs); + + /* + * If there was no relevant advice found, just return NULL. This + * pgpa_join_state will stick around as a sort of negative cache + * entry, so that future calls for this same joinrel quickly return + * NULL. + */ + if (pjs->join_indexes == NULL && pjs->rel_indexes == NULL) + return NULL; + } + + return pjs; +} + +/* + * Enforce overall restrictions on a join relation that apply uniformly + * regardless of the choice of inner and outer rel. + */ +static void +pgpa_planner_apply_joinrel_advice(uint64 *pgs_mask_p, char *plan_name, + pgpa_join_state *pjs) +{ + int i = -1; + int flags; + bool gather_conflict = false; + uint64 gather_mask = 0; + Bitmapset *gather_partial_match = NULL; + Bitmapset *gather_full_match = NULL; + bool partitionwise_conflict = false; + int partitionwise_outcome = 0; + Bitmapset *partitionwise_partial_match = NULL; + Bitmapset *partitionwise_full_match = NULL; + + /* Iterate over all possibly-relevant advice. 
*/
	while ((i = bms_next_member(pjs->rel_indexes, i)) >= 0)
	{
		pgpa_trove_entry *entry = &pjs->rel_entries[i];
		pgpa_itm_type itm;
		bool		full_match = false;
		uint64		my_gather_mask = 0;
		int			my_partitionwise_outcome = 0;	/* >0 yes, <0 no */

		/*
		 * For GATHER and GATHER_MERGE, if the specified relations exactly
		 * match this joinrel, do whatever the advice says; otherwise, don't
		 * allow Gather or Gather Merge at this level.  For NO_GATHER, there
		 * must be a single target relation which must be included in this
		 * joinrel, so just don't allow Gather or Gather Merge here, full
		 * stop.
		 */
		if (entry->tag == PGPA_TAG_NO_GATHER)
		{
			my_gather_mask = PGS_CONSIDER_NONPARTIAL;
			full_match = true;
		}
		else
		{
			int			total_count;

			total_count = pjs->outer_count + pjs->inner_count;
			itm = pgpa_identifiers_match_target(total_count, pjs->rids,
												entry->target);
			Assert(itm != PGPA_ITM_DISJOINT);

			if (itm == PGPA_ITM_EQUAL)
			{
				full_match = true;
				if (entry->tag == PGPA_TAG_PARTITIONWISE)
					my_partitionwise_outcome = 1;
				else if (entry->tag == PGPA_TAG_GATHER)
					my_gather_mask = PGS_GATHER;
				else if (entry->tag == PGPA_TAG_GATHER_MERGE)
					my_gather_mask = PGS_GATHER_MERGE;
				else
					elog(ERROR, "unexpected advice tag: %d",
						 (int) entry->tag);
			}
			else
			{
				/*
				 * If specified relations don't exactly match this joinrel,
				 * then we should do the opposite of whatever the advice says.
				 * For instance, if we have PARTITIONWISE((a b c)) or
				 * GATHER((a b c)) and this joinrel covers {a, b} or {a, b, c,
				 * d} or {a, d}, we shouldn't plan it partitionwise or put a
				 * Gather or Gather Merge on it here.
				 *
				 * Also, we can't put a Gather or Gather Merge at this level
				 * if there is PARTITIONWISE advice that overlaps with it,
				 * unless the PARTITIONWISE advice covers a subset of the
				 * relations in the joinrel.  To continue the previous
				 * example, PARTITIONWISE((a b c)) is logically incompatible
				 * with GATHER((a b)) or GATHER((a d)), but not with
				 * GATHER((a b c d)).
				 *
				 * Conversely, we can't proceed partitionwise at this level if
				 * there is overlapping GATHER or GATHER_MERGE advice, unless
				 * that advice covers a superset of the relations in this
				 * joinrel.  This is just the flip side of the preceding
				 * point.
				 */
				if (entry->tag == PGPA_TAG_PARTITIONWISE)
				{
					my_partitionwise_outcome = -1;
					if (itm != PGPA_ITM_TARGETS_ARE_SUBSET)
						my_gather_mask = PGS_CONSIDER_NONPARTIAL;
				}
				else if (entry->tag == PGPA_TAG_GATHER ||
						 entry->tag == PGPA_TAG_GATHER_MERGE)
				{
					my_gather_mask = PGS_CONSIDER_NONPARTIAL;
					if (itm != PGPA_ITM_KEYS_ARE_SUBSET)
						my_partitionwise_outcome = -1;
				}
				else
					elog(ERROR, "unexpected advice tag: %d",
						 (int) entry->tag);
			}
		}

		/*
		 * If we set my_gather_mask up above, then we (1) make a note if the
		 * advice conflicted, (2) remember the mask value, and (3) remember
		 * whether this was a full or partial match.
		 */
		if (my_gather_mask != 0)
		{
			if (gather_mask != 0 && gather_mask != my_gather_mask)
				gather_conflict = true;
			gather_mask = my_gather_mask;
			if (full_match)
				gather_full_match = bms_add_member(gather_full_match, i);
			else
				gather_partial_match = bms_add_member(gather_partial_match, i);
		}

		/*
		 * Likewise, if we set my_partitionwise_outcome up above, then we (1)
		 * make a note if the advice conflicted, (2) remember what the desired
		 * outcome was, and (3) remember whether this was a full or partial
		 * match.
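+		 *
+		 * (For example, given both PARTITIONWISE((a b)) and
+		 * PARTITIONWISE((a b c)), a joinrel over {a, b} matches the first
+		 * fully and the second only partially, producing opposite outcomes;
+		 * we record the conflict and enforce neither.)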
+		 */
+		if (my_partitionwise_outcome != 0)
+		{
+			if (partitionwise_outcome != 0 &&
+				partitionwise_outcome != my_partitionwise_outcome)
+				partitionwise_conflict = true;
+			partitionwise_outcome = my_partitionwise_outcome;
+			if (full_match)
+				partitionwise_full_match =
+					bms_add_member(partitionwise_full_match, i);
+			else
+				partitionwise_partial_match =
+					bms_add_member(partitionwise_partial_match, i);
+		}
+	}
+
+	/*
+	 * Mark every Gather-related piece of advice as partially matched, and if
+	 * the set of targets exactly matched this relation, fully matched.  If
+	 * there was a conflict, mark them all as conflicting.
+	 */
+	flags = PGPA_TE_MATCH_PARTIAL;
+	if (gather_conflict)
+		flags |= PGPA_TE_CONFLICTING;
+	pgpa_trove_set_flags(pjs->rel_entries, gather_partial_match, flags);
+	flags |= PGPA_TE_MATCH_FULL;
+	pgpa_trove_set_flags(pjs->rel_entries, gather_full_match, flags);
+
+	/* Likewise for partitionwise advice. */
+	flags = PGPA_TE_MATCH_PARTIAL;
+	if (partitionwise_conflict)
+		flags |= PGPA_TE_CONFLICTING;
+	pgpa_trove_set_flags(pjs->rel_entries, partitionwise_partial_match, flags);
+	flags |= PGPA_TE_MATCH_FULL;
+	pgpa_trove_set_flags(pjs->rel_entries, partitionwise_full_match, flags);
+
+	/*
+	 * Enforce restrictions on the Gather/Gather Merge.  Only clear bits here,
+	 * so that we still respect the enable_* GUCs.  Do nothing if the advice
+	 * conflicts.
+	 */
+	if (gather_mask != 0 && !gather_conflict)
+	{
+		uint64		all_gather_mask;
+
+		all_gather_mask =
+			PGS_GATHER | PGS_GATHER_MERGE | PGS_CONSIDER_NONPARTIAL;
+		*pgs_mask_p &= ~(all_gather_mask & ~gather_mask);
+	}
+
+	/*
+	 * As above, but for partitionwise advice.
+	 *
+	 * To induce a partitionwise join, we disable all the ordinary means of
+	 * performing a join, so that an Append or MergeAppend path will hopefully
+	 * be chosen.
+	 *
+	 * To prevent one, we just disable Append and MergeAppend.  Note that we
+	 * must not unset PGS_CONSIDER_PARTITIONWISE even when we don't want a
+	 * partitionwise join here, because we might want one at a higher level
+	 * that is constructed using paths from this level.
+	 */
+	if (partitionwise_outcome != 0 && !partitionwise_conflict)
+	{
+		if (partitionwise_outcome > 0)
+			*pgs_mask_p = (*pgs_mask_p & ~PGS_JOIN_ANY);
+		else
+			*pgs_mask_p &= ~(PGS_APPEND | PGS_MERGE_APPEND);
+	}
+}
+
+/*
+ * Enforce restrictions on the join order or join method.
+ */
+static void
+pgpa_planner_apply_join_path_advice(JoinType jointype, uint64 *pgs_mask_p,
+									char *plan_name,
+									pgpa_join_state *pjs)
+{
+	int			i = -1;
+	Bitmapset  *jo_permit_indexes = NULL;
+	Bitmapset  *jo_deny_indexes = NULL;
+	Bitmapset  *jo_deny_rel_indexes = NULL;
+	Bitmapset  *jm_indexes = NULL;
+	bool		jm_conflict = false;
+	uint32		join_mask = 0;
+	Bitmapset  *sj_permit_indexes = NULL;
+	Bitmapset  *sj_deny_indexes = NULL;
+
+	/*
+	 * Reconsider PARTITIONWISE(...) advice.
+	 *
+	 * We already thought about this for the joinrel as a whole, but in some
+	 * cases, partitionwise advice can also constrain the join order.  For
+	 * instance, if the advice says PARTITIONWISE((t1 t2)), we shouldn't build
+	 * join paths for any joinrel that includes t1 or t2 unless it also
+	 * includes the other.  In general, the partitionwise operation must have
+	 * already been completed within one side of the current join or the
+	 * other, else the join order is impermissible.
+	 *
+	 * NB: It might seem tempting to try to deal with PARTITIONWISE advice
+	 * entirely in this function, but that doesn't work.
Here, we can only + * affect the pgs_mask within a particular JoinPathExtraData, that is, for + * a particular choice of innerrel and outerrel. Partitionwise paths are + * not built that way, so we must set pgs_mask for the RelOptInfo, which + * is best done in pgpa_planner_apply_joinrel_advice. + */ + while ((i = bms_next_member(pjs->rel_indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &pjs->rel_entries[i]; + pgpa_itm_type inner_itm; + pgpa_itm_type outer_itm; + + if (entry->tag != PGPA_TAG_PARTITIONWISE) + continue; + + outer_itm = pgpa_identifiers_match_target(pjs->outer_count, + pjs->rids, entry->target); + if (outer_itm == PGPA_ITM_EQUAL || + outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + continue; + + inner_itm = pgpa_identifiers_match_target(pjs->inner_count, + pjs->rids + pjs->outer_count, + entry->target); + if (inner_itm == PGPA_ITM_EQUAL || + inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + continue; + + jo_deny_rel_indexes = bms_add_member(jo_deny_rel_indexes, i); + } + + /* Iterate over advice that pertains to the join order and method. */ + i = -1; + while ((i = bms_next_member(pjs->join_indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &pjs->join_entries[i]; + uint32 my_join_mask; + + /* Handle join order advice. */ + if (entry->tag == PGPA_TAG_JOIN_ORDER) + { + pgpa_jo_outcome jo_outcome; + + jo_outcome = pgpa_join_order_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry); + if (jo_outcome == PGPA_JO_PERMITTED) + jo_permit_indexes = bms_add_member(jo_permit_indexes, i); + else if (jo_outcome == PGPA_JO_DENIED) + jo_deny_indexes = bms_add_member(jo_deny_indexes, i); + continue; + } + + /* Handle join method advice. */ + my_join_mask = pgpa_join_strategy_mask_from_advice_tag(entry->tag); + if (my_join_mask != 0) + { + bool permit; + bool restrict_method; + + if (entry->tag == PGPA_TAG_FOREIGN_JOIN) + permit = pgpa_opaque_join_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry, + &restrict_method); + else + permit = pgpa_join_method_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry, + &restrict_method); + if (!permit) + jo_deny_indexes = bms_add_member(jo_deny_indexes, i); + else if (restrict_method) + { + jm_indexes = bms_add_member(jm_indexes, i); + if (join_mask != 0 && join_mask != my_join_mask) + jm_conflict = true; + join_mask = my_join_mask; + } + continue; + } + + /* Handle semijoin uniqueness advice. */ + if (entry->tag == PGPA_TAG_SEMIJOIN_UNIQUE || + entry->tag == PGPA_TAG_SEMIJOIN_NON_UNIQUE) + { + bool outer_side_nullable; + bool restrict_method; + + /* Planner has nullable side of the semijoin on the outer side? */ + outer_side_nullable = (jointype == JOIN_UNIQUE_OUTER || + jointype == JOIN_RIGHT_SEMI); + + if (!pgpa_semijoin_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry, + outer_side_nullable, + &restrict_method)) + jo_deny_indexes = bms_add_member(jo_deny_indexes, i); + else if (restrict_method) + { + bool advice_unique; + bool jt_unique; + bool jt_non_unique; + + /* Advice wants to unique-ify and use a regular join? */ + advice_unique = (entry->tag == PGPA_TAG_SEMIJOIN_UNIQUE); + + /* Planner is trying to unique-ify and use a regular join? */ + jt_unique = (jointype == JOIN_UNIQUE_INNER || + jointype == JOIN_UNIQUE_OUTER); + + /* Planner is trying a semi-join, without unique-ifying? 
*/ + jt_non_unique = (jointype == JOIN_SEMI || + jointype == JOIN_RIGHT_SEMI); + + if (!jt_unique && !jt_non_unique) + { + /* + * This doesn't seem to be a semijoin to which SJ_UNIQUE + * or SJ_NON_UNIQUE can be applied. + */ + entry->flags |= PGPA_TE_INAPPLICABLE; + } + else if (advice_unique != jt_unique) + sj_deny_indexes = bms_add_member(sj_deny_indexes, i); + else + sj_permit_indexes = bms_add_member(sj_permit_indexes, i); + } + continue; + } + } + + /* + * If the advice indicates both that this join order is permissible and + * also that it isn't, then mark advice related to the join order as + * conflicting. + */ + if (jo_permit_indexes != NULL && + (jo_deny_indexes != NULL || jo_deny_rel_indexes != NULL)) + { + pgpa_trove_set_flags(pjs->join_entries, jo_permit_indexes, + PGPA_TE_CONFLICTING); + pgpa_trove_set_flags(pjs->join_entries, jo_deny_indexes, + PGPA_TE_CONFLICTING); + pgpa_trove_set_flags(pjs->rel_entries, jo_deny_rel_indexes, + PGPA_TE_CONFLICTING); + } + + /* + * If more than one join method specification is relevant here and they + * differ, mark them all as conflicting. + */ + if (jm_conflict) + pgpa_trove_set_flags(pjs->join_entries, jm_indexes, + PGPA_TE_CONFLICTING); + + /* If semijoin advice says both yes and no, mark it all as conflicting. */ + if (sj_permit_indexes != NULL && sj_deny_indexes != NULL) + { + pgpa_trove_set_flags(pjs->join_entries, sj_permit_indexes, + PGPA_TE_CONFLICTING); + pgpa_trove_set_flags(pjs->join_entries, sj_deny_indexes, + PGPA_TE_CONFLICTING); + } + + /* + * Enforce restrictions on the join order and join method, and any + * semijoin-related restrictions. Only clear bits here, so that we still + * respect the enable_* GUCs. Do nothing in cases where the advice on a + * single topic conflicts. + */ + if ((jo_deny_indexes != NULL || jo_deny_rel_indexes != NULL) && + jo_permit_indexes == NULL) + *pgs_mask_p &= ~PGS_JOIN_ANY; + if (join_mask != 0 && !jm_conflict) + *pgs_mask_p &= ~(PGS_JOIN_ANY & ~join_mask); + if (sj_deny_indexes != NULL && sj_permit_indexes == NULL) + *pgs_mask_p &= ~PGS_JOIN_ANY; +} + +/* + * Translate an advice tag into a path generation strategy mask. + * + * This function can be called with tag types that don't represent join + * strategies. In such cases, we just return 0, which can't be confused with + * a valid mask. + */ +static uint64 +pgpa_join_strategy_mask_from_advice_tag(pgpa_advice_tag_type tag) +{ + switch (tag) + { + case PGPA_TAG_FOREIGN_JOIN: + return PGS_FOREIGNJOIN; + case PGPA_TAG_MERGE_JOIN_PLAIN: + return PGS_MERGEJOIN_PLAIN; + case PGPA_TAG_MERGE_JOIN_MATERIALIZE: + return PGS_MERGEJOIN_MATERIALIZE; + case PGPA_TAG_NESTED_LOOP_PLAIN: + return PGS_NESTLOOP_PLAIN; + case PGPA_TAG_NESTED_LOOP_MATERIALIZE: + return PGS_NESTLOOP_MATERIALIZE; + case PGPA_TAG_NESTED_LOOP_MEMOIZE: + return PGS_NESTLOOP_MEMOIZE; + case PGPA_TAG_HASH_JOIN: + return PGS_HASHJOIN; + default: + return 0; + } +} + +/* + * Does a certain item of join order advice permit a certain join? + * + * Returns PGPA_JO_DENIED if the advice is incompatible with the proposed + * join order. + * + * Returns PGPA_JO_PERMITTED if the advice specifies exactly the proposed + * join order. This implies that a partitionwise join should not be + * performed at this level; rather, one of the traditional join methods + * should be used. + * + * Returns PGPA_JO_INDIFFERENT if the advice does not care what happens. + * We use this for unordered JOIN_ORDER sublists, which are compatible with + * partitionwise join but do not mandate it. 
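+ *
+ * For example (illustrative), given JOIN_ORDER(t1 t2 (t3 t4)), joining
+ * {t1, t2} to {t3, t4} is PGPA_JO_PERMITTED, while joining {t1, t3} to
+ * {t2, t4} is PGPA_JO_DENIED; given JOIN_ORDER(t1 {t2 t3}), joining t2 to
+ * t3 in either order is PGPA_JO_INDIFFERENT.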
+ */
+static pgpa_jo_outcome
+pgpa_join_order_permits_join(int outer_count, int inner_count,
+							 pgpa_identifier *rids,
+							 pgpa_trove_entry *entry)
+{
+	bool		loop = true;
+	bool		sublist = false;
+	int			length;
+	int			outer_length;
+	pgpa_advice_target *target = entry->target;
+	pgpa_advice_target *prefix_target;
+
+	/* We definitely have at least a partial match for this trove entry. */
+	entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+	/*
+	 * Find the innermost sublist that contains all keys; if no sublist does,
+	 * then continue processing with the toplevel list.
+	 *
+	 * For example, if the advice says JOIN_ORDER(t1 t2 (t3 t4 t5)), then we
+	 * should evaluate joins that only involve t3, t4, and/or t5 against the
+	 * (t3 t4 t5) sublist, and others against the full list.
+	 *
+	 * Note that (1) the outermost list is always ordered and (2) whenever we
+	 * zoom into an unordered sublist, we instantly return
+	 * PGPA_JO_INDIFFERENT.
+	 */
+	while (loop)
+	{
+		Assert(target->ttype == PGPA_TARGET_ORDERED_LIST);
+
+		loop = false;
+		foreach_ptr(pgpa_advice_target, child_target, target->children)
+		{
+			pgpa_itm_type itm;
+
+			if (child_target->ttype == PGPA_TARGET_IDENTIFIER)
+				continue;
+
+			itm = pgpa_identifiers_match_target(outer_count + inner_count,
+												rids, child_target);
+			if (itm == PGPA_ITM_EQUAL || itm == PGPA_ITM_KEYS_ARE_SUBSET)
+			{
+				if (child_target->ttype == PGPA_TARGET_ORDERED_LIST)
+				{
+					target = child_target;
+					sublist = true;
+					loop = true;
+					break;
+				}
+				else
+				{
+					Assert(child_target->ttype == PGPA_TARGET_UNORDERED_LIST);
+					return PGPA_JO_INDIFFERENT;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Try to find a prefix of the selected join order list that is exactly
+	 * equal to the outer side of the proposed join.
+	 */
+	length = list_length(target->children);
+	prefix_target = palloc0_object(pgpa_advice_target);
+	prefix_target->ttype = PGPA_TARGET_ORDERED_LIST;
+	for (outer_length = 1; outer_length <= length; ++outer_length)
+	{
+		pgpa_itm_type itm;
+
+		/* Avoid leaking memory in every loop iteration. */
+		if (prefix_target->children != NULL)
+			list_free(prefix_target->children);
+		prefix_target->children = list_copy_head(target->children,
+												 outer_length);
+
+		/* Search, hoping to find an exact match. */
+		itm = pgpa_identifiers_match_target(outer_count, rids, prefix_target);
+		if (itm == PGPA_ITM_EQUAL)
+			break;
+
+		/*
+		 * If the prefix of the join order list that we're considering
+		 * includes some but not all of the outer rels, we can make the prefix
+		 * longer to find an exact match.  But if the advice mentions things
+		 * that are not part of our outer rel before it has mentioned
+		 * everything that is, then this join doesn't match the join order
+		 * list.
+		 */
+		if (itm != PGPA_ITM_TARGETS_ARE_SUBSET)
+			return PGPA_JO_DENIED;
+	}
+
+	/*
+	 * If the previous loop stopped before the prefix_target included the
+	 * entire join order list, then the next member of the join order list
+	 * must exactly match the inner side of the join.
+	 *
+	 * Example: Given JOIN_ORDER(t1 t2 (t3 t4 t5)), if the outer side of the
+	 * current join includes only t1, then the inner side must be exactly t2;
+	 * if the outer side includes both t1 and t2, then the inner side must
+	 * include exactly t3, t4, and t5.
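+	 *
+	 * (Continuing that example with an outer side of {t1, t2}: the loop
+	 * above exits with outer_length = 2, and list_nth() below then fetches
+	 * the (t3 t4 t5) sublist as inner_target.)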
+	 */
+	if (outer_length < length)
+	{
+		pgpa_advice_target *inner_target;
+		pgpa_itm_type itm;
+
+		inner_target = list_nth(target->children, outer_length);
+
+		itm = pgpa_identifiers_match_target(inner_count, rids + outer_count,
+											inner_target);
+
+		/*
+		 * Before returning, consider whether we need to mark this entry as
+		 * fully matched.  If we're considering the full list rather than a
+		 * sublist, and if we found every item but one on the outer side of
+		 * the join and the last item on the inner side of the join, then the
+		 * answer is yes.
+		 */
+		if (!sublist && outer_length + 1 == length && itm == PGPA_ITM_EQUAL)
+			entry->flags |= PGPA_TE_MATCH_FULL;
+
+		return (itm == PGPA_ITM_EQUAL) ? PGPA_JO_PERMITTED : PGPA_JO_DENIED;
+	}
+
+	/*
+	 * If we get here, then the outer side of the join includes the entirety
+	 * of the join order list.  In this case, we behave differently depending
+	 * on whether we're looking at the top-level join order list or a sublist.
+	 * At the top-level, we treat the specified list as mandating that the
+	 * actual join order has the given list as a prefix, but a sublist
+	 * requires an exact match.
+	 *
+	 * Example: Given JOIN_ORDER(t1 t2 (t3 t4 t5)), we must start by joining
+	 * all five of those relations in that sequence, but once that is done,
+	 * it's OK to join any other rels that are part of the join problem.
+	 * This allows a user to specify the driving table and perhaps the first
+	 * few things to which it should be joined while leaving the rest of the
+	 * join order up to the optimizer.  But it seems like it would be
+	 * surprising, given that specification, if the user could add t6 to the
+	 * (t3 t4 t5) sub-join, so we don't allow that.  If we did want to allow
+	 * it, the logic earlier in this function would require substantial
+	 * adjustment: we could allow the t3-t4-t5-t6 join to be built here, but
+	 * the next step of joining t1-t2 to the result would still be rejected.
+	 */
+	if (!sublist)
+		entry->flags |= PGPA_TE_MATCH_FULL;
+	return sublist ? PGPA_JO_DENIED : PGPA_JO_PERMITTED;
+}
+
+/*
+ * Does a certain item of join method advice permit a certain join?
+ *
+ * Advice such as HASH_JOIN((x y)) means that there should be a hash join with
+ * exactly x and y on the inner side.  Obviously, this means that if we are
+ * considering a join with exactly x and y on the inner side, we should enforce
+ * the use of a hash join.  However, it also means that we must reject some
+ * incompatible join orders entirely.  For example, a join with exactly x
+ * and y on the outer side shouldn't be allowed, because such paths might win
+ * over the advice-driven path on cost.
+ *
+ * To accommodate these requirements, this function returns true if the join
+ * should be allowed and false if it should not.  Furthermore, *restrict_method
+ * is set to true if the join method should be enforced and false if not.
+ */
+static bool
+pgpa_join_method_permits_join(int outer_count, int inner_count,
+							  pgpa_identifier *rids,
+							  pgpa_trove_entry *entry,
+							  bool *restrict_method)
+{
+	pgpa_advice_target *target = entry->target;
+	pgpa_itm_type inner_itm;
+	pgpa_itm_type outer_itm;
+	pgpa_itm_type join_itm;
+
+	/* We definitely have at least a partial match for this trove entry. */
+	entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+	*restrict_method = false;
+
+	/*
+	 * If our inner rel mentions exactly the same relations as the advice
+	 * target, allow the join and enforce the join method restriction.
+	 *
+	 * If our inner rel mentions a superset of the target relations, allow the
+	 * join.
The join we care about has already taken place, and this advice
+	 * imposes no further restrictions.
+	 */
+	inner_itm = pgpa_identifiers_match_target(inner_count,
+											  rids + outer_count,
+											  target);
+	if (inner_itm == PGPA_ITM_EQUAL)
+	{
+		entry->flags |= PGPA_TE_MATCH_FULL;
+		*restrict_method = true;
+		return true;
+	}
+	else if (inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+		return true;
+
+	/*
+	 * If our outer rel mentions a superset of the relations in the advice
+	 * target, no restrictions apply.  The join we care about has already
+	 * taken place, and this advice imposes no further restrictions.
+	 *
+	 * On the other hand, if our outer rel mentions exactly the relations
+	 * mentioned in the advice target, the planner is trying to reverse the
+	 * sides of the join as compared with our desired outcome.  Reject that.
+	 */
+	outer_itm = pgpa_identifiers_match_target(outer_count,
+											  rids, target);
+	if (outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+		return true;
+	else if (outer_itm == PGPA_ITM_EQUAL)
+		return false;
+
+	/*
+	 * If the advice target mentions only a single relation, the test below
+	 * cannot ever pass, so save some work by exiting now.
+	 */
+	if (target->ttype == PGPA_TARGET_IDENTIFIER)
+		return false;
+
+	/*
+	 * If everything in the joinrel appears in the advice target, we're below
+	 * the level of the join we want to control.
+	 *
+	 * For example, HASH_JOIN((x y)) doesn't restrict how x and y can be
+	 * joined.
+	 *
+	 * This lookup shouldn't return PGPA_ITM_DISJOINT, because any such advice
+	 * should not have been returned from the trove in the first place.
+	 */
+	join_itm = pgpa_identifiers_match_target(outer_count + inner_count,
+											 rids, target);
+	Assert(join_itm != PGPA_ITM_DISJOINT);
+	if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET ||
+		join_itm == PGPA_ITM_EQUAL)
+		return true;
+
+	/*
+	 * We've already permitted all allowable cases, so reject this.
+	 *
+	 * If we reach this point, then the advice overlaps with this join but
+	 * isn't entirely contained within either side, and there's also at least
+	 * one relation present in the join that isn't mentioned by the advice.
+	 *
+	 * For instance, in the HASH_JOIN((x y)) example, we would reach here if x
+	 * were on one side of the join, y on the other, and at least one of the
+	 * two sides also included some other relation, say t.  In that case,
+	 * accepting this join would allow the (x y t) joinrel to contain
+	 * non-disabled paths that do not put (x y) on the inner side of a hash
+	 * join; we could instead end up with something like (x JOIN t) JOIN y.
+	 */
+	return false;
+}
+
+/*
+ * Does advice concerning an opaque join permit a certain join?
+ *
+ * By an opaque join, we mean one where the exact mechanism by which the
+ * join is performed is not visible to PostgreSQL.  Currently this is the
+ * case only for foreign joins: FOREIGN_JOIN((x y z)) means that x, y, and
+ * z are joined on the remote side, but we know nothing about the join order
+ * or join methods used over there.
+ *
+ * The logic here needs to differ from pgpa_join_method_permits_join because,
+ * for other join types, the advice target is the set of inner rels; here, it
+ * includes both inner and outer rels.
+ */
+static bool
+pgpa_opaque_join_permits_join(int outer_count, int inner_count,
+							  pgpa_identifier *rids,
+							  pgpa_trove_entry *entry,
+							  bool *restrict_method)
+{
+	pgpa_advice_target *target = entry->target;
+	pgpa_itm_type join_itm;
+
+	/* We definitely have at least a partial match for this trove entry.
*/
+	entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+	*restrict_method = false;
+
+	join_itm = pgpa_identifiers_match_target(outer_count + inner_count,
+											 rids, target);
+	if (join_itm == PGPA_ITM_EQUAL)
+	{
+		/*
+		 * We have an exact match, and should therefore allow the join and
+		 * enforce the use of the relevant opaque join method.
+		 */
+		entry->flags |= PGPA_TE_MATCH_FULL;
+		*restrict_method = true;
+		return true;
+	}
+
+	if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET ||
+		join_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+	{
+		/*
+		 * If join_itm == PGPA_ITM_TARGETS_ARE_SUBSET, then the join we care
+		 * about has already taken place and no further restrictions apply.
+		 *
+		 * If join_itm == PGPA_ITM_KEYS_ARE_SUBSET, we're still building up to
+		 * the join we care about and have not introduced any extraneous
+		 * relations not named in the advice.  Note that ForeignScan paths for
+		 * joins are built up from ForeignScan paths from underlying joins and
+		 * scans, so we must not disable this join when considering a subset
+		 * of the relations we ultimately want.
+		 */
+		return true;
+	}
+
+	/*
+	 * The advice overlaps the join, but at least one relation is present in
+	 * the join that isn't mentioned by the advice.  We want to disable such
+	 * paths so that we actually push down the join as intended.
+	 */
+	return false;
+}
+
+/*
+ * Does advice concerning a semijoin permit a certain join?
+ *
+ * Unlike join method advice, which lists the rels on the inner side of the
+ * join, semijoin uniqueness advice lists the rels on the nullable side of the
+ * join.  Those can be the same, if the join type is JOIN_UNIQUE_INNER or
+ * JOIN_SEMI, or they can be different, in case of JOIN_UNIQUE_OUTER or
+ * JOIN_RIGHT_SEMI.
+ *
+ * We don't know here whether the caller specified SEMIJOIN_UNIQUE or
+ * SEMIJOIN_NON_UNIQUE.  The caller should check the join type against the
+ * advice type if and only if we set *restrict_method to true.
+ */
+static bool
+pgpa_semijoin_permits_join(int outer_count, int inner_count,
+						   pgpa_identifier *rids,
+						   pgpa_trove_entry *entry,
+						   bool outer_is_nullable,
+						   bool *restrict_method)
+{
+	pgpa_advice_target *target = entry->target;
+	pgpa_itm_type join_itm;
+	pgpa_itm_type inner_itm;
+	pgpa_itm_type outer_itm;
+
+	*restrict_method = false;
+
+	/* We definitely have at least a partial match for this trove entry. */
+	entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+	/*
+	 * If the outer rel is the nullable side and contains exactly the same
+	 * relations as the advice target, then the join order is allowable, but
+	 * the caller must check whether the advice tag (either SEMIJOIN_UNIQUE or
+	 * SEMIJOIN_NON_UNIQUE) matches the join type.
+	 *
+	 * If the outer rel is a superset of the target relations, the join we
+	 * care about has already taken place, so we should impose no further
+	 * restrictions.
+	 */
+	outer_itm = pgpa_identifiers_match_target(outer_count,
+											  rids, target);
+	if (outer_itm == PGPA_ITM_EQUAL)
+	{
+		entry->flags |= PGPA_TE_MATCH_FULL;
+		if (outer_is_nullable)
+		{
+			*restrict_method = true;
+			return true;
+		}
+	}
+	else if (outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+		return true;
+
+	/* As above, but for the inner rel.
*/
+	inner_itm = pgpa_identifiers_match_target(inner_count,
+											  rids + outer_count,
+											  target);
+	if (inner_itm == PGPA_ITM_EQUAL)
+	{
+		entry->flags |= PGPA_TE_MATCH_FULL;
+		if (!outer_is_nullable)
+		{
+			*restrict_method = true;
+			return true;
+		}
+	}
+	else if (inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+		return true;
+
+	/*
+	 * If everything in the joinrel appears in the advice target, we're below
+	 * the level of the join we want to control.
+	 */
+	join_itm = pgpa_identifiers_match_target(outer_count + inner_count,
+											 rids, target);
+	Assert(join_itm != PGPA_ITM_DISJOINT);
+	if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET ||
+		join_itm == PGPA_ITM_EQUAL)
+		return true;
+
+	/*
+	 * We've tested for all allowable possibilities, and so must reject this
+	 * join order.  This can happen in two ways.
+	 *
+	 * First, we might be considering a semijoin that overlaps incompletely
+	 * with one or both sides of the join.  For example, if the user has
+	 * specified SEMIJOIN_UNIQUE((t1 t2)) or SEMIJOIN_NON_UNIQUE((t1 t2)), we
+	 * should reject a proposed t2-t3 join, since that could not result in a
+	 * final plan compatible with the advice.
+	 *
+	 * Second, we might be considering a semijoin where the advice target
+	 * perfectly matches one side of the join, but it's the wrong one.  For
+	 * example, in the example above, we might see a 3-way join between t1,
+	 * t2, and t3, with (t1 t2) on the non-nullable side.  That, too, would be
+	 * incompatible with the advice.
+	 */
+	return false;
+}
+
+/*
+ * Apply scan advice to a RelOptInfo.
+ */
+static void
+pgpa_planner_apply_scan_advice(RelOptInfo *rel,
+							   pgpa_trove_entry *scan_entries,
+							   Bitmapset *scan_indexes,
+							   pgpa_trove_entry *rel_entries,
+							   Bitmapset *rel_indexes)
+{
+	bool		gather_conflict = false;
+	Bitmapset  *gather_partial_match = NULL;
+	Bitmapset  *gather_full_match = NULL;
+	int			i = -1;
+	pgpa_trove_entry *scan_entry = NULL;
+	int			flags;
+	bool		scan_type_conflict = false;
+	Bitmapset  *scan_type_indexes = NULL;
+	Bitmapset  *scan_type_rel_indexes = NULL;
+	uint64		gather_mask = 0;
+	uint64		scan_type = 0;
+
+	/* Scrutinize available scan advice. */
+	while ((i = bms_next_member(scan_indexes, i)) >= 0)
+	{
+		pgpa_trove_entry *my_entry = &scan_entries[i];
+		uint64		my_scan_type = 0;
+
+		/* Translate our advice tags to a scan strategy value. */
+		if (my_entry->tag == PGPA_TAG_BITMAP_HEAP_SCAN)
+		{
+			/*
+			 * Currently, PGS_CONSIDER_INDEXONLY can suppress Bitmap Heap
+			 * Scans, so don't clear it when such a scan is requested.  This
+			 * happens because build_index_scan() thinks that the possibility
+			 * of an index-only scan is a sufficient reason to consider using
+			 * an otherwise-useless index, and get_index_paths() thinks that
+			 * the same paths that are useful for index or index-only scans
+			 * should also be considered for bitmap scans.  Perhaps that logic
+			 * should be tightened up, but until then we need to include
+			 * PGS_CONSIDER_INDEXONLY in my_scan_type here.
+			 */
+			my_scan_type = PGS_BITMAPSCAN | PGS_CONSIDER_INDEXONLY;
+		}
+		else if (my_entry->tag == PGPA_TAG_INDEX_ONLY_SCAN)
+			my_scan_type = PGS_INDEXONLYSCAN | PGS_CONSIDER_INDEXONLY;
+		else if (my_entry->tag == PGPA_TAG_INDEX_SCAN)
+			my_scan_type = PGS_INDEXSCAN;
+		else if (my_entry->tag == PGPA_TAG_SEQ_SCAN)
+			my_scan_type = PGS_SEQSCAN;
+		else if (my_entry->tag == PGPA_TAG_TID_SCAN)
+			my_scan_type = PGS_TIDSCAN;
+
+		/*
+		 * If this is understandable scan advice, hang on to the entry, the
+		 * inferred scan type, and the index at which we found it.
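+		 *
+		 * (If my_scan_type is still 0 at this point, the tag did not name a
+		 * scan type, and this loop ignores it.)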
+		 *
+		 * Also make a note if we see conflicting scan type advice.  Note that
+		 * we regard two index specifications as conflicting unless they match
+		 * exactly.  In theory, perhaps we could regard INDEX_SCAN(a c) and
+		 * INDEX_SCAN(a b.c) as non-conflicting if it happens that the only
+		 * index named c is in schema b, but it doesn't seem worth the code.
+		 */
+		if (my_scan_type != 0)
+		{
+			if (scan_type != 0 && scan_type != my_scan_type)
+				scan_type_conflict = true;
+			if (!scan_type_conflict && scan_entry != NULL &&
+				my_entry->target->itarget != NULL &&
+				scan_entry->target->itarget != NULL &&
+				!pgpa_index_targets_equal(scan_entry->target->itarget,
+										  my_entry->target->itarget))
+				scan_type_conflict = true;
+			scan_entry = my_entry;
+			scan_type = my_scan_type;
+			scan_type_indexes = bms_add_member(scan_type_indexes, i);
+		}
+	}
+
+	/* Scrutinize available gather-related and partitionwise advice. */
+	i = -1;
+	while ((i = bms_next_member(rel_indexes, i)) >= 0)
+	{
+		pgpa_trove_entry *my_entry = &rel_entries[i];
+		uint64		my_gather_mask = 0;
+		bool		just_one_rel;
+
+		just_one_rel = my_entry->target->ttype == PGPA_TARGET_IDENTIFIER
+			|| list_length(my_entry->target->children) == 1;
+
+		/*
+		 * PARTITIONWISE behaves like a scan type, except that if there's more
+		 * than one relation targeted, it has no effect at this level.
+		 */
+		if (my_entry->tag == PGPA_TAG_PARTITIONWISE)
+		{
+			if (just_one_rel)
+			{
+				const uint64 my_scan_type = PGS_APPEND | PGS_MERGE_APPEND;
+
+				if (scan_type != 0 && scan_type != my_scan_type)
+					scan_type_conflict = true;
+				scan_entry = my_entry;
+				scan_type = my_scan_type;
+				scan_type_rel_indexes =
+					bms_add_member(scan_type_rel_indexes, i);
+			}
+			continue;
+		}
+
+		/*
+		 * GATHER and GATHER_MERGE applied to a single rel mean that we should
+		 * use the corresponding strategy here, while applying either to more
+		 * than one rel means we should not use those strategies here, but
+		 * rather at the level of the joinrel that corresponds to what was
+		 * specified.  NO_GATHER can only be applied to single rels.
+		 *
+		 * Note that setting PGS_CONSIDER_NONPARTIAL in my_gather_mask is
+		 * equivalent to allowing the non-use of either form of Gather here.
+		 */
+		if (my_entry->tag == PGPA_TAG_GATHER ||
+			my_entry->tag == PGPA_TAG_GATHER_MERGE)
+		{
+			if (!just_one_rel)
+				my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+			else if (my_entry->tag == PGPA_TAG_GATHER)
+				my_gather_mask = PGS_GATHER;
+			else
+				my_gather_mask = PGS_GATHER_MERGE;
+		}
+		else if (my_entry->tag == PGPA_TAG_NO_GATHER)
+		{
+			Assert(just_one_rel);
+			my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+		}
+
+		/*
+		 * If we set my_gather_mask up above, then we (1) make a note if the
+		 * advice conflicted, (2) remember the mask value, and (3) remember
+		 * whether this was a full or partial match.
+		 */
+		if (my_gather_mask != 0)
+		{
+			if (gather_mask != 0 && gather_mask != my_gather_mask)
+				gather_conflict = true;
+			gather_mask = my_gather_mask;
+			if (just_one_rel)
+				gather_full_match = bms_add_member(gather_full_match, i);
+			else
+				gather_partial_match = bms_add_member(gather_partial_match, i);
+		}
+	}
+
+	/* Enforce choice of index.
+	 */
+	if (scan_entry != NULL && !scan_type_conflict &&
+		(scan_entry->tag == PGPA_TAG_INDEX_SCAN ||
+		 scan_entry->tag == PGPA_TAG_INDEX_ONLY_SCAN))
+	{
+		pgpa_index_target *itarget = scan_entry->target->itarget;
+		IndexOptInfo *matched_index = NULL;
+
+		foreach_node(IndexOptInfo, index, rel->indexlist)
+		{
+			char	   *relname = get_rel_name(index->indexoid);
+			Oid			nspoid = get_rel_namespace(index->indexoid);
+			char	   *relnamespace = get_namespace_name_or_temp(nspoid);
+
+			if (strcmp(itarget->indname, relname) == 0 &&
+				(itarget->indnamespace == NULL ||
+				 strcmp(itarget->indnamespace, relnamespace) == 0))
+			{
+				matched_index = index;
+				break;
+			}
+		}
+
+		if (matched_index == NULL)
+		{
+			/* Don't force the scan type if the index doesn't exist. */
+			scan_type = 0;
+
+			/* Mark advice as inapplicable. */
+			pgpa_trove_set_flags(scan_entries, scan_type_indexes,
+								 PGPA_TE_INAPPLICABLE);
+		}
+		else
+		{
+			/* Disable every other index. */
+			foreach_node(IndexOptInfo, index, rel->indexlist)
+			{
+				if (index != matched_index)
+					index->disabled = true;
+			}
+		}
+	}
+
+	/*
+	 * Mark all the scan method entries as fully matched; and if they specify
+	 * different things, mark them all as conflicting.
+	 */
+	flags = PGPA_TE_MATCH_PARTIAL | PGPA_TE_MATCH_FULL;
+	if (scan_type_conflict)
+		flags |= PGPA_TE_CONFLICTING;
+	pgpa_trove_set_flags(scan_entries, scan_type_indexes, flags);
+	pgpa_trove_set_flags(rel_entries, scan_type_rel_indexes, flags);
+
+	/*
+	 * Mark every Gather-related piece of advice as partially matched. Mark
+	 * the ones that included this relation as a target by itself as fully
+	 * matched. If there was a conflict, mark them all as conflicting.
+	 */
+	flags = PGPA_TE_MATCH_PARTIAL;
+	if (gather_conflict)
+		flags |= PGPA_TE_CONFLICTING;
+	pgpa_trove_set_flags(rel_entries, gather_partial_match, flags);
+	flags |= PGPA_TE_MATCH_FULL;
+	pgpa_trove_set_flags(rel_entries, gather_full_match, flags);
+
+	/*
+	 * Enforce restrictions on the scan type and use of Gather/Gather Merge.
+	 * Only clear bits here, so that we still respect the enable_* GUCs. Do
+	 * nothing in cases where the advice on a single topic conflicts.
+	 */
+	if (scan_type != 0 && !scan_type_conflict)
+	{
+		uint64		all_scan_mask;
+
+		all_scan_mask = PGS_SCAN_ANY | PGS_APPEND | PGS_MERGE_APPEND |
+			PGS_CONSIDER_INDEXONLY;
+		rel->pgs_mask &= ~(all_scan_mask & ~scan_type);
+	}
+	if (gather_mask != 0 && !gather_conflict)
+	{
+		uint64		all_gather_mask;
+
+		all_gather_mask =
+			PGS_GATHER | PGS_GATHER_MERGE | PGS_CONSIDER_NONPARTIAL;
+		rel->pgs_mask &= ~(all_gather_mask & ~gather_mask);
+	}
+}
+
+/*
+ * Add feedback entries for one trove slice to the provided list and return
+ * the resulting list.
+ *
+ * Feedback entries are generated from the trove entry's flags. It's assumed
+ * that the caller has already set all relevant flags with the exception of
+ * PGPA_TE_FAILED. We set that flag here if appropriate.
+ */
+static List *
+pgpa_planner_append_feedback(List *list, pgpa_trove *trove,
+							 pgpa_trove_lookup_type type,
+							 pgpa_identifier *rt_identifiers,
+							 pgpa_plan_walker_context *walker)
+{
+	pgpa_trove_entry *entries;
+	int			nentries;
+
+	pgpa_trove_lookup_all(trove, type, &entries, &nentries);
+	for (int i = 0; i < nentries; ++i)
+	{
+		pgpa_trove_entry *entry = &entries[i];
+		DefElem    *item;
+
+		/*
+		 * If this entry was fully matched, check whether generating advice
+		 * from this plan would produce such an entry. If not, label the
+		 * entry as failed.
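+		 *
+		 * For example (hypothetical advice), if HASH_JOIN(t1) was fully
+		 * matched while planning but regenerating advice from the final
+		 * plan would not produce HASH_JOIN(t1) -- say, because a nested
+		 * loop was chosen anyway -- the entry is marked PGPA_TE_FAILED.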
+		 */
+		if ((entry->flags & PGPA_TE_MATCH_FULL) != 0 &&
+			!pgpa_walker_would_advise(walker, rt_identifiers,
+									  entry->tag, entry->target))
+			entry->flags |= PGPA_TE_FAILED;
+
+		item = makeDefElem(pgpa_cstring_trove_entry(entry),
+						   (Node *) makeInteger(entry->flags), -1);
+		list = lappend(list, item);
+	}
+
+	return list;
+}
+
+/*
+ * Emit a WARNING to tell the user about a problem with the supplied plan
+ * advice.
+ */
+static void
+pgpa_planner_feedback_warning(List *feedback)
+{
+	StringInfoData detailbuf;
+	StringInfoData flagbuf;
+
+	/* Quick exit if there's no feedback. */
+	if (feedback == NIL)
+		return;
+
+	/* Initialize buffers. */
+	initStringInfo(&detailbuf);
+	initStringInfo(&flagbuf);
+
+	/* Main loop. */
+	foreach_node(DefElem, item, feedback)
+	{
+		int			flags = defGetInt32(item);
+
+		/*
+		 * Don't emit anything if it was fully matched with no problems
+		 * found.
+		 *
+		 * NB: Feedback should never be marked fully matched without also
+		 * being marked partially matched.
+		 */
+		if (flags == (PGPA_TE_MATCH_PARTIAL | PGPA_TE_MATCH_FULL))
+			continue;
+
+		/*
+		 * Terminate each detail line except the last with a newline. This is
+		 * also a convenient place to reset flagbuf.
+		 */
+		if (detailbuf.len > 0)
+		{
+			appendStringInfoChar(&detailbuf, '\n');
+			resetStringInfo(&flagbuf);
+		}
+
+		/* Generate output. */
+		pgpa_trove_append_flags(&flagbuf, flags);
+		appendStringInfo(&detailbuf, _("advice %s feedback is \"%s\""),
+						 item->defname, flagbuf.data);
+	}
+
+	/* Emit the warning, if any problems were found. */
+	if (detailbuf.len > 0)
+		ereport(WARNING,
+				errmsg("supplied plan advice was not enforced"),
+				errdetail("%s", detailbuf.data));
+}
+
+#ifdef USE_ASSERT_CHECKING
+
+/*
+ * Fast hash function for a key consisting of an RTI and plan name.
+ */
+static uint32
+pgpa_ri_checker_hash_key(pgpa_ri_checker_key key)
+{
+	fasthash_state hs;
+	int			sp_len;
+
+	fasthash_init(&hs, 0);
+
+	hs.accum = key.rti;
+	fasthash_combine(&hs);
+
+	/* plan_name can be NULL */
+	if (key.plan_name == NULL)
+		sp_len = 0;
+	else
+		sp_len = fasthash_accum_cstring(&hs, key.plan_name);
+
+	/* hashfn_unstable.h recommends using string length as tweak */
+	return fasthash_final32(&hs, sp_len);
+}
+
+#endif
+
+/*
+ * Save the range table identifier for one relation for future
+ * cross-checking.
+ */
+static void
+pgpa_ri_checker_save(pgpa_planner_state *pps, PlannerInfo *root,
+					 RelOptInfo *rel)
+{
+#ifdef USE_ASSERT_CHECKING
+	pgpa_ri_checker_key key;
+	pgpa_ri_checker *check;
+	pgpa_identifier rid;
+	const char *rid_string;
+	bool		found;
+
+	key.rti = bms_singleton_member(rel->relids);
+	key.plan_name = root->plan_name;
+	pgpa_compute_identifier_by_rti(root, key.rti, &rid);
+	rid_string = pgpa_identifier_string(&rid);
+	check = pgpa_ri_check_insert(pps->ri_check_hash, key, &found);
+	Assert(!found || strcmp(check->rid_string, rid_string) == 0);
+	check->rid_string = rid_string;
+#endif
+}
+
+/*
+ * Validate that the range table identifiers we were able to generate during
+ * planning match the ones we generated from the final plan.
+ */
+static void
+pgpa_ri_checker_validate(pgpa_planner_state *pps, PlannedStmt *pstmt)
+{
+#ifdef USE_ASSERT_CHECKING
+	pgpa_identifier *rt_identifiers;
+	pgpa_ri_check_iterator it;
+	pgpa_ri_checker *check;
+
+	/* Create identifiers from the planned statement. */
+	rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt);
+
+	/* Iterate over identifiers created during planning, so we can compare.
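+	 * Each saved key records the RTI a relation had within its own
+	 * subquery, so before comparing against rt_identifiers, which is
+	 * indexed by the flattened range table, we must first add the
+	 * subquery's rtoffset; see below.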
*/ + pgpa_ri_check_start_iterate(pps->ri_check_hash, &it); + while ((check = pgpa_ri_check_iterate(pps->ri_check_hash, &it)) != NULL) + { + int rtoffset = 0; + const char *rid_string; + Index flat_rti; + + /* + * If there's no plan name associated with this entry, then the + * rtoffset is 0. Otherwise, we can search the SubPlanRTInfo list to + * find the rtoffset. + */ + if (check->key.plan_name != NULL) + { + foreach_node(SubPlanRTInfo, rtinfo, pstmt->subrtinfos) + { + /* + * If rtinfo->dummy is set, then the subquery's range table + * will only have been partially copied to the final range + * table. Specifically, only RTE_RELATION entries and + * RTE_SUBQUERY entries that were once RTE_RELATION entries + * will be copied, as per add_rtes_to_flat_rtable. Therefore, + * there's no fixed rtoffset that we can apply to the RTIs + * used during planning to locate the corresponding relations + * in the final rtable. + * + * With more complex logic, we could work around that problem + * by remembering the whole contents of the subquery's rtable + * during planning, determining which of those would have been + * copied to the final rtable, and matching them up. But it + * doesn't seem like a worthwhile endeavor for right now, + * because RTIs from such subqueries won't appear in the plan + * tree itself, just in the range table. Hence, we can neither + * generate nor accept advice for them. + */ + if (strcmp(check->key.plan_name, rtinfo->plan_name) == 0 + && !rtinfo->dummy) + { + rtoffset = rtinfo->rtoffset; + Assert(rtoffset > 0); + break; + } + } + + /* + * It's not an error if we don't find the plan name: that just + * means that we planned a subplan by this name but it ended up + * being a dummy subplan and so wasn't included in the final plan + * tree. + */ + if (rtoffset == 0) + continue; + } + + /* + * check->key.rti is the RTI that we saw prior to range-table + * flattening, so we must add the appropriate RT offset to get the + * final RTI. + */ + flat_rti = check->key.rti + rtoffset; + Assert(flat_rti <= list_length(pstmt->rtable)); + + /* Assert that the string we compute now matches the previous one. */ + rid_string = pgpa_identifier_string(&rt_identifiers[flat_rti - 1]); + Assert(strcmp(rid_string, check->rid_string) == 0); + } +#endif +} + +/* + * Convert a bitmapset to a C string of comma-separated integers. + */ +static char * +pgpa_bms_to_cstring(Bitmapset *bms) +{ + StringInfoData buf; + int x = -1; + + if (bms_is_empty(bms)) + return "none"; + + initStringInfo(&buf); + while ((x = bms_next_member(bms, x)) >= 0) + { + if (buf.len > 0) + appendStringInfo(&buf, ", %d", x); + else + appendStringInfo(&buf, "%d", x); + } + + return buf.data; +} + +/* + * Convert a JoinType to a C string. 
+ */
+static const char *
+pgpa_jointype_to_cstring(JoinType jointype)
+{
+	switch (jointype)
+	{
+		case JOIN_INNER:
+			return "inner";
+		case JOIN_LEFT:
+			return "left";
+		case JOIN_FULL:
+			return "full";
+		case JOIN_RIGHT:
+			return "right";
+		case JOIN_SEMI:
+			return "semi";
+		case JOIN_ANTI:
+			return "anti";
+		case JOIN_RIGHT_SEMI:
+			return "right semi";
+		case JOIN_RIGHT_ANTI:
+			return "right anti";
+		case JOIN_UNIQUE_OUTER:
+			return "unique outer";
+		case JOIN_UNIQUE_INNER:
+			return "unique inner";
+	}
+	return "???";
+}
diff --git a/contrib/pg_plan_advice/pgpa_planner.h b/contrib/pg_plan_advice/pgpa_planner.h
new file mode 100644
index 0000000000..7d40b910b0
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_planner.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_planner.h
+ *	  planner hooks
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_planner.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_PLANNER_H
+#define PGPA_PLANNER_H
+
+extern void pgpa_planner_install_hooks(void);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_scan.c b/contrib/pg_plan_advice/pgpa_scan.c
new file mode 100644
index 0000000000..a04f9eca8e
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_scan.c
@@ -0,0 +1,269 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_scan.c
+ *	  analysis of scans in Plan trees
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_scan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pgpa_scan.h"
+#include "pgpa_walker.h"
+
+#include "nodes/parsenodes.h"
+#include "parser/parsetree.h"
+
+static pgpa_scan *pgpa_make_scan(pgpa_plan_walker_context *walker, Plan *plan,
+								 pgpa_scan_strategy strategy,
+								 Bitmapset *relids);
+
+
+static RTEKind unique_nonjoin_rtekind(Bitmapset *relids, List *rtable);
+
+/*
+ * Build a pgpa_scan object for a Plan node and update the plan walker
+ * context as appropriate. If this is an Append or MergeAppend scan, also
+ * build a pgpa_scan for any scans that were consolidated into this one by
+ * Append/MergeAppend pull-up.
+ *
+ * If there is at least one ElidedNode for this plan node, pass the uppermost
+ * one as elided_node, else pass NULL.
+ *
+ * Set the 'beneath_any_gather' flag if we are underneath a Gather or
+ * Gather Merge node (except for a single-copy Gather node, for which
+ * GATHER or GATHER_MERGE advice should not be emitted).
+ *
+ * Set the 'within_join_problem' flag if we're inside of a join problem and
+ * not otherwise.
+ */
+pgpa_scan *
+pgpa_build_scan(pgpa_plan_walker_context *walker, Plan *plan,
+				ElidedNode *elided_node,
+				bool beneath_any_gather, bool within_join_problem)
+{
+	pgpa_scan_strategy strategy = PGPA_SCAN_ORDINARY;
+	Bitmapset  *relids = NULL;
+	int			rti = -1;
+	List	   *child_append_relid_sets = NIL;
+	NodeTag		nodetype = nodeTag(plan);
+
+	if (elided_node != NULL)
+	{
+		nodetype = elided_node->elided_type;
+		relids = elided_node->relids;
+
+		/*
+		 * If setrefs processing elided an Append or MergeAppend node that
+		 * had only one surviving child, then this is either a setop over
+		 * subqueries, or a partitionwise operation (which might be a scan or
+		 * a join in reality, but here we don't care about the distinction
+		 * and consider it simply a scan).
+		 *
+		 * A setop over subqueries, or a trivial SubqueryScan that was
+		 * elided, is an "ordinary" scan, i.e. one for which we need not
+		 * generate advice because the planner has not made any meaningful
+		 * choice.
+		 */
+		if ((nodetype == T_Append || nodetype == T_MergeAppend) &&
+			unique_nonjoin_rtekind(relids,
+								   walker->pstmt->rtable) == RTE_RELATION)
+			strategy = PGPA_SCAN_PARTITIONWISE;
+		else
+			strategy = PGPA_SCAN_ORDINARY;
+
+		/* Join RTIs can be present, but advice never refers to them. */
+		relids = pgpa_filter_out_join_relids(relids, walker->pstmt->rtable);
+	}
+	else if ((rti = pgpa_scanrelid(plan)) != 0)
+	{
+		relids = bms_make_singleton(rti);
+
+		switch (nodeTag(plan))
+		{
+			case T_SeqScan:
+				strategy = PGPA_SCAN_SEQ;
+				break;
+			case T_BitmapHeapScan:
+				strategy = PGPA_SCAN_BITMAP_HEAP;
+				break;
+			case T_IndexScan:
+				strategy = PGPA_SCAN_INDEX;
+				break;
+			case T_IndexOnlyScan:
+				strategy = PGPA_SCAN_INDEX_ONLY;
+				break;
+			case T_TidScan:
+			case T_TidRangeScan:
+				strategy = PGPA_SCAN_TID;
+				break;
+			default:
+
+				/*
+				 * This case includes a ForeignScan targeting a single
+				 * relation; no other strategy is possible in that case, but
+				 * see below, where things are different in multi-relation
+				 * cases.
+				 */
+				strategy = PGPA_SCAN_ORDINARY;
+				break;
+		}
+	}
+	else if ((relids = pgpa_relids(plan)) != NULL)
+	{
+		switch (nodeTag(plan))
+		{
+			case T_ForeignScan:
+
+				/*
+				 * If multiple relations are being targeted by a single
+				 * foreign scan, then the foreign join has been pushed to the
+				 * remote side, and we want that to be reflected in the
+				 * generated advice.
+				 */
+				strategy = PGPA_SCAN_FOREIGN;
+				break;
+			case T_Append:
+
+				/*
+				 * Append nodes can represent partitionwise scans of a
+				 * relation, but when they implement a set operation, they
+				 * are just ordinary scans.
+				 */
+				if (unique_nonjoin_rtekind(relids, walker->pstmt->rtable)
+					== RTE_RELATION)
+					strategy = PGPA_SCAN_PARTITIONWISE;
+				else
+					strategy = PGPA_SCAN_ORDINARY;
+
+				/* Be sure to account for pulled-up scans. */
+				child_append_relid_sets =
+					((Append *) plan)->child_append_relid_sets;
+				break;
+			case T_MergeAppend:
+				/* Same logic here as for Append, above. */
+				if (unique_nonjoin_rtekind(relids, walker->pstmt->rtable)
+					== RTE_RELATION)
+					strategy = PGPA_SCAN_PARTITIONWISE;
+				else
+					strategy = PGPA_SCAN_ORDINARY;
+
+				/* Be sure to account for pulled-up scans. */
+				child_append_relid_sets =
+					((MergeAppend *) plan)->child_append_relid_sets;
+				break;
+			default:
+				strategy = PGPA_SCAN_ORDINARY;
+				break;
+		}
+
+		/* Join RTIs can be present, but advice never refers to them. */
+		relids = pgpa_filter_out_join_relids(relids, walker->pstmt->rtable);
+	}
+
+	/*
+	 * If this is an Append or MergeAppend node into which subordinate Append
+	 * or MergeAppend paths were merged, each of those merged paths is
+	 * effectively another scan for which we need to account.
+	 */
+	foreach_node(Bitmapset, child_relids, child_append_relid_sets)
+	{
+		Bitmapset  *child_nonjoin_relids;
+
+		child_nonjoin_relids =
+			pgpa_filter_out_join_relids(child_relids,
+										walker->pstmt->rtable);
+		(void) pgpa_make_scan(walker, plan, strategy,
+							  child_nonjoin_relids);
+	}
+
+	/*
+	 * If this plan node has no associated RTIs, it's not a scan. When the
+	 * 'within_join_problem' flag is set, that's unexpected, so throw an
+	 * error, else return quietly.
+	 */
+	if (relids == NULL)
+	{
+		if (within_join_problem)
+			elog(ERROR, "plan node has no RTIs: %d", (int) nodeTag(plan));
+		return NULL;
+	}
+
+	/*
+	 * Add the appropriate set of RTIs to walker->no_gather_scans.
+ * + * Add nothing if we're beneath a Gather or Gather Merge node, since + * NO_GATHER advice is clearly inappropriate in that situation. + * + * Add nothing if this is an Append or MergeAppend node, whether or not + * elided. We'll emit NO_GATHER() for the underlying scan, which is good + * enough. + */ + if (!beneath_any_gather && nodetype != T_Append && + nodetype != T_MergeAppend) + walker->no_gather_scans = + bms_add_members(walker->no_gather_scans, relids); + + /* Caller tells us whether NO_GATHER() advice for this scan is needed. */ + return pgpa_make_scan(walker, plan, strategy, relids); +} + +/* + * Create a single pgpa_scan object and update the pgpa_plan_walker_context. + */ +static pgpa_scan * +pgpa_make_scan(pgpa_plan_walker_context *walker, Plan *plan, + pgpa_scan_strategy strategy, Bitmapset *relids) +{ + pgpa_scan *scan; + + /* Create the scan object. */ + scan = palloc(sizeof(pgpa_scan)); + scan->plan = plan; + scan->strategy = strategy; + scan->relids = relids; + + /* Add it to the appropriate list. */ + walker->scans[scan->strategy] = lappend(walker->scans[scan->strategy], + scan); + + return scan; +} + +/* + * Determine the unique rtekind of a set of relids. + */ +static RTEKind +unique_nonjoin_rtekind(Bitmapset *relids, List *rtable) +{ + int rti = -1; + bool first = true; + RTEKind rtekind; + + Assert(relids != NULL); + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + RangeTblEntry *rte = rt_fetch(rti, rtable); + + if (rte->rtekind == RTE_JOIN) + continue; + + if (first) + { + rtekind = rte->rtekind; + first = false; + } + else if (rtekind != rte->rtekind) + elog(ERROR, "rtekind mismatch: %d vs. %d", + rtekind, rte->rtekind); + } + + if (first) + elog(ERROR, "no non-RTE_JOIN RTEs found"); + + return rtekind; +} diff --git a/contrib/pg_plan_advice/pgpa_scan.h b/contrib/pg_plan_advice/pgpa_scan.h new file mode 100644 index 0000000000..3bb8726ff1 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.h @@ -0,0 +1,85 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.h + * analysis of scans in Plan trees + * + * For purposes of this module, a "scan" includes (1) single plan nodes that + * scan multiple RTIs, such as a degenerate Result node that replaces what + * would otherwise have been a join, and (2) Append and MergeAppend nodes + * implementing a partitionwise scan or a partitionwise join. Said + * differently, scans are the leaves of the join tree for a single join + * problem. + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_SCAN_H +#define PGPA_SCAN_H + +#include "nodes/plannodes.h" + +typedef struct pgpa_plan_walker_context pgpa_plan_walker_context; + +/* + * Scan strategies. + * + * PGPA_SCAN_ORDINARY is any scan strategy that isn't interesting to us + * because there is no meaningful planner decision involved. For example, + * the only way to scan a subquery is a SubqueryScan, and the only way to + * scan a VALUES construct is a ValuesScan. We need not care exactly which + * type of planner node was used in such cases, because the same thing will + * happen when replanning. + * + * PGPA_SCAN_ORDINARY also includes Result nodes that correspond to scans + * or even joins that are proved empty. 
We don't know whether or not the scan
+ * or join will still be provably empty at replanning time, but if it is,
+ * then no scan-type advice is needed, and if it's not, we can't recommend
+ * a scan type based on the current plan.
+ *
+ * PGPA_SCAN_PARTITIONWISE also lumps together scans and joins: this can
+ * be either a partitionwise scan of a partitioned table or a partitionwise
+ * join between several partitioned tables. Note that all decisions about
+ * whether or not to use partitionwise join are meaningful: no matter what
+ * we decided this time, we could do more or fewer things partitionwise the
+ * next time.
+ *
+ * PGPA_SCAN_FOREIGN is only used when there's more than one relation
+ * involved; a single-table foreign scan is classified as ordinary, since
+ * there is no decision to make in that case.
+ *
+ * Other scan strategies map one-to-one to plan nodes.
+ */
+typedef enum
+{
+	PGPA_SCAN_ORDINARY = 0,
+	PGPA_SCAN_SEQ,
+	PGPA_SCAN_BITMAP_HEAP,
+	PGPA_SCAN_FOREIGN,
+	PGPA_SCAN_INDEX,
+	PGPA_SCAN_INDEX_ONLY,
+	PGPA_SCAN_PARTITIONWISE,
+	PGPA_SCAN_TID
+	/* update NUM_PGPA_SCAN_STRATEGY if you add anything here */
} pgpa_scan_strategy;
+
+#define NUM_PGPA_SCAN_STRATEGY ((int) PGPA_SCAN_TID + 1)
+
+/*
+ * All of the details we need regarding a scan.
+ */
+typedef struct pgpa_scan
+{
+	Plan	   *plan;
+	pgpa_scan_strategy strategy;
+	Bitmapset  *relids;
+} pgpa_scan;
+
+extern pgpa_scan *pgpa_build_scan(pgpa_plan_walker_context *walker, Plan *plan,
+								  ElidedNode *elided_node,
+								  bool beneath_any_gather,
+								  bool within_join_problem);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_scanner.l b/contrib/pg_plan_advice/pgpa_scanner.l
new file mode 100644
index 0000000000..a887735f31
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_scanner.l
@@ -0,0 +1,297 @@
+%top{
+/*
+ * Scanner for plan advice
+ *
+ * Copyright (c) 2000-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_scanner.l
+ */
+#include "postgres.h"
+
+#include "common/string.h"
+#include "nodes/miscnodes.h"
+#include "parser/scansup.h"
+
+#include "pgpa_ast.h"
+#include "pgpa_parser.h"
+
+/*
+ * Extra data that we pass around during scanning.
+ *
+ * 'litbuf' is used to implement the <xd> exclusive state, which handles
+ * double-quoted identifiers.
+ */
+typedef struct pgpa_yy_extra_type
+{
+	StringInfoData litbuf;
+} pgpa_yy_extra_type;
+
+}
+
+%{
+/* LCOV_EXCL_START */
+
+#define YY_DECL \
+	extern int pgpa_yylex(union YYSTYPE *yylval_param, List **result, \
+						  char **parse_error_msg_p, yyscan_t yyscanner)
+
+/* No reason to constrain amount of data slurped */
+#define YY_READ_BUF_SIZE 16777216
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
+
+static void
+fprintf_to_ereport(const char *fmt, const char *msg)
+{
+	ereport(ERROR, (errmsg_internal("%s", msg)));
+}
+%}
+
+%option reentrant
+%option bison-bridge
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyyalloc
+%option noyyrealloc
+%option noyyfree
+%option warn
+%option prefix="pgpa_yy"
+%option extra-type="pgpa_yy_extra_type *"
+
+/*
+ * What follows is a severely stripped-down version of the core scanner. We
+ * only care about recognizing identifiers with or without identifier quoting
+ * (i.e. double-quoting), decimal integers, and a small handful of other
+ * things. Keep these rules in sync with src/backend/parser/scan.l.
+ * As in that file, we use an exclusive state called 'xc' for C-style
+ * comments, and an exclusive state called 'xd' for double-quoted
+ * identifiers.
+ */
+%x xc
+%x xd
+
+ident_start		[A-Za-z\200-\377_]
+ident_cont		[A-Za-z\200-\377_0-9\$]
+
+identifier		{ident_start}{ident_cont}*
+
+decdigit		[0-9]
+decinteger		{decdigit}(_?{decdigit})*
+
+space			[ \t\n\r\f\v]
+whitespace		{space}+
+
+dquote			\"
+xdstart			{dquote}
+xdstop			{dquote}
+xddouble		{dquote}{dquote}
+xdinside		[^"]+
+
+xcstart			\/\*
+xcstop			\*+\/
+xcinside		[^*/]+
+
+%%
+
+{whitespace}	{ /* ignore */ }
+
+{identifier}	{
+					char	   *str;
+					bool		fail;
+					pgpa_advice_tag_type tag;
+
+					/*
+					 * Unlike the core scanner, we don't truncate identifiers
+					 * here. There is no obvious reason to do so.
+					 */
+					str = downcase_identifier(yytext, yyleng, false, false);
+					yylval->str = str;
+
+					/*
+					 * If it's not a tag, just return TOK_IDENT; else, return
+					 * a token type based on how further parsing should
+					 * proceed.
+					 */
+					tag = pgpa_parse_advice_tag(str, &fail);
+					if (fail)
+						return TOK_IDENT;
+					else if (tag == PGPA_TAG_JOIN_ORDER)
+						return TOK_TAG_JOIN_ORDER;
+					else if (tag == PGPA_TAG_INDEX_SCAN ||
+							 tag == PGPA_TAG_INDEX_ONLY_SCAN)
+						return TOK_TAG_INDEX;
+					else if (tag == PGPA_TAG_SEQ_SCAN ||
+							 tag == PGPA_TAG_TID_SCAN ||
+							 tag == PGPA_TAG_BITMAP_HEAP_SCAN ||
+							 tag == PGPA_TAG_NO_GATHER)
+						return TOK_TAG_SIMPLE;
+					else
+						return TOK_TAG_GENERIC;
+				}
+
+{decinteger}	{
+					char	   *endptr;
+
+					errno = 0;
+					yylval->integer = strtoint(yytext, &endptr, 10);
+					if (*endptr != '\0' || errno == ERANGE)
+						pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+									 "integer out of range");
+					return TOK_INTEGER;
+				}
+
+{xcstart}		{
+					BEGIN(xc);
+				}
+
+{xdstart}		{
+					BEGIN(xd);
+					resetStringInfo(&yyextra->litbuf);
+				}
+
+.				{ return yytext[0]; }
+
+<xc>{xcstop}	{
+					BEGIN(INITIAL);
+				}
+
+<xc>{xcinside}	{
+					/* discard multiple characters without slash or asterisk */
+				}
+
+<xc>.			{
+					/*
+					 * Discard any single character. flex prefers longer
+					 * matches, so this rule will never be picked when we
+					 * could have matched xcstop.
+					 *
+					 * NB: At present, we don't bother to support nested
+					 * C-style comments here, but this logic could be
+					 * extended if that restriction poses a problem.
+					 */
+				}
+
+<xc><<EOF>>		{
+					BEGIN(INITIAL);
+					pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+								 "unterminated comment");
+				}
+
+<xd>{xdstop}	{
+					BEGIN(INITIAL);
+					if (yyextra->litbuf.len == 0)
+						pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+									 "zero-length delimited identifier");
+					yylval->str = pstrdup(yyextra->litbuf.data);
+					return TOK_IDENT;
+				}
+
+<xd>{xddouble}	{
+					appendStringInfoChar(&yyextra->litbuf, '"');
+				}
+
+<xd>{xdinside}	{
+					appendBinaryStringInfo(&yyextra->litbuf, yytext, yyleng);
+				}
+
+<xd><<EOF>>		{
+					BEGIN(INITIAL);
+					pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+								 "unterminated quoted identifier");
+				}
+
+%%
+
+/* LCOV_EXCL_STOP */
+
+/*
+ * Handler for errors while scanning or parsing advice.
+ *
+ * bison passes the error message to us via 'message', and the context is
+ * available via the 'yytext' macro. We assemble those values into a final
+ * error text and then arrange to pass it back to the caller of
+ * pgpa_yyparse() by storing it into *parse_error_msg_p.
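+ *
+ * For instance (illustrative), an unterminated quoted identifier at the
+ * end of the advice string lands here with yytext empty, so the caller
+ * sees a message like "unterminated quoted identifier at end of input".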
+ */
+void
+pgpa_yyerror(List **result, char **parse_error_msg_p, yyscan_t yyscanner,
+			 const char *message)
+{
+	struct yyguts_t *yyg = (struct yyguts_t *) yyscanner;	/* needed for yytext
+															 * macro */
+
+
+	/* report only the first error in a parse operation */
+	if (*parse_error_msg_p)
+		return;
+
+	if (yytext[0])
+		*parse_error_msg_p = psprintf("%s at or near \"%s\"", message, yytext);
+	else
+		*parse_error_msg_p = psprintf("%s at end of input", message);
+}
+
+/*
+ * Initialize the advice scanner.
+ *
+ * This should be called before parsing begins.
+ */
+void
+pgpa_scanner_init(const char *str, yyscan_t *yyscannerp)
+{
+	yyscan_t	yyscanner;
+	pgpa_yy_extra_type *yyext = palloc0_object(pgpa_yy_extra_type);
+
+	if (yylex_init(yyscannerp) != 0)
+		elog(ERROR, "yylex_init() failed: %m");
+
+	yyscanner = *yyscannerp;
+
+	initStringInfo(&yyext->litbuf);
+	pgpa_yyset_extra(yyext, yyscanner);
+
+	yy_scan_string(str, yyscanner);
+}
+
+
+/*
+ * Shut down the advice scanner.
+ *
+ * This should be called after parsing is complete.
+ */
+void
+pgpa_scanner_finish(yyscan_t yyscanner)
+{
+	yylex_destroy(yyscanner);
+}
+
+/*
+ * Interface functions to make flex use palloc() instead of malloc().
+ * It'd be better to make these static, but flex insists otherwise.
+ */
+
+void *
+yyalloc(yy_size_t size, yyscan_t yyscanner)
+{
+	return palloc(size);
+}
+
+void *
+yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner)
+{
+	if (ptr)
+		return repalloc(ptr, size);
+	else
+		return palloc(size);
+}
+
+void
+yyfree(void *ptr, yyscan_t yyscanner)
+{
+	if (ptr)
+		pfree(ptr);
+}
diff --git a/contrib/pg_plan_advice/pgpa_trove.c b/contrib/pg_plan_advice/pgpa_trove.c
new file mode 100644
index 0000000000..e924959c01
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_trove.c
@@ -0,0 +1,516 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_trove.c
+ *	  All of the advice given for a particular query, appropriately
+ *	  organized for convenient access.
+ *
+ * This name comes from the English expression "trove of advice", which
+ * means a collection of wisdom. This slightly unusual term is chosen to
+ * avoid naming confusion; for example, "collection of advice" would
+ * invite confusion with pgpa_collector.c. Note that, while we don't know
+ * whether the provided advice is actually wise, it's not our job to
+ * question the user's choices.
+ *
+ * The goal of this module is to make it easy to locate the specific
+ * bits of advice that pertain to any given part of a query, or to
+ * determine that there are none.
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_trove.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pgpa_trove.h"
+
+#include "common/hashfn_unstable.h"
+
+/*
+ * An advice trove is organized into a series of "slices", each of which
+ * contains information about one topic e.g. scan methods. Each slice
+ * consists of an array of trove entries plus a hash table that we can use
+ * to determine which ones are relevant to a particular part of the query.
+ */
+typedef struct pgpa_trove_slice
+{
+	unsigned	nallocated;
+	unsigned	nused;
+	pgpa_trove_entry *entries;
+	struct pgpa_trove_entry_hash *hash;
+} pgpa_trove_slice;
+
+/*
+ * Scan advice is stored into 'scan'; join advice is stored into 'join'; and
+ * advice that can apply to both cases is stored into 'rel'. This lets
+ * callers ask just for what's relevant.
These slices correspond to the possible values + * of pgpa_trove_lookup_type. + */ +struct pgpa_trove +{ + pgpa_trove_slice join; + pgpa_trove_slice rel; + pgpa_trove_slice scan; +}; + +/* + * We're going to build a hash table to allow clients of this module to find + * relevant advice for a given part of the query quickly. However, we're going + * to use only three of the five key fields as hash keys. There are two reasons + * for this. + * + * First, it's allowable to set partition_schema to NULL to match a partition + * with the correct name in any schema. + * + * Second, we expect the "occurrence" and "partition_schema" portions of the + * relation identifiers to be mostly uninteresting. Most of the time, the + * occurrence field will be 1 and the partition_schema values will all be the + * same. Even when there is some variation, the absolute number of entries + * that have the same values for all three of these key fields should be + * quite small. + */ +typedef struct +{ + const char *alias_name; + const char *partition_name; + const char *plan_name; +} pgpa_trove_entry_key; + +typedef struct +{ + pgpa_trove_entry_key key; + int status; + Bitmapset *indexes; +} pgpa_trove_entry_element; + +static uint32 pgpa_trove_entry_hash_key(pgpa_trove_entry_key key); + +static inline bool +pgpa_trove_entry_compare_key(pgpa_trove_entry_key a, pgpa_trove_entry_key b) +{ + if (strcmp(a.alias_name, b.alias_name) != 0) + return false; + + if (!strings_equal_or_both_null(a.partition_name, b.partition_name)) + return false; + + if (!strings_equal_or_both_null(a.plan_name, b.plan_name)) + return false; + + return true; +} + +#define SH_PREFIX pgpa_trove_entry +#define SH_ELEMENT_TYPE pgpa_trove_entry_element +#define SH_KEY_TYPE pgpa_trove_entry_key +#define SH_KEY key +#define SH_HASH_KEY(tb, key) pgpa_trove_entry_hash_key(key) +#define SH_EQUAL(tb, a, b) pgpa_trove_entry_compare_key(a, b) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +static void pgpa_init_trove_slice(pgpa_trove_slice *tslice); +static void pgpa_trove_add_to_slice(pgpa_trove_slice *tslice, + pgpa_advice_tag_type tag, + pgpa_advice_target *target); +static void pgpa_trove_add_to_hash(pgpa_trove_entry_hash *hash, + pgpa_advice_target *target, + int index); +static Bitmapset *pgpa_trove_slice_lookup(pgpa_trove_slice *tslice, + pgpa_identifier *rid); + +/* + * Build a trove of advice from a list of advice items. + * + * Caller can obtain a list of advice items to pass to this function by + * calling pgpa_parse(). + */ +pgpa_trove * +pgpa_build_trove(List *advice_items) +{ + pgpa_trove *trove = palloc_object(pgpa_trove); + + pgpa_init_trove_slice(&trove->join); + pgpa_init_trove_slice(&trove->rel); + pgpa_init_trove_slice(&trove->scan); + + foreach_ptr(pgpa_advice_item, item, advice_items) + { + switch (item->tag) + { + case PGPA_TAG_JOIN_ORDER: + { + pgpa_advice_target *target; + + /* + * For most advice types, each element in the top-level + * list is a separate target, but it's most convenient to + * regard the entirety of a JOIN_ORDER specification as a + * single target. Since it wasn't represented that way + * during parsing, build a surrogate object now. 
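+					 *
+					 * For example (hypothetical), JOIN_ORDER(a (b c)) parses
+					 * as two top-level targets, "a" and "(b c)"; wrapping
+					 * them in a single ORDERED_LIST target lets the whole
+					 * specification be matched as one unit.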
+ */ + target = palloc0_object(pgpa_advice_target); + target->ttype = PGPA_TARGET_ORDERED_LIST; + target->children = item->targets; + + pgpa_trove_add_to_slice(&trove->join, + item->tag, target); + } + break; + + case PGPA_TAG_BITMAP_HEAP_SCAN: + case PGPA_TAG_INDEX_ONLY_SCAN: + case PGPA_TAG_INDEX_SCAN: + case PGPA_TAG_SEQ_SCAN: + case PGPA_TAG_TID_SCAN: + + /* + * Scan advice. + */ + foreach_ptr(pgpa_advice_target, target, item->targets) + { + /* + * For now, all of our scan types target single relations, + * but in the future this might not be true, e.g. a custom + * scan could replace a join. + */ + Assert(target->ttype == PGPA_TARGET_IDENTIFIER); + pgpa_trove_add_to_slice(&trove->scan, + item->tag, target); + } + break; + + case PGPA_TAG_FOREIGN_JOIN: + case PGPA_TAG_HASH_JOIN: + case PGPA_TAG_MERGE_JOIN_MATERIALIZE: + case PGPA_TAG_MERGE_JOIN_PLAIN: + case PGPA_TAG_NESTED_LOOP_MATERIALIZE: + case PGPA_TAG_NESTED_LOOP_MEMOIZE: + case PGPA_TAG_NESTED_LOOP_PLAIN: + case PGPA_TAG_SEMIJOIN_NON_UNIQUE: + case PGPA_TAG_SEMIJOIN_UNIQUE: + + /* + * Join strategy advice. + */ + foreach_ptr(pgpa_advice_target, target, item->targets) + { + pgpa_trove_add_to_slice(&trove->join, + item->tag, target); + } + break; + + case PGPA_TAG_PARTITIONWISE: + case PGPA_TAG_GATHER: + case PGPA_TAG_GATHER_MERGE: + case PGPA_TAG_NO_GATHER: + + /* + * Advice about a RelOptInfo relevant to both scans and joins. + */ + foreach_ptr(pgpa_advice_target, target, item->targets) + { + pgpa_trove_add_to_slice(&trove->rel, + item->tag, target); + } + break; + } + } + + return trove; +} + +/* + * Search a trove of advice for relevant entries. + * + * All parameters are input parameters except for *result, which is an output + * parameter used to return results to the caller. + */ +void +pgpa_trove_lookup(pgpa_trove *trove, pgpa_trove_lookup_type type, + int nrids, pgpa_identifier *rids, pgpa_trove_result *result) +{ + pgpa_trove_slice *tslice; + Bitmapset *indexes; + + Assert(nrids > 0); + + if (type == PGPA_TROVE_LOOKUP_SCAN) + tslice = &trove->scan; + else if (type == PGPA_TROVE_LOOKUP_JOIN) + tslice = &trove->join; + else + tslice = &trove->rel; + + indexes = pgpa_trove_slice_lookup(tslice, &rids[0]); + for (int i = 1; i < nrids; ++i) + { + Bitmapset *other_indexes; + + /* + * If the caller is asking about two relations that aren't part of the + * same subquery, they've messed up. + */ + Assert(strings_equal_or_both_null(rids[0].plan_name, + rids[i].plan_name)); + + other_indexes = pgpa_trove_slice_lookup(tslice, &rids[i]); + indexes = bms_union(indexes, other_indexes); + } + + result->entries = tslice->entries; + result->indexes = indexes; +} + +/* + * Return all entries in a trove slice to the caller. + * + * The first two arguments are input arguments, and the remainder are output + * arguments. + */ +void +pgpa_trove_lookup_all(pgpa_trove *trove, pgpa_trove_lookup_type type, + pgpa_trove_entry **entries, int *nentries) +{ + pgpa_trove_slice *tslice; + + if (type == PGPA_TROVE_LOOKUP_SCAN) + tslice = &trove->scan; + else if (type == PGPA_TROVE_LOOKUP_JOIN) + tslice = &trove->join; + else + tslice = &trove->rel; + + *entries = tslice->entries; + *nentries = tslice->nused; +} + +/* + * Convert a trove entry to an item of plan advice that would produce it. 
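+ *
+ * For example (hypothetical), an entry with tag PGPA_TAG_SEQ_SCAN whose
+ * target names t1 comes out as SEQ_SCAN(t1). JOIN_ORDER entries are
+ * emitted without an extra set of outer parentheses, undoing the
+ * surrogate-target transformation performed by pgpa_build_trove.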
+ */
+char *
+pgpa_cstring_trove_entry(pgpa_trove_entry *entry)
+{
+	StringInfoData buf;
+
+	initStringInfo(&buf);
+	appendStringInfoString(&buf, pgpa_cstring_advice_tag(entry->tag));
+
+	/* JOIN_ORDER tags are transformed by pgpa_build_trove; undo that here */
+	if (entry->tag != PGPA_TAG_JOIN_ORDER)
+		appendStringInfoChar(&buf, '(');
+	else
+		Assert(entry->target->ttype == PGPA_TARGET_ORDERED_LIST);
+
+	pgpa_format_advice_target(&buf, entry->target);
+
+	if (entry->target->itarget != NULL)
+	{
+		appendStringInfoChar(&buf, ' ');
+		pgpa_format_index_target(&buf, entry->target->itarget);
+	}
+
+	if (entry->tag != PGPA_TAG_JOIN_ORDER)
+		appendStringInfoChar(&buf, ')');
+
+	return buf.data;
+}
+
+/*
+ * Set PGPA_TE_* flags on a set of trove entries.
+ */
+void
+pgpa_trove_set_flags(pgpa_trove_entry *entries, Bitmapset *indexes, int flags)
+{
+	int			i = -1;
+
+	while ((i = bms_next_member(indexes, i)) >= 0)
+	{
+		pgpa_trove_entry *entry = &entries[i];
+
+		entry->flags |= flags;
+	}
+}
+
+/*
+ * Append a string representation of the specified PGPA_TE_* flags to the
+ * given StringInfo.
+ */
+void
+pgpa_trove_append_flags(StringInfo buf, int flags)
+{
+	if ((flags & PGPA_TE_MATCH_FULL) != 0)
+	{
+		Assert((flags & PGPA_TE_MATCH_PARTIAL) != 0);
+		appendStringInfoString(buf, "matched");
+	}
+	else if ((flags & PGPA_TE_MATCH_PARTIAL) != 0)
+		appendStringInfoString(buf, "partially matched");
+	else
+		appendStringInfoString(buf, "not matched");
+	if ((flags & PGPA_TE_INAPPLICABLE) != 0)
+		appendStringInfoString(buf, ", inapplicable");
+	if ((flags & PGPA_TE_CONFLICTING) != 0)
+		appendStringInfoString(buf, ", conflicting");
+	if ((flags & PGPA_TE_FAILED) != 0)
+		appendStringInfoString(buf, ", failed");
+}
+
+/*
+ * Add a new advice target to an existing pgpa_trove_slice object.
+ */
+static void
+pgpa_trove_add_to_slice(pgpa_trove_slice *tslice,
+						pgpa_advice_tag_type tag,
+						pgpa_advice_target *target)
+{
+	pgpa_trove_entry *entry;
+
+	if (tslice->nused >= tslice->nallocated)
+	{
+		int			new_allocated;
+
+		new_allocated = tslice->nallocated * 2;
+		tslice->entries = repalloc_array(tslice->entries, pgpa_trove_entry,
+										 new_allocated);
+		tslice->nallocated = new_allocated;
+	}
+
+	entry = &tslice->entries[tslice->nused];
+	entry->tag = tag;
+	entry->target = target;
+	entry->flags = 0;
+
+	pgpa_trove_add_to_hash(tslice->hash, target, tslice->nused);
+
+	tslice->nused++;
+}
+
+/*
+ * Update the hash table for a newly-added advice target.
+ */
+static void
+pgpa_trove_add_to_hash(pgpa_trove_entry_hash *hash, pgpa_advice_target *target,
+					   int index)
+{
+	pgpa_trove_entry_key key;
+	pgpa_trove_entry_element *element;
+	bool		found;
+
+	/* For non-identifiers, add entries for all descendants. */
+	if (target->ttype != PGPA_TARGET_IDENTIFIER)
+	{
+		foreach_ptr(pgpa_advice_target, child_target, target->children)
+		{
+			pgpa_trove_add_to_hash(hash, child_target, index);
+		}
+		return;
+	}
+
+	/* Sanity checks. */
+	Assert(target->rid.occurrence > 0);
+	Assert(target->rid.alias_name != NULL);
+
+	/* Add an entry for this relation identifier. */
+	key.alias_name = target->rid.alias_name;
+	key.partition_name = target->rid.partrel;
+	key.plan_name = target->rid.plan_name;
+	element = pgpa_trove_entry_insert(hash, key, &found);
+	if (!found)
+		element->indexes = NULL;
+	element->indexes = bms_add_member(element->indexes, index);
+}
+
+/*
+ * Create and initialize a new pgpa_trove_slice object.
+ */
+static void
+pgpa_init_trove_slice(pgpa_trove_slice *tslice)
+{
+	/*
+	 * In an ideal world, we'd make tslice->nallocated large enough that the
+	 * array and hash table never need to grow, but a generous default value
+	 * is not good for performance, because pgpa_init_trove_slice() has to
+	 * zero an amount of memory proportional to tslice->nallocated. Hence, we
+	 * keep the starting value quite small, on the theory that advice strings
+	 * will often be relatively short.
+	 */
+	tslice->nallocated = 16;
+	tslice->nused = 0;
+	tslice->entries = palloc_array(pgpa_trove_entry, tslice->nallocated);
+	tslice->hash = pgpa_trove_entry_create(CurrentMemoryContext,
+										   tslice->nallocated, NULL);
+}
+
+/*
+ * Fast hash function for a key consisting of alias_name, partition_name,
+ * and plan_name.
+ */
+static uint32
+pgpa_trove_entry_hash_key(pgpa_trove_entry_key key)
+{
+	fasthash_state hs;
+	int			sp_len;
+
+	fasthash_init(&hs, 0);
+
+	/* alias_name may not be NULL */
+	sp_len = fasthash_accum_cstring(&hs, key.alias_name);
+
+	/* partition_name and plan_name, however, can be NULL */
+	if (key.partition_name != NULL)
+		sp_len += fasthash_accum_cstring(&hs, key.partition_name);
+	if (key.plan_name != NULL)
+		sp_len += fasthash_accum_cstring(&hs, key.plan_name);
+
+	/*
+	 * hashfn_unstable.h recommends using string length as tweak. It's not
+	 * clear to me what to do if there are multiple strings, so for now I'm
+	 * just using the total of all of the lengths.
+	 */
+	return fasthash_final32(&hs, sp_len);
+}
+
+/*
+ * Look for matching entries.
+ */
+static Bitmapset *
+pgpa_trove_slice_lookup(pgpa_trove_slice *tslice, pgpa_identifier *rid)
+{
+	pgpa_trove_entry_key key;
+	pgpa_trove_entry_element *element;
+	Bitmapset  *result = NULL;
+
+	Assert(rid->occurrence >= 1);
+
+	key.alias_name = rid->alias_name;
+	key.partition_name = rid->partrel;
+	key.plan_name = rid->plan_name;
+
+	element = pgpa_trove_entry_lookup(tslice->hash, key);
+
+	if (element != NULL)
+	{
+		int			i = -1;
+
+		while ((i = bms_next_member(element->indexes, i)) >= 0)
+		{
+			pgpa_trove_entry *entry = &tslice->entries[i];
+
+			/*
+			 * We know that this target or one of its descendants matches the
+			 * identifier on the three key fields above, but we don't know
+			 * which descendant or whether the occurrence and schema also
+			 * match.
+			 */
+			if (pgpa_identifier_matches_target(rid, entry->target))
+				result = bms_add_member(result, i);
+		}
+	}
+
+	return result;
+}
diff --git a/contrib/pg_plan_advice/pgpa_trove.h b/contrib/pg_plan_advice/pgpa_trove.h
new file mode 100644
index 0000000000..a1b75af724
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_trove.h
@@ -0,0 +1,114 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_trove.h
+ *	  All of the advice given for a particular query, appropriately
+ *	  organized for convenient access.
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_trove.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_TROVE_H
+#define PGPA_TROVE_H
+
+#include "pgpa_ast.h"
+
+#include "nodes/bitmapset.h"
+
+typedef struct pgpa_trove pgpa_trove;
+
+/*
+ * Flags that can be set on a pgpa_trove_entry to indicate what happened when
+ * trying to plan using advice.
+ *
+ * PGPA_TE_MATCH_PARTIAL means that we found some part of the query that at
+ * least partially matched the target; e.g. given JOIN_ORDER(a b), this would
+ * be set if we ever saw any joinrel including either "a" or "b".
+ *
+ * PGPA_TE_MATCH_FULL means that we found an exact match for the target;
+ * e.g. given JOIN_ORDER(a b), this would be set if we saw a joinrel
+ * containing exactly "a" and "b" and nothing else.
+ *
+ * PGPA_TE_INAPPLICABLE means that the advice doesn't properly apply to the
+ * target; e.g. INDEX_SCAN(foo bar_idx) would be so marked if bar_idx does
+ * not exist on foo. The fact that this bit has been set does not mean that
+ * the advice had no effect.
+ *
+ * PGPA_TE_CONFLICTING means that a conflict was detected between what this
+ * advice wants and what some other plan advice wants; e.g. JOIN_ORDER(a b)
+ * would conflict with HASH_JOIN(a), because the former requires "a" to be
+ * the outer table while the latter requires it to be the inner table.
+ *
+ * PGPA_TE_FAILED means that the resulting plan did not conform to the
+ * advice.
+ */
+#define PGPA_TE_MATCH_PARTIAL		0x0001
+#define PGPA_TE_MATCH_FULL			0x0002
+#define PGPA_TE_INAPPLICABLE		0x0004
+#define PGPA_TE_CONFLICTING			0x0008
+#define PGPA_TE_FAILED				0x0010
+
+/*
+ * Each entry in a trove of advice represents the application of a tag to
+ * a single target.
+ */
+typedef struct pgpa_trove_entry
+{
+	pgpa_advice_tag_type tag;
+	pgpa_advice_target *target;
+	int			flags;
+} pgpa_trove_entry;
+
+/*
+ * What kind of information does the caller want to find in a trove?
+ *
+ * PGPA_TROVE_LOOKUP_SCAN means we're looking for scan advice.
+ *
+ * PGPA_TROVE_LOOKUP_JOIN means we're looking for join-related advice. This
+ * includes join order advice, join method advice, and semijoin-uniqueness
+ * advice.
+ *
+ * PGPA_TROVE_LOOKUP_REL means we're looking for general advice about a
+ * RelOptInfo that may correspond to either a scan or a join. This includes
+ * gather-related advice and partitionwise advice. Note that partitionwise
+ * advice might seem like join advice, but that's not a helpful way of
+ * viewing the matter because (1) partitionwise advice is also relevant at
+ * the scan level and (2) other types of join advice affect only what we do
+ * in join_path_setup_hook, but partitionwise advice affects what we do in
+ * joinrel_setup_hook.
+ */
+typedef enum pgpa_trove_lookup_type
+{
+	PGPA_TROVE_LOOKUP_JOIN,
+	PGPA_TROVE_LOOKUP_REL,
+	PGPA_TROVE_LOOKUP_SCAN
+} pgpa_trove_lookup_type;
+
+/*
+ * This struct is used to store the result of a trove lookup. For each member
+ * of "indexes", the entry at the corresponding offset within "entries" is
+ * one of the results.
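+ *
+ * A caller might consume a lookup result like this (sketch only;
+ * do_something_with is a placeholder):
+ *
+ *		pgpa_trove_result result;
+ *		int			i = -1;
+ *
+ *		pgpa_trove_lookup(trove, PGPA_TROVE_LOOKUP_SCAN, nrids, rids,
+ *						  &result);
+ *		while ((i = bms_next_member(result.indexes, i)) >= 0)
+ *			do_something_with(&result.entries[i]);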
+ */ +typedef struct pgpa_trove_result +{ + pgpa_trove_entry *entries; + Bitmapset *indexes; +} pgpa_trove_result; + +extern pgpa_trove *pgpa_build_trove(List *advice_items); +extern void pgpa_trove_lookup(pgpa_trove *trove, + pgpa_trove_lookup_type type, + int nrids, + pgpa_identifier *rids, + pgpa_trove_result *result); +extern void pgpa_trove_lookup_all(pgpa_trove *trove, + pgpa_trove_lookup_type type, + pgpa_trove_entry **entries, + int *nentries); +extern char *pgpa_cstring_trove_entry(pgpa_trove_entry *entry); +extern void pgpa_trove_set_flags(pgpa_trove_entry *entries, + Bitmapset *indexes, int flags); +extern void pgpa_trove_append_flags(StringInfo buf, int flags); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_walker.c b/contrib/pg_plan_advice/pgpa_walker.c new file mode 100644 index 0000000000..1e4d9c1cf9 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.c @@ -0,0 +1,1029 @@ +/*------------------------------------------------------------------------- + * + * pgpa_walker.c + * Main entrypoints for analyzing a plan to generate an advice string + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_walker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/plannodes.h" +#include "parser/parsetree.h" +#include "utils/lsyscache.h" + +static void pgpa_walk_recursively(pgpa_plan_walker_context *walker, Plan *plan, + bool within_join_problem, + pgpa_join_unroller *join_unroller, + List *active_query_features, + bool beneath_any_gather); +static Bitmapset *pgpa_process_unrolled_join(pgpa_plan_walker_context *walker, + pgpa_unrolled_join *ujoin); + +static pgpa_query_feature *pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +static void pgpa_qf_add_rti(List *active_query_features, Index rti); +static void pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids); +static void pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan, + List *rtable); + +static bool pgpa_walker_join_order_matches(pgpa_unrolled_join *ujoin, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target, + bool toplevel); +static bool pgpa_walker_join_order_matches_member(pgpa_join_member *member, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target); +static pgpa_scan *pgpa_walker_find_scan(pgpa_plan_walker_context *walker, + pgpa_scan_strategy strategy, + Bitmapset *relids); +static bool pgpa_walker_index_target_matches_plan(pgpa_index_target *itarget, + Plan *plan); +static bool pgpa_walker_contains_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Bitmapset *relids); +static bool pgpa_walker_contains_join(pgpa_plan_walker_context *walker, + pgpa_join_strategy strategy, + Bitmapset *relids); +static bool pgpa_walker_contains_no_gather(pgpa_plan_walker_context *walker, + Bitmapset *relids); + +/* + * Top-level entrypoint for the plan tree walk. + * + * Populates walker based on a traversal of the Plan trees in pstmt. + * + * sj_unique_rels is a list of pgpa_sj_unique_rel objects, one for each + * relation we considered making unique as part of semijoin planning. + */ +void +pgpa_plan_walker(pgpa_plan_walker_context *walker, PlannedStmt *pstmt, + List *sj_unique_rels) +{ + ListCell *lc; + List *sj_unique_rtis = NULL; + List *sj_nonunique_qfs = NULL; + + /* Initialization. 
+	memset(walker, 0, sizeof(pgpa_plan_walker_context));
+	walker->pstmt = pstmt;
+
+	/* Walk the main plan tree. */
+	pgpa_walk_recursively(walker, pstmt->planTree, false, NULL, NIL, false);
+
+	/* Main plan tree walk won't reach subplans, so walk those. */
+	foreach(lc, pstmt->subplans)
+	{
+		Plan	   *plan = lfirst(lc);
+
+		if (plan != NULL)
+			pgpa_walk_recursively(walker, plan, false, NULL, NIL, false);
+	}
+
+	/* Adjust RTIs from sj_unique_rels for the flattened range table. */
+	foreach_ptr(pgpa_sj_unique_rel, ur, sj_unique_rels)
+	{
+		int			rtindex = -1;
+		int			rtoffset = 0;
+		bool		dummy = false;
+		Bitmapset  *relids = NULL;
+
+		/* If this is a subplan, find the range table offset. */
+		if (ur->plan_name != NULL)
+		{
+			foreach_node(SubPlanRTInfo, rtinfo, pstmt->subrtinfos)
+			{
+				if (strcmp(ur->plan_name, rtinfo->plan_name) == 0)
+				{
+					rtoffset = rtinfo->rtoffset;
+					dummy = rtinfo->dummy;
+					break;
+				}
+			}
+
+			if (rtoffset == 0)
+				elog(ERROR, "no rtoffset for plan %s", ur->plan_name);
+		}
+
+		/* If this entry pertains to a dummy subquery, ignore it. */
+		if (dummy)
+			continue;
+
+		/* Offset each entry from the original set. */
+		while ((rtindex = bms_next_member(ur->relids, rtindex)) >= 0)
+			relids = bms_add_member(relids, rtindex + rtoffset);
+
+		/* Store the resulting set. */
+		sj_unique_rtis = lappend(sj_unique_rtis, relids);
+	}
+
+	/*
+	 * Remove any non-unique semijoin query features for which making the rel
+	 * unique wasn't considered.
+	 */
+	foreach_ptr(pgpa_query_feature, qf,
+				walker->query_features[PGPAQF_SEMIJOIN_NON_UNIQUE])
+	{
+		if (list_member(sj_unique_rtis, qf->relids))
+			sj_nonunique_qfs = lappend(sj_nonunique_qfs, qf);
+	}
+	walker->query_features[PGPAQF_SEMIJOIN_NON_UNIQUE] = sj_nonunique_qfs;
+
+	/*
+	 * If we find any cases where analysis of the Plan tree shows that the
+	 * semijoin was made unique but this possibility was never observed to be
+	 * considered during planning, then we have a bug somewhere.
+	 */
+	foreach_ptr(pgpa_query_feature, qf,
+				walker->query_features[PGPAQF_SEMIJOIN_UNIQUE])
+	{
+		if (!list_member(sj_unique_rtis, qf->relids))
+		{
+			StringInfoData buf;
+
+			initStringInfo(&buf);
+			outBitmapset(&buf, qf->relids);
+			elog(ERROR,
+				 "unique semijoin found for relids %s but not observed during planning",
+				 buf.data);
+		}
+	}
+
+	/*
+	 * It's possible for a Gather or Gather Merge query feature to find no
+	 * RTIs when partitionwise aggregation is in use. We shouldn't emit
+	 * something like GATHER_MERGE(()), so instead emit nothing. This means
+	 * that we won't advise either GATHER or GATHER_MERGE or NO_GATHER in
+	 * such cases, which might be something we want to improve in the future.
+	 *
+	 * (Should the Partial Aggregates in such a case be created in an
+	 * UPPERREL_GROUP_AGG with a non-empty relid set? Right now that doesn't
+	 * happen, but it seems like it would make life easier for us if it did.)
+	 */
+	for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t)
+	{
+		List	   *query_features = NIL;
+
+		foreach_ptr(pgpa_query_feature, qf, walker->query_features[t])
+		{
+			if (qf->relids != NULL)
+				query_features = lappend(query_features, qf);
+			else
+				Assert(t == PGPAQF_GATHER || t == PGPAQF_GATHER_MERGE);
+		}
+
+		walker->query_features[t] = query_features;
+	}
+}
+
+/*
+ * Main workhorse for the plan tree walk.
+ *
+ * If within_join_problem is true, we encountered a join at some higher level
+ * of the tree walk and haven't yet descended out of the portion of the plan
+ * tree that is part of that same join problem.
+ * We're no longer in the same join problem if (1) we cross into a different
+ * subquery or (2) we descend through an Append or MergeAppend node, below
+ * which any further joins would be partitionwise joins planned separately
+ * from the outer join problem.
+ *
+ * If join_unroller != NULL, the join unroller code expects us to find a join
+ * that should be unrolled into that object. This implies that we're within a
+ * join problem, but the reverse is not true: when we've traversed all the
+ * joins but are still looking for the scan that is the leaf of the join
+ * tree, join_unroller will be NULL but within_join_problem will be true.
+ *
+ * Each element of active_query_features corresponds to some item of advice
+ * that needs to enumerate all the relations it affects. We add RTIs we find
+ * during tree traversal to each of these query features.
+ *
+ * If beneath_any_gather == true, some higher level of the tree traversal
+ * found a Gather or Gather Merge node.
+ */
+static void
+pgpa_walk_recursively(pgpa_plan_walker_context *walker, Plan *plan,
+					  bool within_join_problem,
+					  pgpa_join_unroller *join_unroller,
+					  List *active_query_features,
+					  bool beneath_any_gather)
+{
+	pgpa_join_unroller *outer_join_unroller = NULL;
+	pgpa_join_unroller *inner_join_unroller = NULL;
+	bool		join_unroller_toplevel = false;
+	ListCell   *lc;
+	List	   *extraplans = NIL;
+	List	   *elided_nodes = NIL;
+
+	Assert(within_join_problem || join_unroller == NULL);
+
+	/*
+	 * Check the future_query_features list to see whether this was
+	 * previously identified as a plan node that needs to be treated as a
+	 * query feature. We must do this before handling elided nodes, because
+	 * if there's an elided node associated with a future query feature, the
+	 * RTIs associated with the elided node should be the only ones
+	 * attributed to the query feature.
+	 */
+	foreach_ptr(pgpa_query_feature, qf, walker->future_query_features)
+	{
+		if (qf->plan == plan)
+		{
+			active_query_features = list_copy(active_query_features);
+			active_query_features = lappend(active_query_features, qf);
+			walker->future_query_features =
+				list_delete_ptr(walker->future_query_features, qf);
+			break;
+		}
+	}
+
+	/*
+	 * Find all elided nodes for this Plan node.
+	 */
+	foreach_node(ElidedNode, n, walker->pstmt->elidedNodes)
+	{
+		if (n->plan_node_id == plan->plan_node_id)
+			elided_nodes = lappend(elided_nodes, n);
+	}
+
+	/* If we found any elided_nodes, handle them. */
+	if (elided_nodes != NIL)
+	{
+		int			num_elided_nodes = list_length(elided_nodes);
+		ElidedNode *last_elided_node;
+
+		/*
+		 * RTIs for the final -- and thus logically uppermost -- elided node
+		 * should be collected for query features passed down by the caller.
+		 * However, elided nodes act as barriers to query features, which
+		 * means that (1) the remaining elided nodes, if any, should be
+		 * ignored for purposes of query features and (2) the list of active
+		 * query features should be reset to empty so that we do not add RTIs
+		 * from the plan node that is logically beneath the elided node to
+		 * the query features passed down from the caller.
+		 */
+		last_elided_node = list_nth(elided_nodes, num_elided_nodes - 1);
+		pgpa_qf_add_rtis(active_query_features,
+						 pgpa_filter_out_join_relids(last_elided_node->relids,
+													 walker->pstmt->rtable));
+		active_query_features = NIL;
+
+		/*
+		 * If we're within a join problem, the join_unroller is responsible
+		 * for building the scan for the final elided node, so throw it out.
+		 */
+		if (within_join_problem)
+			elided_nodes = list_truncate(elided_nodes, num_elided_nodes - 1);
+
+		/* Build scans for all (or the remaining) elided nodes. */
+		foreach_node(ElidedNode, elided_node, elided_nodes)
+		{
+			(void) pgpa_build_scan(walker, plan, elided_node,
+								   beneath_any_gather, within_join_problem);
+		}
+
+		/*
+		 * If there were any elided nodes, then everything beneath those nodes
+		 * is not part of the same join problem.
+		 *
+		 * In more detail, if an Append or MergeAppend was elided, then a
+		 * partitionwise join was chosen and only a single child survived; if
+		 * a SubqueryScan was elided, the subquery was planned without
+		 * flattening it into the parent.
+		 */
+		within_join_problem = false;
+		join_unroller = NULL;
+	}
+
+	/*
+	 * If this is a Gather or Gather Merge node, directly add it to the list
+	 * of currently-active query features. We must do this after handling
+	 * elided nodes, since the Gather or Gather Merge node occurs logically
+	 * beneath any associated elided nodes.
+	 *
+	 * Exception: We disregard any single_copy Gather nodes. These are created
+	 * by debug_parallel_query, and having them affect the plan advice is
+	 * counterproductive, as the result will be to advise the use of a real
+	 * Gather node, rather than a single-copy one.
+	 */
+	if (IsA(plan, Gather) && !((Gather *) plan)->single_copy)
+	{
+		active_query_features =
+			lappend(list_copy(active_query_features),
+					pgpa_add_feature(walker, PGPAQF_GATHER, plan));
+		beneath_any_gather = true;
+	}
+	else if (IsA(plan, GatherMerge))
+	{
+		active_query_features =
+			lappend(list_copy(active_query_features),
+					pgpa_add_feature(walker, PGPAQF_GATHER_MERGE, plan));
+		beneath_any_gather = true;
+	}
+
+	/*
+	 * If we're within a join problem, the join unroller is responsible for
+	 * building any required scan for this node. If not, we do it here.
+	 */
+	if (!within_join_problem)
+		(void) pgpa_build_scan(walker, plan, NULL, beneath_any_gather, false);
+
+	/*
+	 * If this join needs to be unrolled but there's no join unroller already
+	 * available, create one.
+	 */
+	if (join_unroller == NULL && pgpa_is_join(plan))
+	{
+		join_unroller = pgpa_create_join_unroller();
+		join_unroller_toplevel = true;
+		within_join_problem = true;
+	}
+
+	/*
+	 * If this join is to be unrolled, pgpa_unroll_join() will return the join
+	 * unroller object that should be passed down when we recurse into the
+	 * outer and inner sides of the plan.
+	 */
+	if (join_unroller != NULL)
+		pgpa_unroll_join(walker, plan, beneath_any_gather, join_unroller,
+						 &outer_join_unroller, &inner_join_unroller);
+
+	/* Add RTIs from the plan node to all active query features. */
+	pgpa_qf_add_plan_rtis(active_query_features, plan, walker->pstmt->rtable);
+
+	/*
+	 * Recurse into the outer and inner subtrees.
+	 *
+	 * As an exception, if this is a ForeignScan, don't recurse. postgres_fdw
+	 * sometimes stores an EPQ recheck plan in plan->lefttree, but that's going
+	 * to mention the same set of relations as the ForeignScan itself, and we
+	 * have no way to emit advice targeting the EPQ case vs. the non-EPQ case.
+	 * Moreover, it's not entirely clear what other FDWs might do with the
+	 * left and right subtrees. Maybe some better handling is needed here, but
+	 * for now, we just punt.
+	 */
+	if (!IsA(plan, ForeignScan))
+	{
+		if (plan->lefttree != NULL)
+			pgpa_walk_recursively(walker, plan->lefttree, within_join_problem,
+								  outer_join_unroller, active_query_features,
+								  beneath_any_gather);
+		if (plan->righttree != NULL)
+			pgpa_walk_recursively(walker, plan->righttree, within_join_problem,
+								  inner_join_unroller, active_query_features,
+								  beneath_any_gather);
+	}
+
+	/*
+	 * If we created a join unroller up above, then it's also our job to use
+	 * it to build the final pgpa_unrolled_join, and to destroy the object.
+	 */
+	if (join_unroller_toplevel)
+	{
+		pgpa_unrolled_join *ujoin;
+
+		ujoin = pgpa_build_unrolled_join(walker, join_unroller);
+		walker->toplevel_unrolled_joins =
+			lappend(walker->toplevel_unrolled_joins, ujoin);
+		pgpa_destroy_join_unroller(join_unroller);
+		(void) pgpa_process_unrolled_join(walker, ujoin);
+	}
+
+	/*
+	 * Some plan types can have additional children. Nodes like Append that
+	 * can have any number of children store them in a List; a SubqueryScan
+	 * just has a field for a single additional Plan.
+	 */
+	switch (nodeTag(plan))
+	{
+		case T_Append:
+			{
+				Append	   *aplan = (Append *) plan;
+
+				extraplans = aplan->appendplans;
+			}
+			break;
+		case T_MergeAppend:
+			{
+				MergeAppend *maplan = (MergeAppend *) plan;
+
+				extraplans = maplan->mergeplans;
+			}
+			break;
+		case T_BitmapAnd:
+			extraplans = ((BitmapAnd *) plan)->bitmapplans;
+			break;
+		case T_BitmapOr:
+			extraplans = ((BitmapOr *) plan)->bitmapplans;
+			break;
+		case T_SubqueryScan:
+
+			/*
+			 * We don't pass down active_query_features here, because those
+			 * are specific to a subquery level.
+			 */
+			pgpa_walk_recursively(walker, ((SubqueryScan *) plan)->subplan,
+								  0, NULL, NIL, beneath_any_gather);
+			break;
+		case T_CustomScan:
+			extraplans = ((CustomScan *) plan)->custom_plans;
+			break;
+		default:
+			break;
+	}
+
+	/* If we found a list of extra children, iterate over it. */
+	foreach(lc, extraplans)
+	{
+		Plan	   *subplan = lfirst(lc);
+
+		pgpa_walk_recursively(walker, subplan, 0, NULL, NIL,
+							  beneath_any_gather);
+	}
+}
+
+/*
+ * Perform final processing of a newly-constructed pgpa_unrolled_join. This
+ * only needs to be called for toplevel pgpa_unrolled_join objects, since it
+ * recurses to sub-joins as needed.
+ *
+ * Our goal is to add the set of inner relids to the relevant join_strategies
+ * list, and to do the same for any sub-joins. To that end, the return value
+ * is the set of relids found beneath the join, but it is expected that
+ * the toplevel caller will ignore this.
+ */
+static Bitmapset *
+pgpa_process_unrolled_join(pgpa_plan_walker_context *walker,
+						   pgpa_unrolled_join *ujoin)
+{
+	Bitmapset  *all_relids = bms_copy(ujoin->outer.scan->relids);
+
+	/* If this fails, we didn't unroll properly. */
+	Assert(ujoin->outer.unrolled_join == NULL);
+
+	for (int k = 0; k < ujoin->ninner; ++k)
+	{
+		pgpa_join_member *member = &ujoin->inner[k];
+		Bitmapset  *relids;
+
+		if (member->unrolled_join != NULL)
+			relids = pgpa_process_unrolled_join(walker,
+												member->unrolled_join);
+		else
+		{
+			Assert(member->scan != NULL);
+			relids = member->scan->relids;
+		}
+		walker->join_strategies[ujoin->strategy[k]] =
+			lappend(walker->join_strategies[ujoin->strategy[k]], relids);
+		all_relids = bms_add_members(all_relids, relids);
+	}
+
+	return all_relids;
+}
+
+/*
+ * Arrange for the given plan node to be treated as a query feature when the
+ * tree walk reaches it.
+ *
+ * Make sure to only use this for nodes that the tree walk can't have reached
+ * yet!
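+ *
+ * If the walk has already passed the node, the feature will never pick up
+ * any RTIs, and it will linger in future_query_features, which is expected
+ * to be empty by the time the tree walk completes.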
+ */
+void
+pgpa_add_future_feature(pgpa_plan_walker_context *walker,
+						pgpa_qf_type type, Plan *plan)
+{
+	pgpa_query_feature *qf = pgpa_add_feature(walker, type, plan);
+
+	walker->future_query_features =
+		lappend(walker->future_query_features, qf);
+}
+
+/*
+ * Return the last of any elided nodes associated with this plan node ID.
+ *
+ * The last elided node is the one that would have been uppermost in the plan
+ * tree had it not been removed during setrefs processing.
+ */
+ElidedNode *
+pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan)
+{
+	ElidedNode *elided_node = NULL;
+
+	foreach_node(ElidedNode, n, pstmt->elidedNodes)
+	{
+		if (n->plan_node_id == plan->plan_node_id)
+			elided_node = n;
+	}
+
+	return elided_node;
+}
+
+/*
+ * Certain plan nodes can refer to a set of RTIs. Extract and return the set.
+ */
+Bitmapset *
+pgpa_relids(Plan *plan)
+{
+	if (IsA(plan, Result))
+		return ((Result *) plan)->relids;
+	else if (IsA(plan, ForeignScan))
+		return ((ForeignScan *) plan)->fs_relids;
+	else if (IsA(plan, Append))
+		return ((Append *) plan)->apprelids;
+	else if (IsA(plan, MergeAppend))
+		return ((MergeAppend *) plan)->apprelids;
+
+	return NULL;
+}
+
+/*
+ * Extract the scanned RTI from a plan node.
+ *
+ * Returns 0 if there isn't one.
+ */
+Index
+pgpa_scanrelid(Plan *plan)
+{
+	switch (nodeTag(plan))
+	{
+		case T_SeqScan:
+		case T_SampleScan:
+		case T_BitmapHeapScan:
+		case T_TidScan:
+		case T_TidRangeScan:
+		case T_SubqueryScan:
+		case T_FunctionScan:
+		case T_TableFuncScan:
+		case T_ValuesScan:
+		case T_CteScan:
+		case T_NamedTuplestoreScan:
+		case T_WorkTableScan:
+		case T_ForeignScan:
+		case T_CustomScan:
+		case T_IndexScan:
+		case T_IndexOnlyScan:
+			return ((Scan *) plan)->scanrelid;
+		default:
+			return 0;
+	}
+}
+
+/*
+ * Construct a new Bitmapset containing non-RTE_JOIN members of 'relids'.
+ */
+Bitmapset *
+pgpa_filter_out_join_relids(Bitmapset *relids, List *rtable)
+{
+	int			rti = -1;
+	Bitmapset  *result = NULL;
+
+	while ((rti = bms_next_member(relids, rti)) >= 0)
+	{
+		RangeTblEntry *rte = rt_fetch(rti, rtable);
+
+		if (rte->rtekind != RTE_JOIN)
+			result = bms_add_member(result, rti);
+	}
+
+	return result;
+}
+
+/*
+ * Create a pgpa_query_feature and add it to the list of all query features
+ * for this plan.
+ */
+static pgpa_query_feature *
+pgpa_add_feature(pgpa_plan_walker_context *walker,
+				 pgpa_qf_type type, Plan *plan)
+{
+	pgpa_query_feature *qf = palloc0_object(pgpa_query_feature);
+
+	qf->type = type;
+	qf->plan = plan;
+
+	walker->query_features[qf->type] =
+		lappend(walker->query_features[qf->type], qf);
+
+	return qf;
+}
+
+/*
+ * Add a single RTI to each active query feature.
+ */
+static void
+pgpa_qf_add_rti(List *active_query_features, Index rti)
+{
+	foreach_ptr(pgpa_query_feature, qf, active_query_features)
+	{
+		qf->relids = bms_add_member(qf->relids, rti);
+	}
+}
+
+/*
+ * Add a set of RTIs to each active query feature.
+ */
+static void
+pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids)
+{
+	foreach_ptr(pgpa_query_feature, qf, active_query_features)
+	{
+		qf->relids = bms_add_members(qf->relids, relids);
+	}
+}
+
+/*
+ * Add RTIs directly contained in a plan node to each active query feature,
+ * but filter out any join RTIs, since advice doesn't mention those.
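+ *
+ * Nodes that carry a whole relid set (see pgpa_relids) contribute the
+ * filtered set; otherwise, a node's scanrelid, if it has one, contributes a
+ * single RTI.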
+ */
+static void
+pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan, List *rtable)
+{
+	Bitmapset  *relids;
+	Index		rti;
+
+	if ((relids = pgpa_relids(plan)) != NULL)
+	{
+		relids = pgpa_filter_out_join_relids(relids, rtable);
+		pgpa_qf_add_rtis(active_query_features, relids);
+	}
+	else if ((rti = pgpa_scanrelid(plan)) != 0)
+		pgpa_qf_add_rti(active_query_features, rti);
+}
+
+/*
+ * If we generated plan advice using the provided walker object and array
+ * of identifiers, would we generate the specified tag/target combination?
+ *
+ * If yes, the plan conforms to the advice; if no, it does not. Note that
+ * we have no way of knowing whether the planner was forced to emit a plan
+ * that conformed to the advice or just happened to do so.
+ */
+bool
+pgpa_walker_would_advise(pgpa_plan_walker_context *walker,
+						 pgpa_identifier *rt_identifiers,
+						 pgpa_advice_tag_type tag,
+						 pgpa_advice_target *target)
+{
+	Index		rtable_length = list_length(walker->pstmt->rtable);
+	Bitmapset  *relids = NULL;
+
+	if (tag == PGPA_TAG_JOIN_ORDER)
+	{
+		foreach_ptr(pgpa_unrolled_join, ujoin, walker->toplevel_unrolled_joins)
+		{
+			if (pgpa_walker_join_order_matches(ujoin, rtable_length,
+											   rt_identifiers, target, true))
+				return true;
+		}
+
+		return false;
+	}
+
+	if (target->ttype == PGPA_TARGET_IDENTIFIER)
+	{
+		Index		rti;
+
+		rti = pgpa_compute_rti_from_identifier(rtable_length, rt_identifiers,
+											   &target->rid);
+		if (rti == 0)
+			return false;
+		relids = bms_make_singleton(rti);
+	}
+	else
+	{
+		Assert(target->ttype == PGPA_TARGET_ORDERED_LIST);
+		foreach_ptr(pgpa_advice_target, child_target, target->children)
+		{
+			Index		rti;
+
+			Assert(child_target->ttype == PGPA_TARGET_IDENTIFIER);
+			rti = pgpa_compute_rti_from_identifier(rtable_length,
+												   rt_identifiers,
+												   &child_target->rid);
+			if (rti == 0)
+				return false;
+			relids = bms_add_member(relids, rti);
+		}
+	}
+
+	switch (tag)
+	{
+		case PGPA_TAG_JOIN_ORDER:
+			/* should have been handled above */
+			pg_unreachable();
+			break;
+		case PGPA_TAG_BITMAP_HEAP_SCAN:
+			return pgpa_walker_find_scan(walker,
+										 PGPA_SCAN_BITMAP_HEAP,
+										 relids) != NULL;
+		case PGPA_TAG_FOREIGN_JOIN:
+			return pgpa_walker_find_scan(walker,
+										 PGPA_SCAN_FOREIGN,
+										 relids) != NULL;
+		case PGPA_TAG_INDEX_ONLY_SCAN:
+			{
+				pgpa_scan  *scan;
+
+				scan = pgpa_walker_find_scan(walker, PGPA_SCAN_INDEX_ONLY,
+											 relids);
+				if (scan == NULL)
+					return false;
+
+				return pgpa_walker_index_target_matches_plan(target->itarget, scan->plan);
+			}
+		case PGPA_TAG_INDEX_SCAN:
+			{
+				pgpa_scan  *scan;
+
+				scan = pgpa_walker_find_scan(walker, PGPA_SCAN_INDEX,
+											 relids);
+				if (scan == NULL)
+					return false;
+
+				return pgpa_walker_index_target_matches_plan(target->itarget, scan->plan);
+			}
+		case PGPA_TAG_PARTITIONWISE:
+			return pgpa_walker_find_scan(walker,
+										 PGPA_SCAN_PARTITIONWISE,
+										 relids) != NULL;
+		case PGPA_TAG_SEQ_SCAN:
+			return pgpa_walker_find_scan(walker,
+										 PGPA_SCAN_SEQ,
+										 relids) != NULL;
+		case PGPA_TAG_TID_SCAN:
+			return pgpa_walker_find_scan(walker,
+										 PGPA_SCAN_TID,
+										 relids) != NULL;
+		case PGPA_TAG_GATHER:
+			return pgpa_walker_contains_feature(walker,
+												PGPAQF_GATHER,
+												relids);
+		case PGPA_TAG_GATHER_MERGE:
+			return pgpa_walker_contains_feature(walker,
+												PGPAQF_GATHER_MERGE,
+												relids);
+		case PGPA_TAG_SEMIJOIN_NON_UNIQUE:
+			return pgpa_walker_contains_feature(walker,
+												PGPAQF_SEMIJOIN_NON_UNIQUE,
+												relids);
+		case PGPA_TAG_SEMIJOIN_UNIQUE:
+			return pgpa_walker_contains_feature(walker,
+												PGPAQF_SEMIJOIN_UNIQUE,
+												relids);
+		case PGPA_TAG_HASH_JOIN:
+			return pgpa_walker_contains_join(walker,
+											 JSTRAT_HASH_JOIN,
+											 relids);
+		case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
+			return pgpa_walker_contains_join(walker,
+											 JSTRAT_MERGE_JOIN_MATERIALIZE,
+											 relids);
+		case PGPA_TAG_MERGE_JOIN_PLAIN:
+			return pgpa_walker_contains_join(walker,
+											 JSTRAT_MERGE_JOIN_PLAIN,
+											 relids);
+		case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
+			return pgpa_walker_contains_join(walker,
+											 JSTRAT_NESTED_LOOP_MATERIALIZE,
+											 relids);
+		case PGPA_TAG_NESTED_LOOP_MEMOIZE:
+			return pgpa_walker_contains_join(walker,
+											 JSTRAT_NESTED_LOOP_MEMOIZE,
+											 relids);
+		case PGPA_TAG_NESTED_LOOP_PLAIN:
+			return pgpa_walker_contains_join(walker,
+											 JSTRAT_NESTED_LOOP_PLAIN,
+											 relids);
+		case PGPA_TAG_NO_GATHER:
+			return pgpa_walker_contains_no_gather(walker, relids);
+	}
+
+	/* should not get here */
+	return false;
+}
+
+/*
+ * Does the index target match the Plan?
+ *
+ * Should only be called when we know that itarget mandates an Index Scan or
+ * Index Only Scan and this corresponds to the type of Plan. Here, our job is
+ * just to check whether it's the same index.
+ */
+static bool
+pgpa_walker_index_target_matches_plan(pgpa_index_target *itarget, Plan *plan)
+{
+	Oid			indexoid = InvalidOid;
+
+	/* Retrieve the index OID from the plan. */
+	if (IsA(plan, IndexScan))
+		indexoid = ((IndexScan *) plan)->indexid;
+	else if (IsA(plan, IndexOnlyScan))
+		indexoid = ((IndexOnlyScan *) plan)->indexid;
+	else
+		elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan));
+
+	/* Check whether schema name matches, if specified in index target. */
+	if (itarget->indnamespace != NULL)
+	{
+		Oid			nspoid = get_rel_namespace(indexoid);
+		char	   *relnamespace = get_namespace_name_or_temp(nspoid);
+
+		if (strcmp(itarget->indnamespace, relnamespace) != 0)
+			return false;
+	}
+
+	/* Check whether index name matches. */
+	return (strcmp(itarget->indname, get_rel_name(indexoid)) == 0);
+}
+
+/*
+ * Does an unrolled join match the join order specified by an advice target?
+ */
+static bool
+pgpa_walker_join_order_matches(pgpa_unrolled_join *ujoin,
+							   Index rtable_length,
+							   pgpa_identifier *rt_identifiers,
+							   pgpa_advice_target *target,
+							   bool toplevel)
+{
+	int			nchildren = list_length(target->children);
+
+	Assert(target->ttype == PGPA_TARGET_ORDERED_LIST);
+
+	/* At toplevel, we allow a prefix match. */
+	if (toplevel)
+	{
+		if (nchildren > ujoin->ninner + 1)
+			return false;
+	}
+	else
+	{
+		if (nchildren != ujoin->ninner + 1)
+			return false;
+	}
+
+	/* Outermost rel must match. */
+	if (!pgpa_walker_join_order_matches_member(&ujoin->outer,
+											   rtable_length,
+											   rt_identifiers,
+											   linitial(target->children)))
+		return false;
+
+	/* Each inner rel must match. */
+	for (int n = 0; n < nchildren - 1; ++n)
+	{
+		pgpa_advice_target *child_target = list_nth(target->children, n + 1);
+
+		if (!pgpa_walker_join_order_matches_member(&ujoin->inner[n],
+												   rtable_length,
+												   rt_identifiers,
+												   child_target))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Does one member of an unrolled join match an advice target?
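+ *
+ * A member that is itself an unrolled join can only match an ordered
+ * sublist; a scan member matches either a single identifier or an unordered
+ * list whose RTIs are exactly the scan's relids.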
+ */ +static bool +pgpa_walker_join_order_matches_member(pgpa_join_member *member, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target) +{ + Bitmapset *relids = NULL; + + if (member->unrolled_join != NULL) + { + if (target->ttype != PGPA_TARGET_ORDERED_LIST) + return false; + return pgpa_walker_join_order_matches(member->unrolled_join, + rtable_length, + rt_identifiers, + target, + false); + } + + Assert(member->scan != NULL); + switch (target->ttype) + { + case PGPA_TARGET_ORDERED_LIST: + /* Could only match an unrolled join */ + return false; + + case PGPA_TARGET_UNORDERED_LIST: + { + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + Index rti; + + rti = pgpa_compute_rti_from_identifier(rtable_length, + rt_identifiers, + &child_target->rid); + if (rti == 0) + return false; + relids = bms_add_member(relids, rti); + } + break; + } + + case PGPA_TARGET_IDENTIFIER: + { + Index rti; + + rti = pgpa_compute_rti_from_identifier(rtable_length, + rt_identifiers, + &target->rid); + if (rti == 0) + return false; + relids = bms_make_singleton(rti); + break; + } + } + + return bms_equal(member->scan->relids, relids); +} + +/* + * Find the scan where the walker says that the given scan strategy should be + * used for the given relid set, if one exists. + * + * Returns the pgpa_scan object, or NULL if none was found. + */ +static pgpa_scan * +pgpa_walker_find_scan(pgpa_plan_walker_context *walker, + pgpa_scan_strategy strategy, + Bitmapset *relids) +{ + List *scans = walker->scans[strategy]; + + foreach_ptr(pgpa_scan, scan, scans) + { + if (bms_equal(scan->relids, relids)) + return scan; + } + + return NULL; +} + +/* + * Does this walker say that the given query feature applies to the given + * relid set? + */ +static bool +pgpa_walker_contains_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Bitmapset *relids) +{ + List *query_features = walker->query_features[type]; + + foreach_ptr(pgpa_query_feature, qf, query_features) + { + if (bms_equal(qf->relids, relids)) + return true; + } + + return false; +} + +/* + * Does the walker say that the given join strategy should be used for the + * given relid set? + */ +static bool +pgpa_walker_contains_join(pgpa_plan_walker_context *walker, + pgpa_join_strategy strategy, + Bitmapset *relids) +{ + List *join_strategies = walker->join_strategies[strategy]; + + foreach_ptr(Bitmapset, jsrelids, join_strategies) + { + if (bms_equal(jsrelids, relids)) + return true; + } + + return false; +} + +/* + * Does the walker say that the given relids should be marked as NO_GATHER? 
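+ *
+ * Unlike the predicates above, this is a subset test rather than an exact
+ * match: every relid given must be a scan that appears beneath no Gather or
+ * Gather Merge node.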
+ */
+static bool
+pgpa_walker_contains_no_gather(pgpa_plan_walker_context *walker,
+							   Bitmapset *relids)
+{
+	return bms_is_subset(relids, walker->no_gather_scans);
+}
diff --git a/contrib/pg_plan_advice/pgpa_walker.h b/contrib/pg_plan_advice/pgpa_walker.h
new file mode 100644
index 0000000000..b37e209dcc
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_walker.h
@@ -0,0 +1,141 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_walker.h
+ *	  Main entrypoints for analyzing a plan to generate an advice string
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_walker.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_WALKER_H
+#define PGPA_WALKER_H
+
+#include "pgpa_ast.h"
+#include "pgpa_join.h"
+#include "pgpa_scan.h"
+
+/*
+ * When generating advice, we should emit either SEMIJOIN_UNIQUE advice or
+ * SEMIJOIN_NON_UNIQUE advice for each semijoin depending on whether we chose
+ * to implement it as a semijoin or whether we instead chose to make the
+ * nullable side unique and then perform an inner join. When the make-unique
+ * strategy is not chosen, it's not easy to tell from the final plan tree
+ * whether it was considered. That's awkward, because we don't want to emit
+ * useless SEMIJOIN_NON_UNIQUE advice when there was no decision to be made.
+ *
+ * To avoid that, during planning, we create a pgpa_sj_unique_rel for each
+ * relation that we considered making unique for purposes of semijoin planning.
+ */
+typedef struct pgpa_sj_unique_rel
+{
+	char	   *plan_name;
+	Bitmapset  *relids;
+} pgpa_sj_unique_rel;
+
+/*
+ * We use the term "query feature" to refer to plan nodes that are interesting
+ * in the following way: to generate advice, we'll need to know the set of
+ * same-subquery, non-join RTIs occurring at or below that plan node, without
+ * admixture of parent and child RTIs.
+ *
+ * For example, Gather nodes, designated by PGPAQF_GATHER, and Gather Merge
+ * nodes, designated by PGPAQF_GATHER_MERGE, are query features, because we'll
+ * want to admit some kind of advice that describes the portion of the plan
+ * tree that appears beneath those nodes.
+ *
+ * Each semijoin can be implemented either by directly performing a semijoin,
+ * or by making one side unique and then performing a normal join. Either way,
+ * we use a query feature to notice what decision was made, so that we can
+ * describe it by enumerating the RTIs on that side of the join.
+ *
+ * To elaborate on the "no admixture of parent and child RTIs" rule, in all of
+ * these cases, if the entirety of an inheritance hierarchy appears beneath
+ * the query feature, we only want to name the parent table. But it's also
+ * possible to have cases where we must name child tables. This is particularly
+ * likely to happen when partitionwise join is in use, but could happen for
+ * Gather or Gather Merge even without that, if one of those appears below
+ * an Append or MergeAppend node for a single table.
+ */
+typedef enum pgpa_qf_type
+{
+	PGPAQF_GATHER,
+	PGPAQF_GATHER_MERGE,
+	PGPAQF_SEMIJOIN_NON_UNIQUE,
+	PGPAQF_SEMIJOIN_UNIQUE
+	/* update NUM_PGPA_QF_TYPES if you add anything here */
+} pgpa_qf_type;
+
+#define NUM_PGPA_QF_TYPES ((int) PGPAQF_SEMIJOIN_UNIQUE + 1)
+
+/*
+ * For each query feature, we keep track of the feature type and the set of
+ * relids that we found underneath the relevant plan node. See the comments
+ * on pgpa_qf_type, above, for additional details.
+ */
+typedef struct pgpa_query_feature
+{
+	pgpa_qf_type type;
+	Plan	   *plan;
+	Bitmapset  *relids;
+} pgpa_query_feature;
+
+/*
+ * Context object for plan tree walk.
+ *
+ * pstmt is the PlannedStmt we're studying.
+ *
+ * scans is an array of lists of pgpa_scan objects. The array is indexed by
+ * the scan's pgpa_scan_strategy.
+ *
+ * no_gather_scans is the set of scan RTIs that do not appear beneath any
+ * Gather or Gather Merge node.
+ *
+ * toplevel_unrolled_joins is a list of all pgpa_unrolled_join objects that
+ * are not a child of some other pgpa_unrolled_join.
+ *
+ * join_strategies is an array of lists of Bitmapset objects. Each Bitmapset
+ * is the set of relids that appears on the inner side of some join (excluding
+ * RTIs from partition children and subqueries). The array is indexed by
+ * pgpa_join_strategy.
+ *
+ * query_features is an array of lists of pgpa_query_feature objects, indexed
+ * by pgpa_qf_type.
+ *
+ * future_query_features is only used during the plan tree walk and should
+ * be empty when the tree walk concludes. It is a list of pgpa_query_feature
+ * objects for Plan nodes that the plan tree walk has not yet encountered;
+ * when encountered, they will be moved to the list of active query features
+ * that is propagated via the call stack.
+ */
+typedef struct pgpa_plan_walker_context
+{
+	PlannedStmt *pstmt;
+	List	   *scans[NUM_PGPA_SCAN_STRATEGY];
+	Bitmapset  *no_gather_scans;
+	List	   *toplevel_unrolled_joins;
+	List	   *join_strategies[NUM_PGPA_JOIN_STRATEGY];
+	List	   *query_features[NUM_PGPA_QF_TYPES];
+	List	   *future_query_features;
+} pgpa_plan_walker_context;
+
+extern void pgpa_plan_walker(pgpa_plan_walker_context *walker,
+							 PlannedStmt *pstmt,
+							 List *sj_unique_rels);
+
+extern void pgpa_add_future_feature(pgpa_plan_walker_context *walker,
+									pgpa_qf_type type,
+									Plan *plan);
+
+extern ElidedNode *pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan);
+extern Bitmapset *pgpa_relids(Plan *plan);
+extern Index pgpa_scanrelid(Plan *plan);
+extern Bitmapset *pgpa_filter_out_join_relids(Bitmapset *relids, List *rtable);
+
+extern bool pgpa_walker_would_advise(pgpa_plan_walker_context *walker,
+									 pgpa_identifier *rt_identifiers,
+									 pgpa_advice_tag_type tag,
+									 pgpa_advice_target *target);
+
+#endif
diff --git a/contrib/pg_plan_advice/sql/gather.sql b/contrib/pg_plan_advice/sql/gather.sql
new file mode 100644
index 0000000000..776666bf19
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/gather.sql
@@ -0,0 +1,86 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 1;
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET debug_parallel_query = off;
+
+CREATE TABLE gt_dim (id serial primary key, dim text)
+  WITH (autovacuum_enabled = false);
+INSERT INTO gt_dim (dim) SELECT random()::text FROM generate_series(1,100) g;
+VACUUM ANALYZE gt_dim;
+
+CREATE TABLE gt_fact (
+    id int not null,
+    dim_id integer not null references gt_dim (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO gt_fact
+  SELECT g, (g%3)+1 FROM generate_series(1,100000) g;
+VACUUM ANALYZE gt_fact;
+
+-- By default, we expect Gather Merge with a parallel hash join.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+
+-- Force Gather or Gather Merge of both relations together.
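+-- (The doubled parentheses make f and d a single sublist, requesting one
+-- Gather or Gather Merge node placed above the join of the two relations.)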
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Force a separate Gather or Gather Merge operation for each relation.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather((d d/d.d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Force a Gather or Gather Merge on one relation but no parallelism on the
+-- other.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(f) no_gather(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(d) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather(f) no_gather(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather(d) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Force no Gather or Gather Merge use at all.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'no_gather(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Can't force Gather Merge without the ORDER BY clause, but just Gather is OK.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'gather((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+  SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id;
+COMMIT;
+
+-- Test conflicting advice.
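+-- (gather((f d)) asks for one Gather above the join of f and d, while
+-- no_gather(f) forbids f from appearing under any Gather, so the two pieces
+-- of advice cannot both be satisfied.)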
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather((f d)) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/join_order.sql b/contrib/pg_plan_advice/sql/join_order.sql new file mode 100644 index 0000000000..5e16e54efa --- /dev/null +++ b/contrib/pg_plan_advice/sql/join_order.sql @@ -0,0 +1,145 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; + +CREATE TABLE jo_dim1 (id integer primary key, dim1 text, val1 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,100) g; +VACUUM ANALYZE jo_dim1; +CREATE TABLE jo_dim2 (id integer primary key, dim2 text, val2 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim2 (id, dim2, val2) + SELECT g, 'some filler text ' || g, (g % 7) + 1 + FROM generate_series(1,1000) g; +VACUUM ANALYZE jo_dim2; + +CREATE TABLE jo_fact ( + id int primary key, + dim1_id integer not null references jo_dim1 (id), + dim2_id integer not null references jo_dim2 (id) +) WITH (autovacuum_enabled = false); +INSERT INTO jo_fact + SELECT g, (g%100)+1, (g%100)+1 FROM generate_series(1,100000) g; +VACUUM ANALYZE jo_fact; + +-- We expect to join to d2 first and then d1, since the condition on d2 +-- is more selective. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + +-- Force a few different join orders. Some of these are very inefficient, +-- but the planner considers them all viable. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(d1 f d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f {d1 d2})'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +COMMIT; + +-- Force a join order by mentioning just a prefix of the join list. 
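+-- (A toplevel JOIN_ORDER target may be a prefix of the full join list: only
+-- the leading relations are pinned, and the planner orders the rest.)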
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +COMMIT; + +-- jo_fact is not partitioned, but let's try pretending that it is and +-- verifying that the advice does not apply. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +COMMIT; + +-- The unusual formulation of this query is intended to prevent the query +-- planner from reducing the FULL JOIN to some other join type, so that we +-- can test what happens with a join type that cannot be reordered. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + +-- We should not be able to force the planner to join f to d1 first, because +-- that is not a valid join order, but we should be able to force the planner +-- to make either d2 or f the driving table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +SET LOCAL pg_plan_advice.advice = 'join_order(d2 f d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +COMMIT; + +-- Two incompatible join orders should conflict. In the second case, +-- the conflict is implicit: if d1 is on the inner side of a join of any +-- type, it cannot also be the driving table. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f) join_order(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +SET LOCAL pg_plan_advice.advice = 'join_order(d1) hash_join(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/join_strategy.sql b/contrib/pg_plan_advice/sql/join_strategy.sql new file mode 100644 index 0000000000..edd5c4c0e1 --- /dev/null +++ b/contrib/pg_plan_advice/sql/join_strategy.sql @@ -0,0 +1,84 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; + +CREATE TABLE join_dim (id serial primary key, dim text) + WITH (autovacuum_enabled = false); +INSERT INTO join_dim (dim) SELECT random()::text FROM generate_series(1,100) g; +VACUUM ANALYZE join_dim; + +CREATE TABLE join_fact ( + id int primary key, + dim_id integer not null references join_dim (id) +) WITH (autovacuum_enabled = false); +INSERT INTO join_fact + SELECT g, (g%3)+1 FROM generate_series(1,100000) g; +CREATE INDEX join_fact_dim_id ON join_fact (dim_id); +VACUUM ANALYZE join_fact; + +-- We expect a hash join by default. +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + +-- Try forcing each join method in turn with join_dim as the inner table. +-- All of these should work except for MERGE_JOIN_MATERIALIZE; that will +-- fail, because the planner knows that join_dim (id) is unique, and will +-- refuse to add mark/restore overhead. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +COMMIT; + +-- Now try forcing each join method in turn with join_fact as the inner +-- table. All of these should work. 
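+-- (Unlike join_dim (id), join_fact (dim_id) is not unique, so here even
+-- MERGE_JOIN_MATERIALIZE should be accepted.)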
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +COMMIT; + +-- Non-working cases. We can't force a foreign join between these tables, +-- because they aren't foreign tables. We also can't use two different +-- strategies on the same table, nor can we put both tables on the inner +-- side of the same join. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'FOREIGN_JOIN((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f) NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/local_collector.sql b/contrib/pg_plan_advice/sql/local_collector.sql new file mode 100644 index 0000000000..3225dd9905 --- /dev/null +++ b/contrib/pg_plan_advice/sql/local_collector.sql @@ -0,0 +1,46 @@ +CREATE EXTENSION pg_plan_advice; +SET debug_parallel_query = off; + +-- Try clearing advice before we've collected any. +SELECT pg_clear_collected_local_advice(); + +-- Set a small advice collection limit so that we'll exceed it. +SET pg_plan_advice.local_collection_limit = 2; + +-- Enable the collector. +SET pg_plan_advice.local_collector = on; + +-- Set up a dummy table. +CREATE TABLE dummy_table (a int primary key, b text) + WITH (autovacuum_enabled = false, parallel_workers = 0); + +-- Test queries. +SELECT * FROM dummy_table a, dummy_table b; +SELECT * FROM dummy_table; + +-- Should return the advice from the second test query. +SET pg_plan_advice.local_collector = off; +SELECT advice FROM pg_get_collected_local_advice() ORDER BY id DESC LIMIT 1; + +-- Now try clearing advice again. +SELECT pg_clear_collected_local_advice(); + +-- Raise the collection limit so that the collector uses multiple chunks. +SET pg_plan_advice.local_collection_limit = 2000; +SET pg_plan_advice.local_collector = on; + +-- Push a bunch of queries through the collector. +DO $$ +BEGIN + FOR x IN 1..2000 LOOP + EXECUTE 'SELECT * FROM dummy_table'; + END LOOP; +END +$$; + +-- Check that the collector worked. +SELECT COUNT(*) FROM pg_get_collected_local_advice(); + +-- And clear one more time, to verify that this doesn't cause a problem +-- even with a larger number of entries. 
+SELECT pg_clear_collected_local_advice(); diff --git a/contrib/pg_plan_advice/sql/partitionwise.sql b/contrib/pg_plan_advice/sql/partitionwise.sql new file mode 100644 index 0000000000..c51456dbbb --- /dev/null +++ b/contrib/pg_plan_advice/sql/partitionwise.sql @@ -0,0 +1,99 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET enable_partitionwise_join = true; + +CREATE TABLE pt1 (id integer primary key, dim1 text, val1 int) + PARTITION BY RANGE (id); +CREATE TABLE pt1a PARTITION OF pt1 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1b PARTITION OF pt1 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1c PARTITION OF pt1 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE pt1; + +CREATE TABLE pt2 (id integer primary key, dim2 text, val2 int) + PARTITION BY RANGE (id); +CREATE TABLE pt2a PARTITION OF pt2 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2b PARTITION OF pt2 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2c PARTITION OF pt2 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt2 (id, dim2, val2) + SELECT g, 'some other text ' || g, (g % 5) + 1 + FROM generate_series(1,3000,2) g; +VACUUM ANALYZE pt2; + +CREATE TABLE pt3 (id integer primary key, dim3 text, val3 int) + PARTITION BY RANGE (id); +CREATE TABLE pt3a PARTITION OF pt3 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3b PARTITION OF pt3 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3c PARTITION OF pt3 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt3 (id, dim3, val3) + SELECT g, 'a third random text ' || g, (g % 7) + 1 + FROM generate_series(1,3000,3) g; +VACUUM ANALYZE pt3; + +CREATE TABLE ptmismatch (id integer primary key, dimm text, valm int) + PARTITION BY RANGE (id); +CREATE TABLE ptmismatcha PARTITION OF ptmismatch + FOR VALUES FROM (1) to (1501) + WITH (autovacuum_enabled = false); +CREATE TABLE ptmismatchb PARTITION OF ptmismatch + FOR VALUES FROM (1501) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO ptmismatch (id, dimm, valm) + SELECT g, 'yet another text ' || g, (g % 2) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE ptmismatch; + +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + +-- Suppress partitionwise join, or do it just partially. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE(pt1 pt2 pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; +COMMIT; + +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) (pt1 pt3))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; +COMMIT; + +-- Can't force a partitionwise join with a mismatched table. 
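+-- (ptmismatch is split at 1501 rather than at 1001 and 2001, so its
+-- partition bounds don't line up with pt1's and no partitionwise join path
+-- ever gets built.)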
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 ptmismatch))';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, ptmismatch WHERE pt1.id = ptmismatch.id;
+COMMIT;
+
+-- Force join order for a particular branch of the partitionwise join with
+-- and without mentioning the schema name.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+  AND val1 = 1 AND val2 = 1 AND val3 = 1;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+  AND val1 = 1 AND val2 = 1 AND val3 = 1;
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/prepared.sql b/contrib/pg_plan_advice/sql/prepared.sql
new file mode 100644
index 0000000000..3ec30eedee
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/prepared.sql
@@ -0,0 +1,37 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+
+CREATE TABLE ptab (id integer, val text) WITH (autovacuum_enabled = false);
+
+SET pg_plan_advice.always_store_advice_details = false;
+
+-- Not prepared, so advice should be generated.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM ptab;
+
+-- Prepared, so advice should not be generated.
+PREPARE pt1 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt1;
+
+SET pg_plan_advice.always_store_advice_details = true;
+
+-- Prepared, but always_store_advice_details = true, so should show advice.
+PREPARE pt2 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2;
+
+-- Not prepared, so feedback should be generated.
+SET pg_plan_advice.always_store_advice_details = false;
+SET pg_plan_advice.advice = 'SEQ_SCAN(ptab)';
+EXPLAIN (COSTS OFF)
+SELECT * FROM ptab;
+
+-- Prepared, so feedback should not be generated.
+PREPARE pt3 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF) EXECUTE pt3;
+
+SET pg_plan_advice.always_store_advice_details = true;
+
+-- Prepared, but always_store_advice_details = true, so should show feedback.
+PREPARE pt4 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF) EXECUTE pt4;
+
diff --git a/contrib/pg_plan_advice/sql/scan.sql b/contrib/pg_plan_advice/sql/scan.sql
new file mode 100644
index 0000000000..4fc494c7d8
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/scan.sql
@@ -0,0 +1,195 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+SET seq_page_cost = 0.1;
+SET random_page_cost = 0.1;
+SET cpu_tuple_cost = 0;
+SET cpu_index_tuple_cost = 0;
+
+CREATE TABLE scan_table (a int primary key, b text)
+  WITH (autovacuum_enabled = false);
+INSERT INTO scan_table
+  SELECT g, 'some text ' || g FROM generate_series(1, 100000) g;
+CREATE INDEX scan_table_b ON scan_table USING brin (b);
+VACUUM ANALYZE scan_table;
+
+-- Sequential scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+
+-- Index scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+
+-- Index-only scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+
+-- Bitmap heap scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE b > 'some text 8';
+
+-- TID scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+
+-- TID range scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+
+-- Try forcing each of our test queries to use the scan type they
+-- wanted to use anyway. This should succeed.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE b > 'some text 8';
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+COMMIT;
+
+-- Try to force a full scan of the table to use some other scan type. All
+-- of these will fail. An index scan or bitmap heap scan could potentially
+-- generate the correct answer, but the planner does not even consider these
+-- possibilities due to the lack of a WHERE clause.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+COMMIT;
+
+-- Try again to force index use. This should now succeed for the INDEX_SCAN
+-- and BITMAP_HEAP_SCAN, but the INDEX_ONLY_SCAN can't be forced because the
+-- query fetches columns not included in the index.
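+-- (scan_table_pkey covers only column a, while SELECT * also needs b.)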
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+COMMIT;
+
+-- We can force a primary key lookup to use a sequential scan, but we
+-- can't force it to use an index-only scan (due to the column list)
+-- or a TID scan (due to the absence of a TID qual).
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- We can forcibly downgrade an index-only scan to an index scan, but we can't
+-- force the use of an index that the planner thinks is inapplicable.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- We can force the use of a sequential scan in place of a bitmap heap scan,
+-- but a plain index scan on a BRIN index is not possible.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE b > 'some text 8';
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE b > 'some text 8';
+COMMIT;
+
+-- We can force the use of a sequential scan rather than a TID scan or
+-- TID range scan.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+COMMIT;
+
+-- Test more complex scenarios with index scans.
+BEGIN;
+-- Should still work if we mention the schema.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+-- But not if we mention the wrong schema.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table cilbup.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+-- It's OK to repeat the same advice.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+-- But it doesn't work if the index target is even notionally different.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- Test assorted incorrect advice.
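+-- (Each of these names a relation or index that doesn't appear in the
+-- query, so the advice should fail to match rather than change the plan.)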
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(nothing)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +COMMIT; + +-- Test our ability to refer to multiple instances of the same alias. +BEGIN; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s) SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +COMMIT; + +-- Test our ability to refer to scans within a subquery. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +BEGIN; +-- Should not match. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +-- Should match first query only. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@x)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +-- Should match second query only. 
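+-- (A subquery that has no alias is referred to by the generated name
+-- unnamed_subquery.)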
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@unnamed_subquery)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/semijoin.sql b/contrib/pg_plan_advice/sql/semijoin.sql
new file mode 100644
index 0000000000..5a4ae52d1d
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/semijoin.sql
@@ -0,0 +1,118 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+
+CREATE TABLE sj_wide (
+    id integer primary key,
+    val1 integer,
+    padding text storage plain
+) WITH (autovacuum_enabled = false);
+INSERT INTO sj_wide
+  SELECT g, g%10+1, repeat(' ', 300) FROM generate_series(1, 1000) g;
+CREATE INDEX ON sj_wide (val1);
+VACUUM ANALYZE sj_wide;
+
+CREATE TABLE sj_narrow (
+    id integer primary key,
+    val1 integer
+) WITH (autovacuum_enabled = false);
+INSERT INTO sj_narrow
+  SELECT g, g%10+1 FROM generate_series(1, 1000) g;
+CREATE INDEX ON sj_narrow (val1);
+VACUUM ANALYZE sj_narrow;
+
+-- We expect this to make the VALUES list unique and use index lookups to
+-- find the rows in sj_wide, so as to avoid a full scan of sj_wide.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+
+-- If we ask for a unique semijoin, we should get the same plan as with
+-- no advice. If we ask for a non-unique semijoin, we should see a Semi
+-- Join operation in the plan tree.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+COMMIT;
+
+-- Because this table is narrower than the previous one, a sequential scan
+-- is less expensive, and we choose a straightforward Semi Join plan by
+-- default. (Note that this is also very sensitive to the length of the IN
+-- list, which affects how many index lookups the alternative plan will need.)
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+
+-- Here, we expect advising a unique semijoin to switch to the same plan that
+-- we got with sj_wide, and advising a non-unique semijoin should not change
+-- the plan.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+  WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+COMMIT;
+
+-- In the above example, we made the outer side of the join unique, but here,
+-- we should make the inner side unique.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+  WHERE g in (select val1 from sj_narrow);
+
+-- We should be able to force a plan with or without the make-unique strategy,
+-- with either side as the driving table.
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +COMMIT; + +-- However, mentioning the wrong side of the join should result in an advice +-- failure. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +COMMIT; + +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +COMMIT; + +-- Try applying SEMIJOIN_UNIQUE() to a non-semijoin. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g, sj_narrow s WHERE g = s.val1; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/syntax.sql b/contrib/pg_plan_advice/sql/syntax.sql new file mode 100644 index 0000000000..56a5d54e2b --- /dev/null +++ b/contrib/pg_plan_advice/sql/syntax.sql @@ -0,0 +1,68 @@ +LOAD 'pg_plan_advice'; + +-- An empty string is allowed. Empty target lists are allowed for most advice +-- tags, but not for JOIN_ORDER. "Supplied Plan Advice" should be omitted in +-- text format when there is no actual advice, but not in non-text format. +SET pg_plan_advice.advice = ''; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_SCAN()'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'NESTED_LOOP_PLAIN()'; +EXPLAIN (COSTS OFF, FORMAT JSON) SELECT 1; +SET pg_plan_advice.advice = 'JOIN_ORDER()'; + +-- Test assorted variations in capitalization, whitespace, and which parts of +-- the relation identifier are included. These should all work. +SET pg_plan_advice.advice = 'SEQ_SCAN(x)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'seq_scan(x@y)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_scan(x#2)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_SCAN (x/y)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = ' SEQ_SCAN ( x / y . z ) '; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_SCAN("x"#2/"y"."z"@"t")'; +EXPLAIN (COSTS OFF) SELECT 1; + +-- Syntax errors. 
+SET pg_plan_advice.advice = 'SEQUENTIAL_SCAN(x)'; +SET pg_plan_advice.advice = 'SEQ_SCAN'; +SET pg_plan_advice.advice = 'SEQ_SCAN('; +SET pg_plan_advice.advice = 'SEQ_SCAN("'; +SET pg_plan_advice.advice = 'SEQ_SCAN("")'; +SET pg_plan_advice.advice = 'SEQ_SCAN("a"'; +SET pg_plan_advice.advice = 'SEQ_SCAN(#'; +SET pg_plan_advice.advice = '()'; +SET pg_plan_advice.advice = '123'; + +-- Tags like SEQ_SCAN and NO_GATHER don't allow sublists at all; other tags, +-- except for JOIN_ORDER, allow at most one level of sublist. Hence, these +-- examples should error out. +SET pg_plan_advice.advice = 'SEQ_SCAN((x))'; +SET pg_plan_advice.advice = 'GATHER(((x)))'; + +-- Legal comments. +SET pg_plan_advice.advice = '/**/'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'HASH_JOIN(_)/***/'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(/*x*/y)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(y//*x*/z)'; +EXPLAIN (COSTS OFF) SELECT 1; + +-- Unterminated comments. +SET pg_plan_advice.advice = '/*'; +SET pg_plan_advice.advice = 'JOIN_ORDER("fOO") /* oops'; + +-- Nested comments are not supported, so the first of these is legal and +-- the second is not. +SET pg_plan_advice.advice = '/*/*/'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = '/*/* stuff */*/'; + +-- Foreign join requires multiple relation identifiers. +SET pg_plan_advice.advice = 'FOREIGN_JOIN(a)'; +SET pg_plan_advice.advice = 'FOREIGN_JOIN((a))'; diff --git a/contrib/pg_plan_advice/t/001_regress.pl b/contrib/pg_plan_advice/t/001_regress.pl new file mode 100644 index 0000000000..67595cddf7 --- /dev/null +++ b/contrib/pg_plan_advice/t/001_regress.pl @@ -0,0 +1,148 @@ +# Copyright (c) 2021-2025, PostgreSQL Global Development Group + +# Run the core regression tests under pg_plan_advice to check for problems. +use strict; +use warnings FATAL => 'all'; + +use Cwd qw(abs_path); +use File::Basename qw(dirname); + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize the primary node +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(); + +# Set up our desired configuration. +# +# We run with pg_plan_advice.shared_collection_limit set to ensure that the +# plan tree walker code runs against every query in the regression tests. If +# we're unable to properly analyze any of those plan trees, this test should fail. +# +# We set pg_plan_advice.advice to an advice string that will cause the advice +# trove to be populated with a few entries of various sorts, but which we do +# not expect to match anything in the regression test queries. This way, the +# planner hooks will be called, improving code coverage, but no plans should +# actually change. +# +# pg_plan_advice.always_explain_supplied_advice=false is needed to avoid breaking +# regression test queries that use EXPLAIN. In the real world, it seems like +# users will want EXPLAIN output to show supplied advice so that it's clear +# whether normal planner behavior has been altered, but here that's undesirable. +$node->append_conf('postgresql.conf', <start; + +my $srcdir = abs_path("../.."); + +# --dlpath is needed to be able to find the location of regress.so +# and any libraries the regression tests require. +my $dlpath = dirname($ENV{REGRESS_SHLIB}); + +# --outputdir points to the path where to place the output files. +my $outputdir = $PostgreSQL::Test::Utils::tmp_check; + +# --inputdir points to the path of the input files. 
+my $inputdir = "$srcdir/src/test/regress"; + +# Run the tests. +my $rc = + system($ENV{PG_REGRESS} . " " + . "--bindir= " + . "--dlpath=\"$dlpath\" " + . "--host=" . $node->host . " " + . "--port=" . $node->port . " " + . "--schedule=$srcdir/src/test/regress/parallel_schedule " + . "--max-concurrent-tests=20 " + . "--inputdir=\"$inputdir\" " + . "--outputdir=\"$outputdir\""); + +# Dump out the regression diffs file, if there is one +if ($rc != 0) +{ + my $diffs = "$outputdir/regression.diffs"; + if (-e $diffs) + { + print "=== dumping $diffs ===\n"; + print slurp_file($diffs); + print "=== EOF ===\n"; + } +} + +# Report results +is($rc, 0, 'regression tests pass'); + +# Create the extension so we can access the collector +$node->safe_psql('postgres', 'CREATE EXTENSION pg_plan_advice'); + +# Verify that a large amount of advice was collected +my $all_query_count = $node->safe_psql('postgres', <', 20000, "copious advice collected"); + +# Verify that lots of different advice strings were collected +my $distinct_query_count = $node->safe_psql('postgres', <', 3000, "diverse advice collected"); + +# We want to test for the presence of our known tags in the collected advice. +# Put all tags into the hash that follows; map any tags that aren't tested +# by the core regression tests to 0, and others to 1. +my %tag_map = ( + BITMAP_HEAP_SCAN => 1, + FOREIGN_JOIN => 0, + GATHER => 1, + GATHER_MERGE => 1, + HASH_JOIN => 1, + INDEX_ONLY_SCAN => 1, + INDEX_SCAN => 1, + JOIN_ORDER => 1, + MERGE_JOIN_MATERIALIZE => 1, + MERGE_JOIN_PLAIN => 1, + NESTED_LOOP_MATERIALIZE => 1, + NESTED_LOOP_MEMOIZE => 1, + NESTED_LOOP_PLAIN => 1, + NO_GATHER => 1, + PARTITIONWISE => 1, + SEMIJOIN_NON_UNIQUE => 1, + SEMIJOIN_UNIQUE => 1, + SEQ_SCAN => 1, + TID_SCAN => 1, +); +for my $tag (sort keys %tag_map) +{ + my $checkit = $tag_map{$tag}; + + # Search for the given tag. This is not entirely robust: it could get thrown + # off by a table alias such as "FOREIGN_JOIN(", but that probably won't + # happen in the core regression tests. + my $tag_count = $node->safe_psql('postgres', <', 10, "multiple uses of $tag") if $checkit; + + # Regardless, note the exact count in the log, for human consumption. + note("found $tag_count advice strings containing $tag"); +} + +# Trigger a partial cleanup of the shared advice collector, and then a full +# cleanup. +$node->safe_psql('postgres', <extension_name; &pgfreespacemap; &pglogicalinspect; &pgoverexplain; + &pgplanadvice; &pgprewarm; &pgrowlocks; &pgstatstatements; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index ac66fcbdb5..d90b4338d2 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -149,6 +149,7 @@ + diff --git a/doc/src/sgml/pgplanadvice.sgml b/doc/src/sgml/pgplanadvice.sgml new file mode 100644 index 0000000000..efc4287df9 --- /dev/null +++ b/doc/src/sgml/pgplanadvice.sgml @@ -0,0 +1,1036 @@ + + + + pg_plan_advice — help the planner get the right plan + + + pg_plan_advice + + + + The pg_plan_advice module allows key planner decisions to be + described, reproduced, and altered using a special-purpose "plan advice" + mini-language. It is intended to allow stabilization of plan choices that + the user believes to be good, as well as experimentation with plans that + the planner believes to be non-optimal. + + + + Note that, since the planner often makes good decisions, overriding its + judgement can easily backfire.
For example, if the distribution of the + underlying data changes, the planner normally has the option to adjust the + plan in an attempt to preserve good performance. If the plan advice prevents + this, a very poor plan may be chosen. It is important to use plan advice + only when the risks of constraining the planner's choices are outweighed by + the benefits. + + + + Getting Started + + + In order to use this module, pg_plan_advice + must be loaded. You can do this on a system-wide basis by adding + pg_plan_advice to + shared_preload_libraries and restarting the + server, or by adding it to + session_preload_libraries and starting a new session, + or by loading it into an individual session using the + LOAD command. If you + wish to use the + collector interface, + you must also install the pg_plan_advice extension + in the database where you wish to use the collector. Use the command + CREATE EXTENSION pg_plan_advice to do this. If you do + not wish to use the collector interface, this step is not required. + + + + Once the pg_plan_advice module is loaded, + EXPLAIN will support + a PLAN_ADVICE option. You can use this option to see + a plan advice string for the chosen plan. For example: + + + +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + NO_GATHER(f d) + + + + In this example, the user has not specified any advice; instead, the + planner has been permitted to make whatever decisions it thinks best, and + those decisions are memorialized in the form of an advice string. + JOIN_ORDER(f d) means that f should + be the driving table, and the first table to which it should be joined is + d. HASH_JOIN(d) means that + d should appear on the inner side of a hash join. + SEQ_SCAN(f d) means that both f + and d should be accessed via a sequential scan. + NO_GATHER(f d) means that neither f + nor d should appear beneath a Gather + or Gather Merge node. For more details on the plan + advice mini-language, see the information on + advice targets and + advice tags, below. + + + + If you want to see the advice strings for a large number of queries, or + an entire workload, running EXPLAIN (PLAN_ADVICE) for + each one may not be practical. In such situations, it can be more + convenient to use an + advice collector. + + + + Once you have an advice string for a query, you can use it to control how + that query is planned. You can do this by setting + pg_plan_advice.advice to the advice string you've + chosen. This can be an advice string that was generated by the system, + or one you've written yourself. One good way of creating your own advice + string is to take the string generated by the system and pick out just + those elements that you wish to enforce.
In the example above, + pg_plan_advice emits advice for the join order, the + join method, the scan method, and the use of parallelism, but you might + only want to control the join order: + + + +SET pg_plan_advice.advice = 'JOIN_ORDER(f d)'; +EXPLAIN (COSTS OFF) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Supplied Plan Advice: + JOIN_ORDER(f d) /* matched */ + + + + Since the PLAN_ADVICE option to + EXPLAIN was not specified, no advice string is generated + for the plan. However, the supplied plan advice is still shown so that + anyone looking at the EXPLAIN output knows that the + chosen plan was influenced by plan advice. If information about supplied + plan advice is not desired, it can be suppressed by configuring + pg_plan_advice.always_explain_supplied_advice = false. + For each piece of supplied advice, the output shows + advice feedback indicating + whether or not the advice was successfully applied to the query. In this + case, the feedback says /* matched */, which means that + f and d were found in the query and + that the resulting query plan conforms to the specified advice. + + + + + + How It Works + + + Plan advice is written imperatively; that is, it specifies what should be + done. However, at an implementation level, + pg_plan_advice works by telling the core planner what + should not be done. In other words, it operates by constraining the + planner's choices, not by replacing it. Therefore, no matter what advice + you provide, you will only ever get a plan that the core planner would have + considered for the query in question. If you attempt to force what you + believe to be the correct plan by supplying an advice string, and the + planner still fails to produce the desired plan, this means that either + there is a bug in your advice string, or the plan in question was not + considered viable by the core planner. This commonly happens for one of two + reasons. First, it might be that the planner believes that the plan you're trying + to force would be semantically incorrect - that is, it would produce the + wrong results - and for that reason it wasn't considered. Second, it might + be that the planner rejected the plan you were hoping to generate on some + grounds other than cost. For example, given a very simple query such as + SELECT * FROM some_table, the query planner will + decide that the use of an index is worthless here before it performs any + costing calculations. You cannot force it to use an index for this query + even if you set enable_seqscan = false, and you can't + force it to use an index using plan advice, either. + + + + Specifying plan advice should never cause planner failure. However, if you + specify plan advice that asks for something impossible, you may get a plan + where some plan nodes are flagged as Disabled: true in + the EXPLAIN output. In some cases, such plans will be + basically the same plan you would have gotten with no supplied advice at + all, but in other cases, they may be much worse.
For example: + + +SET pg_plan_advice.advice = 'JOIN_ORDER(x f d)'; +EXPLAIN (COSTS OFF) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + Disabled: true + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + JOIN_ORDER(x f d) /* partially matched */ + + + + Because neither f nor d is the + first table in the JOIN_ORDER() specification, the + planner disables all direct joins between the two of them, thinking that + the join to x should happen first. Since planning isn't + allowed to fail, a disabled plan between the two relations is eventually + selected anyway, but here it's a Nested Loop rather than + the Hash Join that was chosen in the above example where + no advice was specified. There are several different ways that this can + happen; when it does, the resulting plan is generally worse + than if no advice had been specified at all. Therefore, it is a good idea + to verify that the advice you specify actually matches the query to which it + is applied and that the results are as expected. + + + + + + Advice Targets + + + An advice target uniquely identifies a particular + instance of a particular relation involved in a particular query. In simple + cases, such as the examples shown above, the advice target is simply the + relation alias. However, a more complex syntax is required when subqueries + are used, when tables are partitioned, or when the same relation alias is + mentioned more than once in the same subquery (e.g., (foo JOIN bar + ON foo.a = bar.a) x JOIN foo ON x.b = foo.b). Any combination of + these three things can occur simultaneously: a relation could be mentioned + more than once, be partitioned, and be used inside a subquery. + + + + Because of this, the general syntax for a relation identifier is: + + + +alias_name#occurrence_number/partition_schema.partition_name@plan_name + + + + All components except for the alias_name are optional + and are included only when required. When a component is omitted, the + preceding punctuation must also be omitted. For the first occurrence of a + relation within a given subquery, generated advice will omit the occurrence + number, but it is legal to write #1, if desired. The + partition schema and partition name are included only for children of + partitioned tables. In generated advice, pg_plan_advice + always includes both, but it is legal to omit the schema. The plan name is + omitted for the top-level plan, and must be included for any subplan. + + + + It is not always easy to determine the correct advice target by examining + the query. For instance, if the planner pulls up a subquery into the parent + query level, everything inside of it becomes part of the parent query level, + and uses the parent query's subplan name (or no subplan name, if pulled up + to the top level). Furthermore, the correct subquery name is sometimes not + obvious. For example, when two queries are joined using an operation such as + UNION or INTERSECT, no name for the + subqueries is present in the SQL syntax; instead, a system-generated name is + assigned to each branch. The easiest way to discover the proper advice + targets is to use EXPLAIN (PLAN_ADVICE) and examine the + generated advice.
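+
+   For instance, all of the following are syntactically valid advice
+   targets (the table, partition, and subquery names here are purely
+   illustrative):
+
+
+f
+f#2
+p/public.p_child1
+s@x
+f#2/public.p_child1@any_1
+
+
+   These name, respectively, the first occurrence of the alias
+   f at the top plan level, a second occurrence of the
+   same alias, a partition scanned via the alias p,
+   the alias s within the subquery x,
+   and, in the final example, every component combined.
+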
+ + + + + + Advice Tags + + + An advice tag specifies a particular behavior that + should be enforced for some portion of the query, such as a particular + join order or join method. All advice tags take + advice targets as arguments, + and many allow lists of advice targets, which in some cases can be nested + multiple levels deep. Several different classes of advice tags exist, + each controlling a different aspect of query planning. + + + + Scan Method Advice + +SEQ_SCAN(target [ ... ]) +TID_SCAN(target [ ... ]) +INDEX_SCAN(target index_name [ ... ]) +INDEX_ONLY_SCAN(target index_name [ ... ]) +FOREIGN_SCAN((target [ ... ]) [ ... ]) +BITMAP_HEAP_SCAN(target [ ... ]) + + + SEQ_SCAN specifies that each target should be + scanned using a Seq Scan. TID_SCAN + specifies that each target should be scanned using a + TID Scan or TID Range Scan. + BITMAP_HEAP_SCAN specifies that each target + should be scanned using a Bitmap Heap Scan. + + + + INDEX_SCAN specifies that each target should + be scanned using an Index Scan on the given index + name. INDEX_ONLY_SCAN is similar, but specifies the + use of an Index Only Scan. In either case, the index + name can be, but does not have to be, schema-qualified. + + + + FOREIGN_SCAN specifies that a join between two or + more foreign tables should be pushed down to a remote server so + that it can be implemented as a single Foreign Scan. + Specifying FOREIGN_SCAN for a single foreign table is + neither necessary nor permissible: a Foreign Scan will + need to be used regardless. If you want to prevent a join from being + pushed down, consider using the JOIN_ORDER tag for + that purpose. + + + + The planner supports many types of scans other than those listed here; + however, in most of those cases, there is no meaningful decision to be + made, and hence no need for advice. For example, the output of a + set-returning function that appears in the FROM clause + can only ever be scanned using a Function Scan, so + there is no opportunity for advice to change anything. + + + + + + Join Order Advice + +JOIN_ORDER(join_order_item [ ... ]) + +where join_order_item is: + +advice_target | +( join_order_item [ ... ] ) | +{ join_order_item [ ... ] } + + + When JOIN_ORDER is used without any sublists, it + specifies an outer-deep join with the first advice target as the driving + table, joined to each subsequent advice target in turn in the order + specified. For instance, JOIN_ORDER(a b c) means that + a should be the driving table, and that it should be + joined first to b and then to c. + If there are more relations in the query than a, + b, and c, the rest can be joined + afterwards in any manner. + + + + If a JOIN_ORDER list contains a parenthesized sublist, + it specifies a non-outer-deep join. The relations in the sublist must first + be joined to each other much as if the sublist were a top-level + JOIN_ORDER list, and the resulting join product must + then appear on the inner side of a join at the appropriate point in the + join order. For example, JOIN_ORDER(a (b c) d) requires + a plan of this form: + + + +Join + -> Join + -> Scan on a + -> Join + -> Scan on b + -> Scan on c + -> Scan on d + + + + If a JOIN_ORDER list contains a sublist surrounded by + curly braces, this also specifies a non-outer-deep join. However, the join + order within the sublist is not constrained. For example, specifying + JOIN_ORDER(a {b c} d) would allow the scans of + b and c to be swapped in the + previous example, which is not allowed when parentheses are used.
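+
+   For instance, with JOIN_ORDER(a {b c} d), the
+   following plan shape, in which the scans of b and
+   c are swapped relative to the previous example,
+   would also conform; with parentheses, it would not. (This is an
+   illustrative sketch, not generated EXPLAIN output.)
+
+
+Join
+  -> Join
+       -> Scan on a
+       -> Join
+            -> Scan on c
+            -> Scan on b
+  -> Scan on d
+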
+ + + + Parenthesized sublists can be arbitrarily nested, but sublists surrounded + by curly braces cannot themselves contain sublists. + + + + Multiple instances of JOIN_ORDER() can sometimes be + needed in order to fully constrain the join order. This occurs when there + are multiple join problems that are optimized separately by the planner. + This can happen due to the presence of subqueries, or because there is a + partitionwise join. In the latter case, each branch of the partitionwise + join can have its own join order, independent of every other branch. + + + + + + Join Method Advice + +join_method_name(join_method_item [ ... ]) + +where join_method_name is: + +{ MERGE_JOIN_MATERIALIZE | MERGE_JOIN_PLAIN | NESTED_LOOP_MATERIALIZE | NESTED_LOOP_PLAIN | HASH_JOIN } + +and join_method_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + Join method advice specifies the relation, or set of relations, that should + appear on the inner side of a join using the named join method. For + example, HASH_JOIN(a b) means that each of + a and b should appear on the inner + side of a hash join; a conforming plan must contain at least two hash + joins, one of which has a and nothing else on the + inner side, and the other of which has b and nothing + else on the inner side. On the other hand, + HASH_JOIN((a b)) means that the join product of + a and b should appear together + on the inner side of a single hash join. + + + + Note that join method advice implies a negative join order constraint. + Since the named relation or relations must be on the inner side of a join + using the specified method, none of them can be the driving table for the + entire join problem. Moreover, no relation inside the set should be joined + to any relation outside the set until all relations within the set have + been joined to each other. For example, if the advice specifies + HASH_JOIN((a b)) and the system begins by joining either + of those to some third relation c, the resulting + plan could never be compliant with the request to put exactly those two + relations on the inner side of a hash join. When using both join order + advice and join method advice for the same query, it is a good idea to make + sure that they do not mandate incompatible join orders. + + + + + + Partitionwise Advice + +PARTITIONWISE(partitionwise_item [ ... ]) + +where partitionwise_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + When applied to a single target, PARTITIONWISE + specifies that the named table should not be part of any partitionwise + join. When applied to a list of targets, PARTITIONWISE + specifies that exactly that set of relations should be joined in + partitionwise fashion. Note that, regardless of what advice is specified, + no partitionwise joins will be possible if + enable_partitionwise_join = off. + + + + + + Semijoin Uniqueness Advice + +SEMIJOIN_UNIQUE(sj_unique_item [ ... ]) +SEMIJOIN_NON_UNIQUE(sj_unique_item [ ... ]) + +where sj_unique_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + The planner sometimes has a choice between implementing a semijoin + directly and implementing a semijoin by making the nullable side unique + and then performing an inner join. SEMIJOIN_UNIQUE + specifies the latter strategy, while SEMIJOIN_NON_UNIQUE + specifies the former strategy. In either case, the argument is the single + relation or list of relations that appear beneath the nullable side of the + join. + + + + + + Parallel Query Advice + +GATHER(gather_item [ ...
]) +GATHER_MERGE(gather_item [ ... ]) +NO_GATHER(advice_target [ ... ]) + +where gather_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + GATHER or GATHER_MERGE specifies + that Gather or Gather Merge, + respectively, should be placed on top of the single relation specified as + a target, or on top of the join between the list of relations specified as + a target. This means that GATHER(a b c) is a request + for three different Gather nodes, while + GATHER((a b c)) is a request for a single + Gather node on top of a 3-way join. + + + + NO_GATHER specifies that no Gather or + Gather Merge node should appear above any of the + targets, but it only constrains the planning of an individual subquery, + and outer subquery levels can still use parallel query. For example, + NO_GATHER(inner_example@any_1) precludes using a + Parallel Seq Scan to access the + inner_example table within the any_1 + subquery, but it does not prevent the planner from placing + SubPlan any_1 beneath a Gather + or Gather Merge node. The following plan is + compatible with NO_GATHER(inner_example@any_1), but + not with NO_GATHER(outer_example): + + + + Finalize Aggregate + -> Gather + -> Partial Aggregate + -> Parallel Seq Scan on outer_example + Filter: (something = (hashed SubPlan any_1).col1) + SubPlan any_1 + -> Seq Scan on inner_example + Filter: (something_else > 100) + + + + Here is the reverse case, that is, a plan compatible with + NO_GATHER(outer_example) but not with + NO_GATHER(inner_example@any_1): + + + + Aggregate + -> Seq Scan on outer_example + Filter: (something = (hashed SubPlan any_1).col1) + SubPlan any_1 + -> Gather + -> Parallel Seq Scan on inner_example + Filter: (something_else > 100) + + + + + + + Advice Feedback + + + EXPLAIN provides feedback on whether supplied advice was + successfully applied to the query in the form of a comment on each piece + of supplied advice. For example: + + + +SET pg_plan_advice.advice = 'hash_join(f g) join_order(f g) index_scan(f no_such_index)'; +SET +rhaas=# EXPLAIN (COSTS OFF) SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------------- + Hash Join + Hash Cond: ((d1.id = f.dim1_id) AND (d2.id = f.dim2_id)) + -> Nested Loop + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Materialize + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on jo_fact f + Supplied Plan Advice: + INDEX_SCAN(f no_such_index) /* matched, inapplicable, failed */ + HASH_JOIN(f) /* matched */ + HASH_JOIN(g) /* not matched */ + JOIN_ORDER(f g) /* partially matched */ + + + + For this query, f is a valid advice target, but + g is not. Therefore, the request to place + f on the inner side of a hash join is listed as + matched, but the request to place g + on the inner side of a hash join is listed as + not matched. The JOIN_ORDER advice + tag involves one valid target and one invalid target, and so is listed as + partially matched. Note that + HASH_JOIN(f g) is actually a request for two logically + separate behaviors, whereas JOIN_ORDER(f g) is a single + request. When providing advice feedback, EXPLAIN shows + each logical request separately, together with all the feedback applicable + to that request type.
+ + + + Advice feedback can include any of the following: + + + + + + + matched means that all of the specified advice targets + were observed together during query planning, at a time at which the + advice could be enforced. + + + + + + partially matched means that some but not all of the + specified advice targets were observed during query planning, or all + of the advice targets were observed but not together. For example, this + may happen if all the targets of JOIN_ORDER advice + individually match the query, but the proposed join order is not legal. + + + + + + not matched means that none of the + specified advice targets were observed during query planning. This may + happen if the advice simply doesn't match the query, or it may + occur if the relevant portion of the query was not planned, perhaps + because it was gated by a condition that was simplified to constant false. + + + + + + inapplicable means that the advice tag could not + be applied to the advice targets for some reason. For example, this will + happen if the use of a nonexistent index is requested, or if an attempt + is made to control semijoin uniqueness for a non-semijoin. + + + + + + conflicting means that two or more pieces of advice + request incompatible behaviors. For example, if you advise a sequential + scan and an index scan for the same table, both requests will be flagged + as conflicting. This also commonly happens if join method advice or + semijoin uniqueness advice implies a join order incompatible with the + one explicitly specified; see + the discussion of join method advice, above. + + + + + + failed means that the query plan does not comply with + the advice. This only occurs for entries that are also shown as + matched. It frequently occurs for entries that are + also marked as conflicting or + inapplicable. However, it can also occur when the + advice is valid insofar as pg_plan_advice is able + to determine, but the planner is not able to construct a legal + plan that can comply with the advice. It is important to note that the + sanity checks performed by pg_plan_advice are fairly + superficial and focused mostly on looking for logical inconsistencies in + the advice string; only the planner knows what will actually work. + + + + + + + All advice should be marked as exactly one of matched, + partially matched, or not matched. + + + + + + Advice Collectors + + + pg_plan_advice can be configured to automatically + generate advice every time a query is planned and store the query and + the generated advice string either in local or shared memory. + + + + To enable a collector, you must first set a collection limit. When the + number of queries for which advice has been stored exceeds the collection + limit, the oldest queries and the corresponding advice will be discarded. + Then, you must adjust a separate setting to actually enable advice + collection. For the local collector, set the collection limit by configuring + pg_plan_advice.local_collection_limit to a value + greater than zero, and then enable advice collection by setting + pg_plan_advice.local_collector = true. For the shared + collector, the procedure is the same, except that the names of the settings + are pg_plan_advice.shared_collection_limit and + pg_plan_advice.shared_collector. Note that the local + collector stores query texts and advice strings in backend-local memory, + and the shared collector does the same in dynamic shared memory, so + configuring large limits may result in considerable memory consumption.
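+
+   For example, a minimal session using the local collector might look
+   like this (the limit of 100 is an arbitrary illustrative value):
+
+
+-- retain up to 100 queries and their advice strings in this backend
+SET pg_plan_advice.local_collection_limit = 100;
+SET pg_plan_advice.local_collector = true;
+-- ... run the queries of interest, then inspect what was collected:
+SELECT query, advice FROM pg_get_collected_local_advice();
+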
+ + + + Once the collector is enabled, you can run any queries for which you wish + to see the generated plan advice. Then, you can examine what has been + collected using whichever of + SELECT * FROM pg_get_collected_local_advice() or + SELECT * FROM pg_get_collected_shared_advice() + corresponds to the collector you enabled. To discard the collected advice + and release memory, you can call + pg_clear_collected_local_advice() + or pg_clear_collected_shared_advice(). + + + + In addition to the query texts and advice strings, the advice collectors + will also store the OID of the role that caused the query to be planned, + the OID of the database in which the query was planned, the query ID, + and the time at which the collection occurred. This module does not + automatically enable query ID computation; therefore, if you want the + query ID value to be populated in collected advice, be sure to configure + compute_query_id = on. Otherwise, the query ID may + always show as 0. + + + + + + Functions + + + Note that these functions will only be available if the + pg_plan_advice extension has been installed in the + current database, which is not mandatory, since much of the functionality + of this module can be used without installing the extension. + + + + + + + pg_clear_collected_local_advice() returns void + + pg_clear_collected_local_advice + + + + + + Removes all collected query texts and advice strings from backend-local + memory. + + + + + + + pg_get_collected_local_advice() returns setof (id bigint, + userid oid, dbid oid, queryid bigint, collection_time timestamptz, + query text, advice text) + + pg_get_collected_local_advice + + + + + + Returns all query texts and advice strings stored in the local + advice collector. + + + + + + + pg_clear_collected_shared_advice() returns void + + pg_clear_collected_shared_advice + + + + + + Removes all collected query texts and advice strings from shared + memory. + + + + + + + pg_get_collected_shared_advice() returns setof (id bigint, + userid oid, dbid oid, queryid bigint, collection_time timestamptz, + query text, advice text) + + pg_get_collected_shared_advice + + + + + + Returns all query texts and advice strings stored in the shared + advice collector. + + + + + + + + + + Configuration Parameters + + + + + + pg_plan_advice.advice (string) + + pg_plan_advice.advice configuration parameter + + + + + + pg_plan_advice.advice is an advice string to be + used during query planning. + + + + + + + pg_plan_advice.always_explain_supplied_advice (boolean) + + pg_plan_advice.always_explain_supplied_advice configuration parameter + + + + + + pg_plan_advice.always_explain_supplied_advice causes + EXPLAIN to always show any supplied advice and the + associated + advice feedback. + The default value is true. If set to + false, this information will be displayed only when + EXPLAIN (PLAN_ADVICE) is used. + + + + + + + pg_plan_advice.always_store_advice_details (boolean) + + pg_plan_advice.always_store_advice_details configuration parameter + + + + + + pg_plan_advice.always_store_advice_details allows + EXPLAIN to show details related to plan advice even + when prepared queries are used. The default value is + false. When planning a prepared query, it is not + possible to know whether EXPLAIN will later be used, + so by default, to reduce overhead, pg_plan_advice + will not generate plan advice or feedback on supplied advice. This means + that if EXPLAIN EXECUTE is used on the prepared query, + it will not be able to show this information.
Changing this setting to + true avoids this problem, but adds additional + overhead. It is probably a good idea to enable this option only in + sessions where it is needed, rather than on a system-wide basis. + + + + + + + pg_plan_advice.feedback_warnings (boolean) + + pg_plan_advice.feedback_warnings configuration parameter + + + + + + When set to true, pg_plan_advice.feedback_warnings + emits a warning whenever supplied plan advice is not successfully + enforced. The default value is false. + + + + + + + pg_plan_advice.local_collector (boolean) + + pg_plan_advice.local_collector configuration parameter + + + + + + pg_plan_advice.local_collector enables the + local advice collector. + The default value is false. + + + + + + + pg_plan_advice.local_collection_limit (integer) + + pg_plan_advice.local_collection_limit configuration parameter + + + + + + pg_plan_advice.local_collection_limit sets the + maximum number of query texts and advice strings retained by the + local advice collector. + The default value is 0. + + + + + + + pg_plan_advice.shared_collector (boolean) + + pg_plan_advice.shared_collector configuration parameter + + + + + + pg_plan_advice.shared_collector enables the + shared advice collector. + The default value is false. Only superusers and users + with the appropriate SET privilege can change this + setting. + + + + + + + pg_plan_advice.shared_collection_limit (integer) + + pg_plan_advice.shared_collection_limit configuration parameter + + + + + + pg_plan_advice.shared_collection_limit sets the + maximum number of query texts and advice strings retained by the + shared advice collector. + The default value is 0. Only superusers and users + with the appropriate SET privilege can change this + setting. + + + + + + + pg_plan_advice.trace_mask (boolean) + + pg_plan_advice.trace_mask configuration parameter + + + + + + When pg_plan_advice.trace_mask is + true, pg_plan_advice will print + messages during query planning each time that + pg_plan_advice alters the mask of allowable query + plan types in response to supplied plan advice. The default value is + false. The messages printed by this setting are not + expected to be useful except for debugging this module. + + + + + + + + + + Limitations + + + It is currently not possible to control any aspect of the planner's behavior + with respect to aggregation. This includes both whether aggregates are + computed by sorting or hashing, and also whether strategies such as + eager aggregation or + partitionwise + aggregation are used. + + + + It is also currently not possible to control any aspect of the planner's + behavior with respect to set operations such as UNION + or INTERSECT. + + + + As discussed above under How + It Works, the use of plan advice can only affect which plan + the planner chooses from among those it believes to be viable. It can never + force the choice of a plan which the planner refused to consider in the + first place.
+ + + + + Author + + + Robert Haas rhaas@postgresql.org + + + + diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 241945734e..bee95caaca 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3981,6 +3981,45 @@ pg_uuid_t pg_wchar pg_wchar_tbl pgp_armor_headers_state +pgpa_collected_advice +pgpa_advice_item +pgpa_advice_tag_type +pgpa_advice_target +pgpa_identifier +pgpa_index_target +pgpa_index_type +pgpa_itm_type +pgpa_jo_outcome +pgpa_join_class +pgpa_join_member +pgpa_join_state +pgpa_join_strategy +pgpa_join_unroller +pgpa_local_advice +pgpa_local_advice_chunk +pgpa_output_context +pgpa_plan_walker_context +pgpa_planner_state +pgpa_qf_type +pgpa_query_feature +pgpa_ri_checker +pgpa_ri_checker_key +pgpa_scan +pgpa_scan_strategy +pgpa_shared_advice +pgpa_shared_advice_chunk +pgpa_shared_state +pgpa_sj_unique_rel +pgpa_target_type +pgpa_trove +pgpa_trove_entry +pgpa_trove_entry_element +pgpa_trove_entry_hash +pgpa_trove_entry_key +pgpa_trove_lookup_type +pgpa_trove_result +pgpa_trove_slice +pgpa_unrolled_join pgsocket pgsql_thing_t pgssEntry From fd927b2191c199920f13e8e35afee044ab716ee5 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 19 Feb 2026 11:51:11 -0500 Subject: [PATCH 141/147] Use rt_fetch, not planner_rt_fetch. --- contrib/pg_plan_advice/pgpa_identifier.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/contrib/pg_plan_advice/pgpa_identifier.c b/contrib/pg_plan_advice/pgpa_identifier.c index 51b4b0c60a..6f8f53e7cf 100644 --- a/contrib/pg_plan_advice/pgpa_identifier.c +++ b/contrib/pg_plan_advice/pgpa_identifier.c @@ -42,6 +42,11 @@ * particular, occurrence_number must be calculated relative to the range * table for the relevant subquery, not the final flattened range table. * + * NB: All of this code must use rt_fetch(), not planner_rt_fetch()! + * Join removal and self-join elimination remove rels from the arrays + * that planner_rt_fetch() uses; using rt_fetch() is necessary to get + * stable results. + * * Copyright (c) 2016-2024, PostgreSQL Global Development Group * * contrib/pg_plan_advice/pgpa_identifier.c @@ -133,7 +138,7 @@ pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, (appinfo = root->append_rel_array[top_rti]) == NULL) break; - parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + parent_rte = rt_fetch(appinfo->parent_relid, root->parse->rtable); if (parent_rte->rtekind != RTE_RELATION) break; @@ -141,8 +146,8 @@ pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, } /* Get the range table entries for the RTI and top RTI. */ - rte = planner_rt_fetch(rti, root); - top_rte = planner_rt_fetch(top_rti, root); + rte = rt_fetch(rti, root->parse->rtable); + top_rte = rt_fetch(top_rti, root->parse->rtable); Assert(rte->rtekind != RTE_JOIN); Assert(top_rte->rtekind != RTE_JOIN); @@ -166,13 +171,13 @@ pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, { RangeTblEntry *parent_rte; - parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + parent_rte = rt_fetch(appinfo->parent_relid, root->parse->rtable); if (parent_rte->rtekind == RTE_RELATION) continue; } /* Skip NULL entries and joins. 
*/ - prior_rte = planner_rt_fetch(prior_rti, root); + prior_rte = rt_fetch(prior_rti, root->parse->rtable); if (prior_rte == NULL || prior_rte->rtekind == RTE_JOIN) continue; @@ -220,7 +225,7 @@ pgpa_compute_identifiers_by_relids(PlannerInfo *root, Bitmapset *relids, while ((rti = bms_next_member(relids, rti)) >= 0) { - RangeTblEntry *rte = planner_rt_fetch(rti, root); + RangeTblEntry *rte = rt_fetch(rti, root->parse->rtable); if (rte->rtekind == RTE_JOIN) continue; From 7942badccd9c8957ce2da76c163cc7426273da5d Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 19 Feb 2026 12:18:04 -0500 Subject: [PATCH 142/147] clean up an XXX comment --- contrib/pg_plan_advice/pgpa_join.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c index ec8e1a666e..e04c0160a1 100644 --- a/contrib/pg_plan_advice/pgpa_join.c +++ b/contrib/pg_plan_advice/pgpa_join.c @@ -430,6 +430,13 @@ pgpa_decompose_join(pgpa_plan_walker_context *walker, Plan *plan, uniqueouter = pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter); uniqueinner = pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner); + /* + * Can we see a Result node here, to project above a Gather? So far I've + * found no example that behaves that way; rather, the Gather or Gather + * Merge is made to project. Hence, don't test is_result_node_with_child() + * at this point. + */ + /* * The planner may have decided to parallelize part of the join tree, so * we could find a Gather or Gather Merge node here. Note that, if @@ -459,10 +466,6 @@ * target list or to implement a one-time filter. If so, we can descend * throught it. Note that a result node without a child would be a * degenerate scan or join, and not something we could descend through. - * - * XXX. I suspect it's possible for this to happen above the Gather or - * Gather Merge node, too, but apparently we have no test case for that - * scenario. */ if (elidedouter == NULL && is_result_node_with_child(outerplan)) elidedouter = pgpa_descend_node(pstmt, &outerplan); From 149d8959cf95d07d3da8b5d948e259416fff2fae Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 19 Feb 2026 13:32:07 -0500 Subject: [PATCH 143/147] fix another XXX comment and reindent --- contrib/pg_plan_advice/pgpa_join.c | 9 +++++++-- contrib/pg_plan_advice/pgpa_planner.c | 16 ++++++++-------- contrib/pg_plan_advice/pgpa_walker.c | 6 +++--- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c index e04c0160a1..b6c588dfe2 100644 --- a/contrib/pg_plan_advice/pgpa_join.c +++ b/contrib/pg_plan_advice/pgpa_join.c @@ -595,8 +595,13 @@ pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, * aggregation or partitionwise aggregation operation that began at a * higher level of the plan tree. * - * XXX. I suspect this logic does not cover all cases: couldn't SJ - * uniqueness be implemented in two steps with an intermediate Gather? + * (Note that when we're using an Agg node for uniqueness, there's no + * need for any case other than AGGSPLIT_SIMPLE, because there's no + * aggregated column being + * computed. However, the fact that + * AGGSPLIT_SIMPLE is in use doesn't prove that this Agg is here for + * the semijoin uniqueness. Maybe we should adjust an Agg node to + * carry a "purpose" field so that code like this can be more certain + * of its analysis.)
*/ descend = true; sjunique = (((Agg *) *plan)->aggsplit == AGGSPLIT_SIMPLE); diff --git a/contrib/pg_plan_advice/pgpa_planner.c b/contrib/pg_plan_advice/pgpa_planner.c index 3fc9127a99..c77d68dc14 100644 --- a/contrib/pg_plan_advice/pgpa_planner.c +++ b/contrib/pg_plan_advice/pgpa_planner.c @@ -1666,14 +1666,14 @@ pgpa_planner_apply_scan_advice(RelOptInfo *rel, if (my_entry->tag == PGPA_TAG_BITMAP_HEAP_SCAN) { /* - * Currently, PGS_CONSIDER_INDEXONLY can suppress Bitmap Heap Scans, - * so don't clear it when such a scan is requested. This happens - * because build_index_scan() thinks that the possibility of an - * index-only scan is a sufficient reason to consider using an - * otherwise-useless index, and get_index_paths() thinks that the - * same paths that are useful for index or index-only scans should - * also be considered for bitmap scans. Perhaps that logic should - * be tightened up, but until then we need to include + * Currently, PGS_CONSIDER_INDEXONLY can suppress Bitmap Heap + * Scans, so don't clear it when such a scan is requested. This + * happens because build_index_scan() thinks that the possibility + * of an index-only scan is a sufficient reason to consider using + * an otherwise-useless index, and get_index_paths() thinks that + * the same paths that are useful for index or index-only scans + * should also be considered for bitmap scans. Perhaps that logic + * should be tightened up, but until then we need to include * PGS_CONSIDER_INDEXONLY in my_scan_type here. */ my_scan_type = PGS_BITMAPSCAN | PGS_CONSIDER_INDEXONLY; diff --git a/contrib/pg_plan_advice/pgpa_walker.c b/contrib/pg_plan_advice/pgpa_walker.c index 1e4d9c1cf9..86a6c921f1 100644 --- a/contrib/pg_plan_advice/pgpa_walker.c +++ b/contrib/pg_plan_advice/pgpa_walker.c @@ -164,8 +164,8 @@ pgpa_plan_walker(pgpa_plan_walker_context *walker, PlannedStmt *pstmt, * It's possible for a Gather or Gather Merge query feature to find no * RTIs when partitionwise aggregation is in use. We shouldn't emit * something like GATHER_MERGE(()), so instead emit nothing. This means - * that we won't advise either GATHER or GATHER_MERGE or NO_GATHER in - * such cases, which might be something we want to improve in the future. + * that we won't advise either GATHER or GATHER_MERGE or NO_GATHER in such + * cases, which might be something we want to improve in the future. * * (Should the Partial Aggregates in such a case be created in an * UPPERREL_GROUP_AGG with a non-empty relid set? Right now that doesn't @@ -173,7 +173,7 @@ pgpa_plan_walker(pgpa_plan_walker_context *walker, PlannedStmt *pstmt, */ for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t) { - List *query_features = NIL; + List *query_features = NIL; foreach_ptr(pgpa_query_feature, qf, walker->query_features[t]) { From e161d7c4a37826561e97a073e0415ce3f2ff988a Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Feb 2026 15:23:36 -0500 Subject: [PATCH 144/147] Fix add_partial_path interaction with disabled_nodes Commit e22253467942fdb100087787c3e1e3a8620c54b2 adjusted the logic in add_path() to keep the path list sorted by disabled_nodes and then by total_cost, but failed to make the corresponding adjustment to add_partial_path. As a result, add_partial_path might sort the path list just by total cost, which could lead to later planner misbehavior. 
In principle, this should be back-patched to v18, but we are typically reluctant to back-patch planner fixes for fear of destabilizing working installations, and it is unclear to me that this has sufficiently serious consequences to justify an exception, so for now, no back-patch. --- src/backend/optimizer/util/pathnode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 829295b3af..875764abdb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -878,8 +878,13 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) } else { - /* new belongs after this old path if it has cost >= old's */ - if (new_path->total_cost >= old_path->total_cost) + /* + * new belongs after this old path if it has more disabled nodes + * or if it has the same number of nodes but a greater total cost + */ + if (new_path->disabled_nodes > old_path->disabled_nodes || + (new_path->disabled_nodes == old_path->disabled_nodes && + new_path->total_cost >= old_path->total_cost)) insert_at = foreach_current_index(p1) + 1; } From 5783db06932b56660223d497219fd51a495e5406 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 6 Feb 2026 14:51:23 -0500 Subject: [PATCH 145/147] Consider startup cost as a figure of merit for partial paths. Previously, the comments stated that there was no purpose to considering startup cost for partial paths, but this is not the case: it's perfectly reasonable to want a fast-start path for a plan that involves a LIMIT (perhaps over an aggregate, so that there is enough data being processed to justify parallel query, yet we don't want all the result rows). Accordingly, rewrite add_partial_path and add_partial_path_precheck to consider startup costs. This also fixes an independent bug in add_partial_path_precheck: commit e22253467942fdb100087787c3e1e3a8620c54b2 failed to update it to do anything with the new disabled_nodes field. That bug fix is formally separate from the rest of this patch and could be committed separately, but I think it makes more sense to fix both issues together, because then we can (as this commit does) just make add_partial_path_precheck do the cost comparisons in the same way as compare_path_costs_fuzzily, which hopefully reduces the chances of ending up with something that's still incorrect. This patch is based on earlier work on this topic by Tomas Vondra, but I have rewritten a great deal of it.
Co-authored-by: Robert Haas Co-authored-by: Tomas Vondra --- src/backend/optimizer/path/joinpath.c | 3 + src/backend/optimizer/util/pathnode.c | 159 +++++++++++------- src/include/optimizer/pathnode.h | 2 +- .../regress/expected/incremental_sort.out | 18 +- src/test/regress/expected/join_hash.out | 13 +- src/test/regress/sql/join_hash.sql | 10 +- 6 files changed, 122 insertions(+), 83 deletions(-) diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index e0c00e26dd..044560da7b 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -1048,6 +1048,7 @@ try_partial_nestloop_path(PlannerInfo *root, initial_cost_nestloop(root, &workspace, jointype, nestloop_subtype, outer_path, inner_path, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1237,6 +1238,7 @@ try_partial_mergejoin_path(PlannerInfo *root, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1369,6 +1371,7 @@ try_partial_hashjoin_path(PlannerInfo *root, initial_cost_hashjoin(root, &workspace, jointype, hashclauses, outer_path, inner_path, extra, parallel_hash); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, NIL)) return; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 875764abdb..d61f328707 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -777,10 +777,9 @@ add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * * Because we don't consider parameterized paths here, we also don't * need to consider the row counts as a measure of quality: every path will - * produce the same number of rows. Neither do we need to consider startup - * costs: parallelism is only used for plans that will be run to completion. - * Therefore, this routine is much simpler than add_path: it needs to - * consider only disabled nodes, pathkeys and total cost. + * produce the same number of rows. However, we do need to consider the + * startup costs: this partial path could be used beneath a Limit node, + * so a fast-start plan could be correct. * * As with add_path, we pfree paths that are found to be dominated by * another partial path; this requires that there be no other references to @@ -818,52 +817,36 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) /* Compare pathkeys. */ keyscmp = compare_pathkeys(new_path->pathkeys, old_path->pathkeys); - /* Unless pathkeys are incompatible, keep just one of the two paths. */ + /* + * Unless pathkeys are incompatible, see if one of the paths dominates + * the other (both in startup and total cost). It may happen that one + * path has lower startup cost, the other has lower total cost. + */ if (keyscmp != PATHKEYS_DIFFERENT) { - if (unlikely(new_path->disabled_nodes != old_path->disabled_nodes)) + PathCostComparison costcmp; + + /* + * Do a fuzzy cost comparison with standard fuzziness limit. 
+ */ + costcmp = compare_path_costs_fuzzily(new_path, old_path, + STD_FUZZ_FACTOR); + if (costcmp == COSTS_BETTER1) { - if (new_path->disabled_nodes > old_path->disabled_nodes) - accept_new = false; - else + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; } - else if (new_path->total_cost > old_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_BETTER2) { - /* New path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER1) + if (keyscmp == PATHKEYS_BETTER2) accept_new = false; } - else if (old_path->total_cost > new_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_EQUAL) { - /* Old path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER2) + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER1) - { - /* Costs are about the same, new path has better pathkeys. */ - remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER2) - { - /* Costs are about the same, old path has better pathkeys. */ - accept_new = false; - } - else if (old_path->total_cost > new_path->total_cost * 1.0000000001) - { - /* Pathkeys are the same, and the old path costs more. */ - remove_old = true; - } - else - { - /* - * Pathkeys are the same, and new path isn't materially - * cheaper. - */ - accept_new = false; + else if (keyscmp == PATHKEYS_BETTER2) + accept_new = false; } } @@ -914,16 +897,16 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) * add_partial_path_precheck * Check whether a proposed new partial path could possibly get accepted. * - * Unlike add_path_precheck, we can ignore startup cost and parameterization, - * since they don't matter for partial paths (see add_partial_path). But - * we do want to make sure we don't add a partial path if there's already - * a complete path that dominates it, since in that case the proposed path - * is surely a loser. + * Unlike add_path_precheck, we can ignore parameterization, since it doesn't + * matter for partial paths (see add_partial_path). But we do want to make + * sure we don't add a partial path if there's already a complete path that + * dominates it, since in that case the proposed path is surely a loser. */ bool add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, - Cost total_cost, List *pathkeys) + Cost startup_cost, Cost total_cost, List *pathkeys) { + bool consider_startup = parent_rel->consider_startup; ListCell *p1; /* @@ -933,25 +916,80 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * is clearly superior to some existing partial path -- at least, modulo * final cost computations. If so, we definitely want to consider it. * - * Unlike add_path(), we always compare pathkeys here. This is because we - * expect partial_pathlist to be very short, and getting a definitive + * Unlike add_path(), we never try to exit this loop early. This is because + * we expect partial_pathlist to be very short, and getting a definitive * answer at this stage avoids the need to call add_path_precheck. */ foreach(p1, parent_rel->partial_pathlist) { Path *old_path = (Path *) lfirst(p1); + PathCostComparison costcmp; PathKeysComparison keyscmp; - keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); - if (keyscmp != PATHKEYS_DIFFERENT) + /* + * First, compare costs and disabled nodes. This logic should be + * identical to compare_path_costs_fuzzily, except that one of the + * paths hasn't been created yet, and the fuzz factor is always + * STD_FUZZ_FACTOR. 
+		 */
+		if (unlikely(old_path->disabled_nodes != disabled_nodes))
 		{
-			if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR &&
-				keyscmp != PATHKEYS_BETTER1)
-				return false;
-			if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR &&
-				keyscmp != PATHKEYS_BETTER2)
-				return true;
+			if (disabled_nodes < old_path->disabled_nodes)
+				costcmp = COSTS_BETTER1;
+			else
+				costcmp = COSTS_BETTER2;
+		}
+		else if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR)
+		{
+			if (consider_startup &&
+				old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR)
+				costcmp = COSTS_DIFFERENT;
+			else
+				costcmp = COSTS_BETTER2;
+		}
+		else if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR)
+		{
+			if (consider_startup &&
+				startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR)
+				costcmp = COSTS_DIFFERENT;
+			else
+				costcmp = COSTS_BETTER1;
 		}
+		else if (startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR)
+			costcmp = COSTS_BETTER2;
+		else if (old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR)
+			costcmp = COSTS_BETTER1;
+		else
+			costcmp = COSTS_EQUAL;
+
+		/*
+		 * If one path wins on startup cost and the other on total cost, we
+		 * can't say for sure which is better.
+		 */
+		if (costcmp == COSTS_DIFFERENT)
+			continue;
+
+		/*
+		 * If the two paths have different pathkeys, we can't say for sure
+		 * which is better.
+		 */
+		keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys);
+		if (keyscmp == PATHKEYS_DIFFERENT)
+			continue;
+
+		/*
+		 * If the existing path is cheaper and the pathkeys are equal or worse,
+		 * the new path is not interesting.
+		 */
+		if (costcmp == COSTS_BETTER2 && keyscmp != PATHKEYS_BETTER1)
+			return false;
+
+		/*
+		 * If the new path is cheaper and the pathkeys are equal or better,
+		 * it is definitely interesting.
+		 */
+		if (costcmp == COSTS_BETTER1 && keyscmp != PATHKEYS_BETTER2)
+			return true;
 	}
 
 	/*
@@ -959,14 +997,9 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
 	 * clearly good enough that it might replace one. Compare it to
 	 * non-parallel plans. If it loses even before accounting for the cost of
 	 * the Gather node, we should definitely reject it.
-	 *
-	 * Note that we pass the total_cost to add_path_precheck twice. This is
-	 * because it's never advantageous to consider the startup cost of a
-	 * partial path; the resulting plans, if run in parallel, will be run to
-	 * completion.
 	 */
-	if (!add_path_precheck(parent_rel, disabled_nodes, total_cost, total_cost,
-						   pathkeys, NULL))
+	if (!add_path_precheck(parent_rel, disabled_nodes, startup_cost,
+						   total_cost, pathkeys, NULL))
 		return false;
 
 	return true;

diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 8297870cf7..b8b2204eeb 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -61,7 +61,7 @@ extern bool add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
 							  List *pathkeys, Relids required_outer);
 extern void add_partial_path(RelOptInfo *parent_rel, Path *new_path);
 extern bool add_partial_path_precheck(RelOptInfo *parent_rel,
-									  int disabled_nodes,
+									  int disabled_nodes, Cost startup_cost,
 									  Cost total_cost, List *pathkeys);
 extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel,
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
index fdec5b9ba5..29090dca1b 100644
--- a/src/test/regress/expected/incremental_sort.out
+++ b/src/test/regress/expected/incremental_sort.out
@@ -1450,21 +1450,23 @@ explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1
 set enable_incremental_sort = on;
 explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
-                              QUERY PLAN                               
------------------------------------------------------------------------
+                                 QUERY PLAN                                  
+----------------------------------------------------------------------------
  Limit
    ->  Incremental Sort
          Sort Key: a, b, (sum(c))
          Presorted Key: a, b
-         ->  GroupAggregate
+         ->  Finalize GroupAggregate
                Group Key: a, b
                ->  Gather Merge
                      Workers Planned: 2
-                     ->  Incremental Sort
-                           Sort Key: a, b
-                           Presorted Key: a
-                           ->  Parallel Index Scan using t_a_idx on t
-(12 rows)
+                     ->  Partial GroupAggregate
+                           Group Key: a, b
+                           ->  Incremental Sort
+                                 Sort Key: a, b
+                                 Presorted Key: a
+                                 ->  Parallel Index Scan using t_a_idx on t
+(14 rows)
 
 -- Incremental sort vs. set operations with varno 0
 set enable_hashagg to off;
diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out
index 4749f6ed70..bc7cc76467 100644
--- a/src/test/regress/expected/join_hash.out
+++ b/src/test/regress/expected/join_hash.out
@@ -76,8 +76,8 @@ insert into extremely_skewed
 update pg_class
   set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192
 where relname = 'extremely_skewed';
--- Make a relation with a couple of enormous tuples.
-create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t;
+-- Make a relation with several enormous tuples.
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t;
 alter table wide set (parallel_workers = 2);
 -- The "optimal" case: the hash table fits in memory; we plan for 1
 -- batch, we stick to that number, and peak memory usage stays within
@@ -922,7 +922,7 @@ set work_mem = '128kB';
 set hash_mem_multiplier = 1.0;
 explain (costs off)
   select length(max(s.t))
-  from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+  from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
                            QUERY PLAN                            
 ----------------------------------------------------------------
  Finalize Aggregate
    ->  Gather
          Workers Planned: 2
          ->  Partial Aggregate
                ->  Parallel Hash Left Join
                      Hash Cond: (wide.id = wide_1.id)
                      ->  Parallel Seq Scan on wide
                      ->  Parallel Hash
                            ->  Parallel Seq Scan on wide wide_1
-(9 rows)
+                                  Filter: (id < 3)
+(10 rows)
 
 select length(max(s.t))
-from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
  length 
 --------
  320000
@@ -947,7 +948,7 @@ select final > 1 as multibatch
   from hash_join_batches(
 $$
   select length(max(s.t))
-  from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+  from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
 $$);
  multibatch 
 ------------
diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql
index 49d3fd6185..53db1754bb 100644
--- a/src/test/regress/sql/join_hash.sql
+++ b/src/test/regress/sql/join_hash.sql
@@ -83,8 +83,8 @@ update pg_class
   set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192
 where relname = 'extremely_skewed';
 
--- Make a relation with a couple of enormous tuples.
-create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t;
+-- Make a relation with several enormous tuples.
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t;
 alter table wide set (parallel_workers = 2);
 
 -- The "optimal" case: the hash table fits in memory; we plan for 1
@@ -496,14 +496,14 @@ set work_mem = '128kB';
 set hash_mem_multiplier = 1.0;
 explain (costs off)
   select length(max(s.t))
-  from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+  from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
 select length(max(s.t))
-from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
 select final > 1 as multibatch
   from hash_join_batches(
 $$
   select length(max(s.t))
-  from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+  from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
 $$);
 rollback to settings;

From 7ae36cde0156e9789e9dc479700a77841ec5d115 Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Sat, 7 Feb 2026 09:36:08 -0500
Subject: [PATCH 146/147] Test pg_plan_advice using a new test_plan_advice
 module.

The TAP test included in this new module runs the regression tests
with pg_plan_advice loaded. It arranges for each query to be planned
twice. The first time, we generate plan advice. The second time, we
replan the query using the resulting advice string.
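
Schematically, the advisor contract that makes this possible looks like
the sketch below. The typedef matches the signature used by the new
module's code; the commentary is a descriptive reading, and the
convention that returning NULL means "no advice" is inferred from the
module's recursion guard rather than stated by pg_plan_advice itself:

    /*
     * An advisor is a callback that pg_plan_advice invokes while a
     * query is being planned.  If it returns a non-NULL string, that
     * string is treated as the supplied advice for the query; if it
     * returns NULL, planning proceeds with no supplied advice.
     */
    typedef char *(*pg_plan_advice_advisor_hook) (PlannerGlobal *glob,
                                                  Query *parse,
                                                  const char *query_string,
                                                  int cursorOptions,
                                                  ExplainState *es);

test_plan_advice's advisor replans the incoming query once, captures the
advice string that pg_plan_advice generated for the resulting plan, and
returns it, so the outer planning pass runs under advice that simply
describes the plan it would have produced anyway.
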
If the tests fail, it means that using pg_plan_advice to tell the
planner to do what it was going to do anyway breaks something, which
indicates a problem either in pg_plan_advice or in the planner.
---
 src/test/modules/Makefile                     |   1 +
 src/test/modules/meson.build                  |   1 +
 src/test/modules/test_plan_advice/Makefile    |  28 ++++
 src/test/modules/test_plan_advice/meson.build |  29 ++++
 .../test_plan_advice/t/001_replan_regress.pl  |  64 ++++++++
 .../test_plan_advice/test_plan_advice.c       | 143 ++++++++++++++++++
 6 files changed, 266 insertions(+)
 create mode 100644 src/test/modules/test_plan_advice/Makefile
 create mode 100644 src/test/modules/test_plan_advice/meson.build
 create mode 100644 src/test/modules/test_plan_advice/t/001_replan_regress.pl
 create mode 100644 src/test/modules/test_plan_advice/test_plan_advice.c

diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 44c7163c1c..e8c31ec8e7 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -38,6 +38,7 @@ SUBDIRS = \
 	  test_oat_hooks \
 	  test_parser \
 	  test_pg_dump \
+	  test_plan_advice \
 	  test_predtest \
 	  test_radixtree \
 	  test_rbtree \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 2634a51993..6998a226fa 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -39,6 +39,7 @@ subdir('test_misc')
 subdir('test_oat_hooks')
 subdir('test_parser')
 subdir('test_pg_dump')
+subdir('test_plan_advice')
 subdir('test_predtest')
 subdir('test_radixtree')
 subdir('test_rbtree')
diff --git a/src/test/modules/test_plan_advice/Makefile b/src/test/modules/test_plan_advice/Makefile
new file mode 100644
index 0000000000..be026ce34b
--- /dev/null
+++ b/src/test/modules/test_plan_advice/Makefile
@@ -0,0 +1,28 @@
+# src/test/modules/test_plan_advice/Makefile
+
+PGFILEDESC = "test_plan_advice - test whether generated plan advice works"
+
+MODULE_big = test_plan_advice
+OBJS = \
+	$(WIN32RES) \
+	test_plan_advice.o
+
+EXTRA_INSTALL = contrib/pg_plan_advice
+
+TAP_TESTS = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_plan_advice
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+override CPPFLAGS += -I$(top_srcdir)/contrib/pg_plan_advice
+
+REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
+export REGRESS_SHLIB
diff --git a/src/test/modules/test_plan_advice/meson.build b/src/test/modules/test_plan_advice/meson.build
new file mode 100644
index 0000000000..afde420bae
--- /dev/null
+++ b/src/test/modules/test_plan_advice/meson.build
@@ -0,0 +1,29 @@
+# Copyright (c) 2022-2026, PostgreSQL Global Development Group
+
+test_plan_advice_sources = files(
+  'test_plan_advice.c',
+)
+
+if host_system == 'windows'
+  test_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_plan_advice',
+    '--FILEDESC', 'test_plan_advice - test whether generated plan advice works',])
+endif
+
+test_plan_advice = shared_module('test_plan_advice',
+  test_plan_advice_sources,
+  include_directories: pg_plan_advice_inc,
+  kwargs: pg_test_mod_args,
+)
+test_install_libs += test_plan_advice
+
+tests += {
+  'name': 'test_plan_advice',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'tap': {
+    'tests': [
+      't/001_replan_regress.pl',
+    ],
+  },
+}
diff --git a/src/test/modules/test_plan_advice/t/001_replan_regress.pl b/src/test/modules/test_plan_advice/t/001_replan_regress.pl
new file mode 100644
index 0000000000..13b1a22570
--- /dev/null
+++ b/src/test/modules/test_plan_advice/t/001_replan_regress.pl
@@ -0,0 +1,64 @@
+# Copyright (c) 2021-2026, PostgreSQL Global Development Group
+
+# Run the core regression tests under pg_plan_advice to check for problems.
+use strict;
+use warnings FATAL => 'all';
+
+use Cwd qw(abs_path);
+use File::Basename qw(dirname);
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Initialize the primary node
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init();
+
+# Set up our desired configuration.
+$node->append_conf('postgresql.conf', <<EOF);
+shared_preload_libraries = 'pg_plan_advice, test_plan_advice'
+EOF
+
+$node->start;
+
+my $srcdir = abs_path("../../../..");
+
+# --dlpath is needed to be able to find the location of regress.so
+# and any libraries the regression tests require.
+my $dlpath = dirname($ENV{REGRESS_SHLIB});
+
+# --outputdir points to the path where to place the output files.
+my $outputdir = $PostgreSQL::Test::Utils::tmp_check;
+
+# --inputdir points to the path of the input files.
+my $inputdir = "$srcdir/src/test/regress";
+
+# Run the tests.
+my $rc =
+  system($ENV{PG_REGRESS} . " "
+	  . "--bindir= "
+	  . "--dlpath=\"$dlpath\" "
+	  . "--host=" . $node->host . " "
+	  . "--port=" . $node->port . " "
+	  . "--schedule=$srcdir/src/test/regress/parallel_schedule "
+	  . "--max-concurrent-tests=20 "
+	  . "--inputdir=\"$inputdir\" "
+	  . "--outputdir=\"$outputdir\"");
+
+# Dump out the regression diffs file, if there is one
+if ($rc != 0)
+{
+	my $diffs = "$outputdir/regression.diffs";
+	if (-e $diffs)
+	{
+		print "=== dumping $diffs ===\n";
+		print slurp_file($diffs);
+		print "=== EOF ===\n";
+	}
+}
+
+# Report results
+is($rc, 0, 'regression tests pass');
+
+done_testing();
diff --git a/src/test/modules/test_plan_advice/test_plan_advice.c b/src/test/modules/test_plan_advice/test_plan_advice.c
new file mode 100644
index 0000000000..996675dc38
--- /dev/null
+++ b/src/test/modules/test_plan_advice/test_plan_advice.c
@@ -0,0 +1,143 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_plan_advice.c
+ *		Test pg_plan_advice by planning every query with generated advice.
+ * + * With this module loaded, every time a query is executed, we end up + * planning it twice. The first time we plan it, we generate plan advice, + * which we then feed back to pg_plan_advice as the supplied plan advice. + * It is then planned a second time using that advice. This hopefully + * allows us to detect cases where the advice is incorrect or causes + * failures or plan changes for some reason. + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * src/test/modules/test_plan_advice/test_plan_advice.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "fmgr.h" +#include "optimizer/optimizer.h" +#include "pg_plan_advice.h" +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +static bool in_recursion = false; + +static char *test_plan_advice_advisor(PlannerGlobal *glob, + Query *parse, + const char *query_string, + int cursorOptions, + ExplainState *es); +static DefElem *find_defelem_by_defname(List *deflist, char *defname); + +/* + * Initialize this module. + */ +void +_PG_init(void) +{ + void *(*add_advisor_fn) (pg_plan_advice_advisor_hook hook); + + /* + * Ask pg_plan_advice to get advice strings from test_plan_advice_advisor + */ + add_advisor_fn = + load_external_function("pg_plan_advice", "pg_plan_advice_add_advisor", + true, NULL); + + (*add_advisor_fn) (test_plan_advice_advisor); +} + +/* + * Re-plan the given query and return the generated advice string as the + * supplied advice. + */ +static char * +test_plan_advice_advisor(PlannerGlobal *glob, Query *parse, + const char *query_string, int cursorOptions, + ExplainState *es) +{ + PlannedStmt *pstmt; + int save_nestlevel = 0; + DefElem *pgpa_item; + DefElem *advice_string_item; + + /* + * Since this function is called from the planner and triggers planning, + * we need a recursion guard. + */ + if (in_recursion) + return NULL; + + PG_TRY(); + { + in_recursion = true; + + /* + * Planning can trigger expression evaluation, which can result in + * sending NOTICE messages or other output to the client. To avoid + * that, we set client_min_messages = ERROR in the hopes of getting + * the same output with and without this module. + * + * We also need to set pg_plan_advice.always_store_advice_details so + * that pg_plan_advice will generate an advice string, since the whole + * point of this function is to get access to that. + */ + save_nestlevel = NewGUCNestLevel(); + set_config_option("client_min_messages", "error", + PGC_SUSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + set_config_option("pg_plan_advice.always_store_advice_details", "true", + PGC_SUSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + + /* + * Replan. We must copy the Query, because the planner modifies it. + * (As noted elsewhere, that's unfortunate; perhaps it will be fixed + * some day.) 
+ */ + pstmt = planner(copyObject(parse), query_string, cursorOptions, + glob->boundParams, es); + } + PG_FINALLY(); + { + in_recursion = false; + } + PG_END_TRY(); + + /* Roll back any GUC changes */ + if (save_nestlevel > 0) + AtEOXact_GUC(false, save_nestlevel); + + /* Extract and return the advice string */ + pgpa_item = find_defelem_by_defname(pstmt->extension_state, + "pg_plan_advice"); + if (pgpa_item == NULL) + elog(ERROR, "extension state for pg_plan_advice not found"); + advice_string_item = find_defelem_by_defname((List *) pgpa_item->arg, + "advice_string"); + if (advice_string_item == NULL) + elog(ERROR, + "advice string for pg_plan_advice not found in extension state"); + return strVal(advice_string_item->arg); +} + +/* + * Search a list of DefElem objects for a given defname. + */ +static DefElem * +find_defelem_by_defname(List *deflist, char *defname) +{ + foreach_node(DefElem, item, deflist) + { + if (strcmp(item->defname, defname) == 0) + return item; + } + + return NULL; +} From 8c4d351326c3c04f57178a28229f9eaedc4bce3a Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 9 Feb 2026 11:13:23 -0500 Subject: [PATCH 147/147] in test_plan_advice, enable feedback warnings --- src/test/modules/test_plan_advice/t/001_replan_regress.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/modules/test_plan_advice/t/001_replan_regress.pl b/src/test/modules/test_plan_advice/t/001_replan_regress.pl index 13b1a22570..303210f13b 100644 --- a/src/test/modules/test_plan_advice/t/001_replan_regress.pl +++ b/src/test/modules/test_plan_advice/t/001_replan_regress.pl @@ -19,6 +19,7 @@ $node->append_conf('postgresql.conf', <start;