IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
Buffer rbuf, BTStack stack, bool isroot, bool isonly);
-static Buffer _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
+static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
/* we need an insertion scan key to do our search, so build one */
- itup_key = _bt_mkscankey(rel, heapRel, itup);
+ itup_key = _bt_mkscankey(rel, itup);
if (checkingunique)
{
* indexes.
*/
static void
-_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
+_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
+ BTStack stack)
{
Page page;
BTPageOpaque opaque;
Buffer rbuf;
BlockNumber rblkno;
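+ /* stepping right may have to finish an incomplete split, which needs heaprel */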
+ Assert(heaprel != NULL);
page = BufferGetPage(insertstate->buf);
opaque = BTPageGetOpaque(page);
/*
* Every internal page should have exactly one negative infinity item at
- * all times. Only _bt_split() and _bt_newroot() should add items that
+ * all times. Only _bt_split() and _bt_newlevel() should add items that
* become negative infinity items through truncation, since they're the
* only routines that allocate new internal pages.
*/
* only one on its tree level, but was not the root, it may have been
* the "fast root". We need to ensure that the fast root link points
* at or above the current page. We can safely acquire a lock on the
- * metapage here --- see comments for _bt_newroot().
+ * metapage here --- see comments for _bt_newlevel().
*/
if (unlikely(split_only_page))
{
Assert(!isleaf);
Assert(BufferIsValid(cbuf));
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
* call _bt_getrootheight while holding a buffer lock.
*/
if (BlockNumberIsValid(blockcache) &&
- _bt_getrootheight(rel, heaprel) >= BTREE_FASTPATH_MIN_LEVEL)
+ _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
RelationSetTargetBlock(rel, blockcache);
}
* way because it avoids an unnecessary PANIC when either origpage or its
* existing sibling page are corrupt.
*/
- rbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
+ rbuf = _bt_allocbuf(rel, heaprel);
rightpage = BufferGetPage(rbuf);
rightpagenumber = BufferGetBlockNumber(rbuf);
- /* rightpage was initialized by _bt_getbuf */
+ /* rightpage was initialized by _bt_allocbuf */
*/
if (!isrightmost)
{
- sbuf = _bt_getbuf(rel, heaprel, oopaque->btpo_next, BT_WRITE);
+ sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf);
sopaque = BTPageGetOpaque(spage);
if (sopaque->btpo_prev != origpagenumber)
bool isroot,
bool isonly)
{
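+ /* a root split (_bt_newlevel) or finishing an incomplete split allocates new pages */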
+ Assert(heaprel != NULL);
+
/*
* Here we have to do something Lehman and Yao don't talk about: deal with
* a root split and construction of a new root. If our stack is empty
Assert(stack == NULL);
Assert(isonly);
- /* create a new root node and update the metapage */
- rootbuf = _bt_newroot(rel, heaprel, buf, rbuf);
+ /* create a new root node one level up and update the metapage */
+ rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
/* release the split buffers */
_bt_relbuf(rel, rootbuf);
_bt_relbuf(rel, rbuf);
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* Find the leftmost page at the next level up */
- pbuf = _bt_get_endpoint(rel, heaprel, opaque->btpo_level + 1, false,
- NULL);
+ pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
/* Set up a phony stack entry pointing there */
stack = &fakestack;
stack->bts_blkno = BufferGetBlockNumber(pbuf);
*
* On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
* and unpinned.
+ *
+ * Caller must provide a valid heaprel, since finishing a page split requires
+ * allocating a new page if and when the parent page splits in turn.
*/
void
_bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
bool wasonly;
Assert(P_INCOMPLETE_SPLIT(lpageop));
+ Assert(heaprel != NULL);
/* Lock right sibling, the one missing the downlink */
- rbuf = _bt_getbuf(rel, heaprel, lpageop->btpo_next, BT_WRITE);
+ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
rpage = BufferGetPage(rbuf);
rpageop = BTPageGetOpaque(rpage);
BTMetaPageData *metad;
/* acquire lock on the metapage */
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
Page page;
BTPageOpaque opaque;
- buf = _bt_getbuf(rel, heaprel, blkno, BT_WRITE);
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
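+ /* heaprel is required in case we have to finish an incomplete split */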
+ Assert(heaprel != NULL);
if (P_INCOMPLETE_SPLIT(opaque))
{
_bt_finish_split(rel, heaprel, buf, stack->bts_parent);
}
/*
- * _bt_newroot() -- Create a new root page for the index.
+ * _bt_newlevel() -- Create a new level above the root page.
*
* We've just split the old root page and need to create a new one.
* In order to do this, we add a new root page to the file, then lock
* lbuf, rbuf & rootbuf.
*/
static Buffer
-_bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
+_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
{
Buffer rootbuf;
Page lpage,
lopaque = BTPageGetOpaque(lpage);
/* get a new root page */
- rootbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
+ rootbuf = _bt_allocbuf(rel, heaprel);
rootpage = BufferGetPage(rootbuf);
rootblknum = BufferGetBlockNumber(rootbuf);
/* acquire lock on the metapage */
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
#include "utils/snapmgr.h"
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
-static void _bt_log_reuse_page(Relation rel, Relation heaprel, BlockNumber blkno,
- FullTransactionId safexid);
-static void _bt_delitems_delete(Relation rel, Relation heaprel, Buffer buf,
+static void _bt_delitems_delete(Relation rel, Buffer buf,
TransactionId snapshotConflictHorizon,
+ bool isCatalogRel,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
* index tuples needed to be deleted.
*/
bool
-_bt_vacuum_needs_cleanup(Relation rel, Relation heaprel)
+_bt_vacuum_needs_cleanup(Relation rel)
{
Buffer metabuf;
Page metapg;
*
* Note that we deliberately avoid using cached version of metapage here.
*/
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
btm_version = metad->btm_version;
* finalized.
*/
void
-_bt_set_cleanup_info(Relation rel, Relation heaprel, BlockNumber num_delpages)
+_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
{
Buffer metabuf;
Page metapg;
* no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
* to be consistent.
*/
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
* NOTE that the returned root page will have only a read lock set
* on it even if access = BT_WRITE!
*
+ * If access = BT_WRITE, heaprel must be set; otherwise caller can just
+ * pass NULL. See _bt_allocbuf for an explanation.
+ *
* The returned page is not necessarily the true root --- it could be
* a "fast root" (a page that is alone in its level due to deletions).
* Also, if the root page is split while we are "in flight" to it,
uint32 rootlevel;
BTMetaPageData *metad;
+ Assert(access == BT_READ || heaprel != NULL);
+
/*
* Try to use previously-cached metapage data to find the root. This
* normally saves one buffer access per index search, which is a very
Assert(rootblkno != P_NONE);
rootlevel = metad->btm_fastlevel;
- rootbuf = _bt_getbuf(rel, heaprel, rootblkno, BT_READ);
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
rootpage = BufferGetPage(rootbuf);
rootopaque = BTPageGetOpaque(rootpage);
rel->rd_amcache = NULL;
}
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metad = _bt_getmeta(rel, metabuf);
/* if no root page initialized yet, do it */
* the new root page. Since this is the first page in the tree, it's
* a leaf as well as the root.
*/
- rootbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
+ rootbuf = _bt_allocbuf(rel, heaprel);
rootblkno = BufferGetBlockNumber(rootbuf);
rootpage = BufferGetPage(rootbuf);
rootopaque = BTPageGetOpaque(rootpage);
* moving to the root --- that'd deadlock against any concurrent root split.)
*/
Buffer
-_bt_gettrueroot(Relation rel, Relation heaprel)
+_bt_gettrueroot(Relation rel)
{
Buffer metabuf;
Page metapg;
pfree(rel->rd_amcache);
rel->rd_amcache = NULL;
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = BTPageGetOpaque(metapg);
metad = BTPageGetMeta(metapg);
* about updating previously cached data.
*/
int
-_bt_getrootheight(Relation rel, Relation heaprel)
+_bt_getrootheight(Relation rel)
{
BTMetaPageData *metad;
{
Buffer metabuf;
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metad = _bt_getmeta(rel, metabuf);
/*
* pg_upgrade'd from Postgres 12.
*/
void
-_bt_metaversion(Relation rel, Relation heaprel, bool *heapkeyspace, bool *allequalimage)
+_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
BTMetaPageData *metad;
{
Buffer metabuf;
- metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metad = _bt_getmeta(rel, metabuf);
/*
}
/*
- * Log the reuse of a page from the FSM.
- */
-static void
-_bt_log_reuse_page(Relation rel, Relation heaprel, BlockNumber blkno,
- FullTransactionId safexid)
-{
- xl_btree_reuse_page xlrec_reuse;
-
- /*
- * Note that we don't register the buffer with the record, because this
- * operation doesn't modify the page. This record only exists to provide a
- * conflict point for Hot Standby.
- */
-
- /* XLOG stuff */
- xlrec_reuse.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
- xlrec_reuse.locator = rel->rd_locator;
- xlrec_reuse.block = blkno;
- xlrec_reuse.snapshotConflictHorizon = safexid;
-
- XLogBeginInsert();
- XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
-
- XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
-}
-
-/*
- * _bt_getbuf() -- Get a buffer by block number for read or write.
- *
- * blkno == P_NEW means to get an unallocated index page. The page
- * will be initialized before returning it.
+ * _bt_getbuf() -- Get an existing block in a buffer, for read or write.
*
* The general rule in nbtree is that it's never okay to access a
* page without holding both a buffer pin and a buffer lock on
* When this routine returns, the appropriate lock is set on the
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned"). Also, we apply
- * _bt_checkpage to sanity-check the page (except in P_NEW case),
- * and perform Valgrind client requests that help Valgrind detect
- * unsafe page accesses.
+ * _bt_checkpage to sanity-check the page, and perform Valgrind
+ * client requests that help Valgrind detect unsafe page accesses.
*
* Note: raw LockBuffer() calls are disallowed in nbtree; all
* buffer lock requests need to go through wrapper functions such
* as _bt_lockbuf().
*/
Buffer
-_bt_getbuf(Relation rel, Relation heaprel, BlockNumber blkno, int access)
+_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
Buffer buf;
- if (blkno != P_NEW)
- {
- /* Read an existing block of the relation */
- buf = ReadBuffer(rel, blkno);
- _bt_lockbuf(rel, buf, access);
- _bt_checkpage(rel, buf);
- }
- else
- {
- Page page;
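+ /* P_NEW is no longer accepted here; new pages are allocated by _bt_allocbuf */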
+ Assert(BlockNumberIsValid(blkno));
- Assert(access == BT_WRITE);
+ /* Read an existing block of the relation */
+ buf = ReadBuffer(rel, blkno);
+ _bt_lockbuf(rel, buf, access);
+ _bt_checkpage(rel, buf);
- /*
- * First see if the FSM knows of any free pages.
- *
- * We can't trust the FSM's report unreservedly; we have to check that
- * the page is still free. (For example, an already-free page could
- * have been re-used between the time the last VACUUM scanned it and
- * the time the VACUUM made its FSM updates.)
- *
- * In fact, it's worse than that: we can't even assume that it's safe
- * to take a lock on the reported page. If somebody else has a lock
- * on it, or even worse our own caller does, we could deadlock. (The
- * own-caller scenario is actually not improbable. Consider an index
- * on a serial or timestamp column. Nearly all splits will be at the
- * rightmost page, so it's entirely likely that _bt_split will call us
- * while holding a lock on the page most recently acquired from FSM. A
- * VACUUM running concurrently with the previous split could well have
- * placed that page back in FSM.)
- *
- * To get around that, we ask for only a conditional lock on the
- * reported page. If we fail, then someone else is using the page,
- * and we may reasonably assume it's not free. (If we happen to be
- * wrong, the worst consequence is the page will be lost to use till
- * the next VACUUM, which is no big problem.)
- */
- for (;;)
+ return buf;
+}
+
+/*
+ * _bt_allocbuf() -- Allocate a new block/page.
+ *
+ * Returns a write-locked buffer containing an unallocated nbtree page.
+ *
+ * Callers are required to pass a valid heaprel. We need heaprel so that we
+ * can handle generating a snapshotConflictHorizon that makes reusing a page
+ * from the FSM safe for queries that may be running on standbys.
+ */
+Buffer
+_bt_allocbuf(Relation rel, Relation heaprel)
+{
+ Buffer buf;
+ BlockNumber blkno;
+ Page page;
+
+ Assert(heaprel != NULL);
+
+ /*
+ * First see if the FSM knows of any free pages.
+ *
+ * We can't trust the FSM's report unreservedly; we have to check that the
+ * page is still free. (For example, an already-free page could have been
+ * re-used between the time the last VACUUM scanned it and the time the
+ * VACUUM made its FSM updates.)
+ *
+ * In fact, it's worse than that: we can't even assume that it's safe to
+ * take a lock on the reported page. If somebody else has a lock on it,
+ * or even worse our own caller does, we could deadlock. (The own-caller
+ * scenario is actually not improbable. Consider an index on a serial or
+ * timestamp column. Nearly all splits will be at the rightmost page, so
+ * it's entirely likely that _bt_split will call us while holding a lock
+ * on the page most recently acquired from FSM. A VACUUM running
+ * concurrently with the previous split could well have placed that page
+ * back in FSM.)
+ *
+ * To get around that, we ask for only a conditional lock on the reported
+ * page. If we fail, then someone else is using the page, and we may
+ * reasonably assume it's not free. (If we happen to be wrong, the worst
+ * consequence is the page will be lost to use till the next VACUUM, which
+ * is no big problem.)
+ */
+ for (;;)
+ {
+ blkno = GetFreeIndexPage(rel);
+ if (blkno == InvalidBlockNumber)
+ break;
+ buf = ReadBuffer(rel, blkno);
+ if (_bt_conditionallockbuf(rel, buf))
{
- blkno = GetFreeIndexPage(rel);
- if (blkno == InvalidBlockNumber)
- break;
- buf = ReadBuffer(rel, blkno);
- if (_bt_conditionallockbuf(rel, buf))
+ page = BufferGetPage(buf);
+
+ /*
+ * It's possible to find an all-zeroes page in an index. For
+ * example, a backend might successfully extend the relation one
+ * page and then crash before it is able to make a WAL entry for
+ * adding the page. If we find a zeroed page then reclaim it
+ * immediately.
+ */
+ if (PageIsNew(page))
{
- page = BufferGetPage(buf);
+ /* Okay to use page. Initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+ if (BTPageIsRecyclable(page, heaprel))
+ {
/*
- * It's possible to find an all-zeroes page in an index. For
- * example, a backend might successfully extend the relation
- * one page and then crash before it is able to make a WAL
- * entry for adding the page. If we find a zeroed page then
- * reclaim it immediately.
+ * If we are generating WAL for Hot Standby then create a WAL
+ * record that will allow us to conflict with queries running
+ * on standby, in case they have snapshots older than the
+ * safexid value.
*/
- if (PageIsNew(page))
+ if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
{
- /* Okay to use page. Initialize and return it. */
- _bt_pageinit(page, BufferGetPageSize(buf));
- return buf;
- }
+ xl_btree_reuse_page xlrec_reuse;
- if (BTPageIsRecyclable(page, heaprel))
- {
/*
- * If we are generating WAL for Hot Standby then create a
- * WAL record that will allow us to conflict with queries
- * running on standby, in case they have snapshots older
- * than safexid value
+ * Note that we don't register the buffer with the record,
+ * because this operation doesn't modify the page (that
+ * already happened, back when VACUUM deleted the page).
+ * This record only exists to provide a conflict point for
+ * Hot Standby. See record REDO routine comments.
*/
- if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
- _bt_log_reuse_page(rel, heaprel, blkno,
- BTPageGetDeleteXid(page));
+ xlrec_reuse.locator = rel->rd_locator;
+ xlrec_reuse.block = blkno;
+ xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
+ xlrec_reuse.isCatalogRel =
+ RelationIsAccessibleInLogicalDecoding(heaprel);
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
- /* Okay to use page. Re-initialize and return it. */
- _bt_pageinit(page, BufferGetPageSize(buf));
- return buf;
+ XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}
- elog(DEBUG2, "FSM returned nonrecyclable page");
- _bt_relbuf(rel, buf);
- }
- else
- {
- elog(DEBUG2, "FSM returned nonlockable page");
- /* couldn't get lock, so just drop pin */
- ReleaseBuffer(buf);
+
+ /* Okay to use page. Re-initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
}
+ elog(DEBUG2, "FSM returned nonrecyclable page");
+ _bt_relbuf(rel, buf);
}
+ else
+ {
+ elog(DEBUG2, "FSM returned nonlockable page");
+ /* couldn't get lock, so just drop pin */
+ ReleaseBuffer(buf);
+ }
+ }
- /*
- * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or
- * we risk a race condition against btvacuumscan --- see comments
- * therein. This forces us to repeat the valgrind request that
- * _bt_lockbuf() otherwise would make, as we can't use _bt_lockbuf()
- * without introducing a race.
- */
- buf = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM, NULL,
- EB_LOCK_FIRST);
- if (!RelationUsesLocalBuffers(rel))
- VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+ /*
+ * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
+ * risk a race condition against btvacuumscan --- see comments therein.
+ * This forces us to repeat the valgrind request that _bt_lockbuf()
+ * otherwise would make, as we can't use _bt_lockbuf() without introducing
+ * a race.
+ */
+ buf = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
- /* Initialize the new page before returning it */
- page = BufferGetPage(buf);
- Assert(PageIsNew(page));
- _bt_pageinit(page, BufferGetPageSize(buf));
- }
+ /* Initialize the new page before returning it */
+ page = BufferGetPage(buf);
+ Assert(PageIsNew(page));
+ _bt_pageinit(page, BufferGetPageSize(buf));
- /* ref count and lock type are correct */
return buf;
}
/*
* _bt_relandgetbuf() -- release a locked buffer and get another one.
*
- * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
- * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer
- * then it reduces to just _bt_getbuf; allowing this case simplifies some
- * callers.
+ * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is
+ * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
+ * simplifies some callers.
*
* The original motivation for using this was to avoid two entries to the
* bufmgr when one would do. However, now it's mainly just a notational
{
Buffer buf;
- Assert(blkno != P_NEW);
+ Assert(BlockNumberIsValid(blkno));
if (BufferIsValid(obuf))
_bt_unlockbuf(rel, obuf);
buf = ReleaseAndReadBuffer(obuf, rel, blkno);
* (a version that lacks the TIDs that are to be deleted).
*
* This is nearly the same as _bt_delitems_vacuum as far as what it does to
- * the page, but it needs its own snapshotConflictHorizon (caller gets this
- * from tableam). This is used by the REDO routine to generate recovery
+ * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
+ * (from the tableam). This is used by the REDO routine to generate recovery
* conflicts. The other difference is that only _bt_delitems_vacuum will
* clear page's VACUUM cycle ID.
*/
static void
-_bt_delitems_delete(Relation rel, Relation heaprel, Buffer buf,
- TransactionId snapshotConflictHorizon,
+_bt_delitems_delete(Relation rel, Buffer buf,
+ TransactionId snapshotConflictHorizon, bool isCatalogRel,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable)
{
XLogRecPtr recptr;
xl_btree_delete xlrec_delete;
- xlrec_delete.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
xlrec_delete.ndeleted = ndeletable;
xlrec_delete.nupdated = nupdatable;
+ xlrec_delete.isCatalogRel = isCatalogRel;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
{
Page page = BufferGetPage(buf);
TransactionId snapshotConflictHorizon;
+ bool isCatalogRel;
OffsetNumber postingidxoffnum = InvalidOffsetNumber;
int ndeletable = 0,
nupdatable = 0;
/* Use tableam interface to determine which tuples to delete first */
snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
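+ /* also determine isCatalogRel up front, since _bt_delitems_delete no longer takes heapRel */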
+ isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
/* Should not WAL-log snapshotConflictHorizon unless it's required */
if (!XLogStandbyInfoActive())
}
/* Physically delete tuples (or TIDs) using deletable (or updatable) */
- _bt_delitems_delete(rel, heapRel, buf, snapshotConflictHorizon, deletable,
- ndeletable, updatable, nupdatable);
+ _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
+ deletable, ndeletable, updatable, nupdatable);
/* be tidy */
for (int i = 0; i < nupdatable; i++)
* same level must always be locked left to right to avoid deadlocks.
*/
static bool
-_bt_leftsib_splitflag(Relation rel, Relation heaprel, BlockNumber leftsib,
- BlockNumber target)
+_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
{
Buffer buf;
Page page;
if (leftsib == P_NONE)
return false;
- buf = _bt_getbuf(rel, heaprel, leftsib, BT_READ);
+ buf = _bt_getbuf(rel, leftsib, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
* to-be-deleted subtree.)
*/
static bool
-_bt_rightsib_halfdeadflag(Relation rel, Relation heaprel, BlockNumber leafrightsib)
+_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
{
Buffer buf;
Page page;
Assert(leafrightsib != P_NONE);
- buf = _bt_getbuf(rel, heaprel, leafrightsib, BT_READ);
+ buf = _bt_getbuf(rel, leafrightsib, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
* marked with INCOMPLETE_SPLIT flag before proceeding
*/
Assert(leafblkno == scanblkno);
- if (_bt_leftsib_splitflag(rel, vstate->info->heaprel, leftsib, leafblkno))
+ if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
{
ReleaseBuffer(leafbuf);
return;
}
/* we need an insertion scan key for the search, so build one */
- itup_key = _bt_mkscankey(rel, vstate->info->heaprel, targetkey);
+ itup_key = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page with matching pivot/high key */
itup_key->pivotsearch = true;
- stack = _bt_search(rel, vstate->info->heaprel, itup_key,
- &sleafbuf, BT_READ, NULL);
+ stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ,
+ NULL);
/* won't need a second lock or pin on leafbuf */
_bt_relbuf(rel, sleafbuf);
* leafbuf page half-dead.
*/
Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
- if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf, stack))
+ if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
+ stack))
{
_bt_relbuf(rel, leafbuf);
return;
if (!rightsib_empty)
break;
- leafbuf = _bt_getbuf(rel, vstate->info->heaprel, rightsib, BT_WRITE);
+ leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
}
}
* but may include additional internal pages (at most one per level of the
* tree below the root).
*
+ * Caller must pass a valid heaprel, since it's just about possible that our
+ * call to _bt_lock_subtree_parent will need to allocate a new index page to
+ * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel.
+ *
* Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
* the rightmost child of its parent (and parent has more than one downlink).
* Returns 'true' when the first stage of page deletion completed
Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
+ Assert(heaprel != NULL);
/*
* Save info about the leaf page.
* delete the downlink. It would fail the "right sibling of target page
* is also the next child in parent page" cross-check below.
*/
- if (_bt_rightsib_halfdeadflag(rel, heaprel, leafrightsib))
+ if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
{
elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
leafblkno, leafrightsib);
Assert(target != leafblkno);
/* Fetch the block number of the target's left sibling */
- buf = _bt_getbuf(rel, vstate->info->heaprel, target, BT_READ);
+ buf = _bt_getbuf(rel, target, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
leftsib = opaque->btpo_prev;
_bt_lockbuf(rel, leafbuf, BT_WRITE);
if (leftsib != P_NONE)
{
- lbuf = _bt_getbuf(rel, vstate->info->heaprel, leftsib, BT_WRITE);
+ lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
page = BufferGetPage(lbuf);
opaque = BTPageGetOpaque(page);
while (P_ISDELETED(opaque) || opaque->btpo_next != target)
CHECK_FOR_INTERRUPTS();
/* step right one page */
- lbuf = _bt_getbuf(rel, vstate->info->heaprel, leftsib, BT_WRITE);
+ lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
page = BufferGetPage(lbuf);
opaque = BTPageGetOpaque(page);
}
* And next write-lock the (current) right sibling.
*/
rightsib = opaque->btpo_next;
- rbuf = _bt_getbuf(rel, vstate->info->heaprel, rightsib, BT_WRITE);
+ rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
page = BufferGetPage(rbuf);
opaque = BTPageGetOpaque(page);
* of doing so are slim, and the locking considerations daunting.)
*
* We can safely acquire a lock on the metapage here --- see comments for
- * _bt_newroot().
+ * _bt_newlevel().
*/
if (leftsib == P_NONE && rightsib_is_rightmost)
{
if (P_RIGHTMOST(opaque))
{
/* rightsib will be the only one left on the level */
- metabuf = _bt_getbuf(rel, vstate->info->heaprel, BTREE_METAPAGE,
- BT_WRITE);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
*
* Note: We deliberately avoid completing incomplete splits here.
*/
- if (_bt_leftsib_splitflag(rel, heaprel, leftsibparent, parent))
+ if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
return false;
/* Recurse to examine child page's grandparent page */
Relation heaprel = vstate->info->heaprel;
Assert(stats->pages_newly_deleted >= vstate->npendingpages);
+ Assert(heaprel != NULL);
if (vstate->npendingpages == 0)
{
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
ScanDirection dir);
-static Buffer _bt_walk_left(Relation rel, Relation heaprel, Buffer buf,
- Snapshot snapshot);
+static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
* When access = BT_READ, an empty index will result in *bufP being set to
* InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
* during the search will be finished.
+ *
+ * heaprel must be provided by callers that pass access = BT_WRITE, since we
+ * might need to allocate a new root page for caller -- see _bt_allocbuf.
*/
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
BTStack stack_in = NULL;
int page_access = BT_READ;
+ /* heaprel must be set whenever _bt_allocbuf is reachable */
+ Assert(access == BT_READ || access == BT_WRITE);
+ Assert(access == BT_READ || heaprel != NULL);
+
/* Get the root page to start with */
*bufP = _bt_getroot(rel, heaprel, access);
*
* If forupdate is true, we will attempt to finish any incomplete splits
* that we encounter. This is required when locking a target page for an
- * insertion, because we don't allow inserting on a page before the split
- * is completed. 'stack' is only used if forupdate is true.
+ * insertion, because we don't allow inserting on a page before the split is
+ * completed. 'heaprel' and 'stack' are only used if forupdate is true.
*
* On entry, we have the buffer pinned and a lock of the type specified by
* 'access'. If we move right, we release the buffer and lock and acquire
BTPageOpaque opaque;
int32 cmpval;
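+ /* only the forupdate case can reach _bt_finish_split, which needs heaprel */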
+ Assert(!forupdate || heaprel != NULL);
+
/*
* When nextkey = false (normal case): if the scan key that brought us to
* this page is > the high key stored on the page, then the page has split
_bt_relbuf(rel, buf);
/* re-acquire the lock in the right mode, and re-check */
- buf = _bt_getbuf(rel, heaprel, blkno, access);
+ buf = _bt_getbuf(rel, blkno, access);
continue;
}
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
- Relation heaprel = scan->heapRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Buffer buf;
BTStack stack;
}
/* Initialize remaining insertion scan key fields */
- _bt_metaversion(rel, heaprel, &inskey.heapkeyspace, &inskey.allequalimage);
+ _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
inskey.anynullkeys = false; /* unused */
inskey.nextkey = nextkey;
inskey.pivotsearch = false;
* Use the manufactured insertion scan key to descend the tree and
* position ourselves on the target leaf page.
*/
- stack = _bt_search(rel, heaprel, &inskey, &buf, BT_READ, scan->xs_snapshot);
+ stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot);
/* don't need to keep the stack around... */
_bt_freestack(stack);
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
/* step right one page */
- so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, BT_READ);
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(so->currPos.buf);
TestForOldSnapshot(scan->xs_snapshot, rel, page);
opaque = BTPageGetOpaque(page);
if (BTScanPosIsPinned(so->currPos))
_bt_lockbuf(rel, so->currPos.buf, BT_READ);
else
- so->currPos.buf = _bt_getbuf(rel, scan->heapRelation,
- so->currPos.currPage, BT_READ);
+ so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
for (;;)
{
}
/* Step to next physical page */
- so->currPos.buf = _bt_walk_left(rel, scan->heapRelation,
- so->currPos.buf, scan->xs_snapshot);
+ so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
+ scan->xs_snapshot);
/* if we're physically at end of index, return failure */
if (so->currPos.buf == InvalidBuffer)
BTScanPosInvalidate(so->currPos);
return false;
}
- so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno,
- BT_READ);
+ so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
}
}
}
* again if it's important.
*/
static Buffer
-_bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot)
+_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
Page page;
BTPageOpaque opaque;
_bt_relbuf(rel, buf);
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
- buf = _bt_getbuf(rel, heaprel, blkno, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
TestForOldSnapshot(snapshot, rel, page);
opaque = BTPageGetOpaque(page);
* The returned buffer is pinned and read-locked.
*/
Buffer
-_bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost,
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
Snapshot snapshot)
{
Buffer buf;
* smarter about intermediate levels.)
*/
if (level == 0)
- buf = _bt_getroot(rel, heaprel, BT_READ);
+ buf = _bt_getroot(rel, NULL, BT_READ);
else
- buf = _bt_gettrueroot(rel, heaprel);
+ buf = _bt_gettrueroot(rel);
if (!BufferIsValid(buf))
return InvalidBuffer;
* version of _bt_search(). We don't maintain a stack since we know we
* won't need it.
*/
- buf = _bt_get_endpoint(rel, scan->heapRelation, 0,
- ScanDirectionIsBackward(dir), scan->xs_snapshot);
+ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
if (!BufferIsValid(buf))
{