Make table_scan_bitmap_next_block() async-friendly
author Melanie Plageman <[email protected]>
Fri, 25 Oct 2024 14:11:58 +0000 (10:11 -0400)
committer Melanie Plageman <[email protected]>
Fri, 25 Oct 2024 14:11:58 +0000 (10:11 -0400)
Move all responsibility for indicating a block is exhausted into
table_scan_bitmap_next_tuple() and advance the main iterator in
heap-specific code. This flow control makes more sense and is a step
toward using the read stream API for bitmap heap scans.

Previously, table_scan_bitmap_next_block() returned false to indicate
table_scan_bitmap_next_tuple() should not be called for the tuples on
the page. This happened both when 1) there were no visible tuples on the
page and 2) when the block returned by the iterator was past the end of
the table. BitmapHeapNext() (generic bitmap table scan code) handled the
case when the bitmap was exhausted.
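
Under the old contract, the loop in BitmapHeapNext() looked roughly
like this (a condensed sketch of the pre-patch code, with prefetch
management elided; all names are as in the diff below):

    /* old flow: tbmres doubles as the flow-control variable */
    for (;;)
    {
        if (tbmres == NULL)
        {
            /* the executor advanced the iterator itself */
            tbmres = tbm_iterate(tbmiterator);
            if (tbmres == NULL)
                break;          /* bitmap exhausted */
            if (!table_scan_bitmap_next_block(scan, tbmres,
                                              &node->stats.lossy_pages,
                                              &node->stats.exact_pages))
                continue;       /* page empty or past end of table */
        }
        if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
        {
            tbmres = NULL;      /* page done; fetch the next block */
            continue;
        }
        /* recheck quals if needed, then return the tuple */
    }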

It makes more sense for table_scan_bitmap_next_tuple() to return false
when there are no visible tuples on the page and
table_scan_bitmap_next_block() to return false when the bitmap is
exhausted or there are no more blocks in the table.
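
With that contract, BitmapHeapNext() reduces to two nested loops; a
condensed sketch of the post-patch flow (prefetch management again
elided):

    /* new flow: the AM advances the main iterator internally */
    for (;;)
    {
        while (table_scan_bitmap_next_tuple(scan, slot))
        {
            /* recheck quals if needed, then return the tuple */
        }
        /* false here means the bitmap (or the table) is exhausted */
        if (!table_scan_bitmap_next_block(scan, &node->blockno,
                                          &node->recheck,
                                          &node->stats.lossy_pages,
                                          &node->stats.exact_pages))
            break;
    }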

As part of this new design, TBMIterateResults are no longer used as a
flow control mechanism in BitmapHeapNext(), so we removed
table_scan_bitmap_next_tuple's TBMIterateResult parameter.
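
The main iterator instead lives in the scan descriptor and is consumed
by the AM callback itself; abridged from heapam_scan_bitmap_next_block()
in the diff below:

    do
    {
        CHECK_FOR_INTERRUPTS();

        if (scan->st.bitmap.rs_shared_iterator)
            tbmres = tbm_shared_iterate(scan->st.bitmap.rs_shared_iterator);
        else
            tbmres = tbm_iterate(scan->st.bitmap.rs_iterator);

        if (tbmres == NULL)
            return false;       /* bitmap exhausted */

        /*
         * Skip blocks past the end of the relation, except under
         * SERIALIZABLE isolation, where every reachable tuple must be
         * examined.
         */
    } while (!IsolationIsSerializable() &&
             tbmres->blockno >= hscan->rs_nblocks);

    *blockno = tbmres->blockno;
    *recheck = tbmres->recheck;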

Note that the prefetch iterator is still saved in the
BitmapHeapScanState node and advanced in generic bitmap table scan code.
This is because 1) it was not necessary to change the prefetch iterator
location to change the flow control in BitmapHeapNext(), 2) modifying
prefetch iterator management requires several more steps better split
over multiple commits, and 3) the prefetch iterator will be removed once
the read stream API is used.

Author: Melanie Plageman
Reviewed-by: Tomas Vondra, Andres Freund, Heikki Linnakangas, Mark Dilger
Discussion: https://postgr.es/m/063e4eb4-32d9-439e-a0b1-75565a9835a8%40iki.fi

src/backend/access/heap/heapam.c
src/backend/access/heap/heapam_handler.c
src/backend/executor/nodeBitmapHeapscan.c
src/include/access/relscan.h
src/include/access/tableam.h
src/include/nodes/execnodes.h

src/backend/access/heap/heapam.c
index 4c8febdc811230734ea13d929022173ea4f83229..75ff9e7388fd8fc3fc829b636eea1dc96020bb7e 100644 (file)
@@ -1387,8 +1387,8 @@ heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
    heap_setscanlimits(sscan, startBlk, numBlks);
 
    /* Finally, set the TID range in sscan */
-   ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
-   ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
+   ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
+   ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
 }
 
 bool
@@ -1396,8 +1396,8 @@ heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
                          TupleTableSlot *slot)
 {
    HeapScanDesc scan = (HeapScanDesc) sscan;
-   ItemPointer mintid = &sscan->rs_mintid;
-   ItemPointer maxtid = &sscan->rs_maxtid;
+   ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
+   ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
 
    /* Note: no locking manipulations needed */
    for (;;)
src/backend/access/heap/heapam_handler.c
index 166aab7a93ca7418b2c3c5ad55121a31dcf01e04..a8d95e0f1c1a0a7c3f220e08103eeb870b4abbc2 100644 (file)
@@ -2115,18 +2115,49 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
 
 static bool
 heapam_scan_bitmap_next_block(TableScanDesc scan,
-                             TBMIterateResult *tbmres,
+                             BlockNumber *blockno, bool *recheck,
                              uint64 *lossy_pages, uint64 *exact_pages)
 {
    HeapScanDesc hscan = (HeapScanDesc) scan;
-   BlockNumber block = tbmres->blockno;
+   BlockNumber block;
    Buffer      buffer;
    Snapshot    snapshot;
    int         ntup;
+   TBMIterateResult *tbmres;
 
    hscan->rs_cindex = 0;
    hscan->rs_ntuples = 0;
 
+   *blockno = InvalidBlockNumber;
+   *recheck = true;
+
+   do
+   {
+       CHECK_FOR_INTERRUPTS();
+
+       if (scan->st.bitmap.rs_shared_iterator)
+           tbmres = tbm_shared_iterate(scan->st.bitmap.rs_shared_iterator);
+       else
+           tbmres = tbm_iterate(scan->st.bitmap.rs_iterator);
+
+       if (tbmres == NULL)
+           return false;
+
+       /*
+        * Ignore any claimed entries past what we think is the end of the
+        * relation. It may have been extended after the start of our scan (we
+        * only hold an AccessShareLock, and it could be inserts from this
+        * backend).  We don't take this optimization in SERIALIZABLE
+        * isolation though, as we need to examine all invisible tuples
+        * reachable by the index.
+        */
+   } while (!IsolationIsSerializable() &&
+            tbmres->blockno >= hscan->rs_nblocks);
+
+   /* Got a valid block */
+   *blockno = tbmres->blockno;
+   *recheck = tbmres->recheck;
+
    /*
     * We can skip fetching the heap page if we don't need any fields from the
     * heap, the bitmap entries don't need rechecking, and all tuples on the
@@ -2145,16 +2176,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
        return true;
    }
 
-   /*
-    * Ignore any claimed entries past what we think is the end of the
-    * relation. It may have been extended after the start of our scan (we
-    * only hold an AccessShareLock, and it could be inserts from this
-    * backend).  We don't take this optimization in SERIALIZABLE isolation
-    * though, as we need to examine all invisible tuples reachable by the
-    * index.
-    */
-   if (!IsolationIsSerializable() && block >= hscan->rs_nblocks)
-       return false;
+   block = tbmres->blockno;
 
    /*
     * Acquire pin on the target heap page, trading in any pin we held before.
@@ -2249,12 +2271,18 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
    else
        (*lossy_pages)++;
 
-   return ntup > 0;
+   /*
+    * Return true to indicate that a valid block was found and the bitmap is
+    * not exhausted. If there are no visible tuples on this page,
+    * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will
+    * return false, returning control to this function to advance to the
+    * next block in the bitmap.
+    */
+   return true;
 }
 
 static bool
 heapam_scan_bitmap_next_tuple(TableScanDesc scan,
-                             TBMIterateResult *tbmres,
                              TupleTableSlot *slot)
 {
    HeapScanDesc hscan = (HeapScanDesc) scan;
src/backend/executor/nodeBitmapHeapscan.c
index f4690a20bb1f7c33f4c2d374c0215ce0348a40ec..89a16f142b7629e6e97d6f1a3d4667863b7e2ce4 100644 (file)
@@ -51,8 +51,7 @@
 
 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
 static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
-static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
-                                               BlockNumber blockno);
+static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node);
 static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
 static inline void BitmapPrefetch(BitmapHeapScanState *node,
                                  TableScanDesc scan);
@@ -71,9 +70,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
    ExprContext *econtext;
    TableScanDesc scan;
    TIDBitmap  *tbm;
-   TBMIterator *tbmiterator = NULL;
-   TBMSharedIterator *shared_tbmiterator = NULL;
-   TBMIterateResult *tbmres;
    TupleTableSlot *slot;
    ParallelBitmapHeapState *pstate = node->pstate;
    dsa_area   *dsa = node->ss.ps.state->es_query_dsa;
@@ -85,11 +81,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
    slot = node->ss.ss_ScanTupleSlot;
    scan = node->ss.ss_currentScanDesc;
    tbm = node->tbm;
-   if (pstate == NULL)
-       tbmiterator = node->tbmiterator;
-   else
-       shared_tbmiterator = node->shared_tbmiterator;
-   tbmres = node->tbmres;
 
    /*
     * If we haven't yet performed the underlying index scan, do it, and begin
@@ -105,6 +96,9 @@ BitmapHeapNext(BitmapHeapScanState *node)
     */
    if (!node->initialized)
    {
+       TBMIterator *tbmiterator = NULL;
+       TBMSharedIterator *shared_tbmiterator = NULL;
+
        if (!pstate)
        {
            tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
@@ -113,8 +107,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
                elog(ERROR, "unrecognized result from subplan");
 
            node->tbm = tbm;
-           node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
-           node->tbmres = tbmres = NULL;
+           tbmiterator = tbm_begin_iterate(tbm);
 
 #ifdef USE_PREFETCH
            if (node->prefetch_maximum > 0)
@@ -166,9 +159,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
            }
 
            /* Allocate a private iterator and attach the shared state to it */
-           node->shared_tbmiterator = shared_tbmiterator =
+           shared_tbmiterator =
                tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
-           node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
            if (node->prefetch_maximum > 0)
@@ -207,47 +199,23 @@ BitmapHeapNext(BitmapHeapScanState *node)
            node->ss.ss_currentScanDesc = scan;
        }
 
+       scan->st.bitmap.rs_iterator = tbmiterator;
+       scan->st.bitmap.rs_shared_iterator = shared_tbmiterator;
        node->initialized = true;
+
+       goto new_page;
    }
 
    for (;;)
    {
-       CHECK_FOR_INTERRUPTS();
-
-       /*
-        * Get next page of results if needed
-        */
-       if (tbmres == NULL)
-       {
-           if (!pstate)
-               node->tbmres = tbmres = tbm_iterate(tbmiterator);
-           else
-               node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
-           if (tbmres == NULL)
-           {
-               /* no more entries in the bitmap */
-               break;
-           }
-
-           BitmapAdjustPrefetchIterator(node, tbmres->blockno);
-
-           if (!table_scan_bitmap_next_block(scan, tbmres,
-                                             &node->stats.lossy_pages,
-                                             &node->stats.exact_pages))
-           {
-               /* AM doesn't think this block is valid, skip */
-               continue;
-           }
-
-           /* Adjust the prefetch target */
-           BitmapAdjustPrefetchTarget(node);
-       }
-       else
+       while (table_scan_bitmap_next_tuple(scan, slot))
        {
            /*
             * Continuing in previously obtained page.
             */
 
+           CHECK_FOR_INTERRUPTS();
+
 #ifdef USE_PREFETCH
 
            /*
@@ -268,45 +236,64 @@ BitmapHeapNext(BitmapHeapScanState *node)
                SpinLockRelease(&pstate->mutex);
            }
 #endif                         /* USE_PREFETCH */
+
+           /*
+            * We issue prefetch requests *after* fetching the current page to
+            * try to avoid having prefetching interfere with the main I/O.
+            * Also, this should happen only when we have determined there is
+            * still something to do on the current page, else we may
+            * uselessly prefetch the same page we are just about to request
+            * for real.
+            */
+           BitmapPrefetch(node, scan);
+
+           /*
+            * If we are using lossy info, we have to recheck the qual
+            * conditions at every tuple.
+            */
+           if (node->recheck)
+           {
+               econtext->ecxt_scantuple = slot;
+               if (!ExecQualAndReset(node->bitmapqualorig, econtext))
+               {
+                   /* Fails recheck, so drop it and loop back for another */
+                   InstrCountFiltered2(node, 1);
+                   ExecClearTuple(slot);
+                   continue;
+               }
+           }
+
+           /* OK to return this tuple */
+           return slot;
        }
 
-       /*
-        * We issue prefetch requests *after* fetching the current page to try
-        * to avoid having prefetching interfere with the main I/O. Also, this
-        * should happen only when we have determined there is still something
-        * to do on the current page, else we may uselessly prefetch the same
-        * page we are just about to request for real.
-        */
-       BitmapPrefetch(node, scan);
+new_page:
+
+       BitmapAdjustPrefetchIterator(node);
 
        /*
-        * Attempt to fetch tuple from AM.
+        * Returns false if the bitmap is exhausted and there are no further
+        * blocks we need to scan.
         */
-       if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
-       {
-           /* nothing more to look at on this page */
-           node->tbmres = tbmres = NULL;
-           continue;
-       }
+       if (!table_scan_bitmap_next_block(scan, &node->blockno,
+                                         &node->recheck,
+                                         &node->stats.lossy_pages,
+                                         &node->stats.exact_pages))
+           break;
 
        /*
-        * If we are using lossy info, we have to recheck the qual conditions
-        * at every tuple.
+        * If serial, we can error out if the prefetch block doesn't stay
+        * ahead of the current block.
         */
-       if (tbmres->recheck)
-       {
-           econtext->ecxt_scantuple = slot;
-           if (!ExecQualAndReset(node->bitmapqualorig, econtext))
-           {
-               /* Fails recheck, so drop it and loop back for another */
-               InstrCountFiltered2(node, 1);
-               ExecClearTuple(slot);
-               continue;
-           }
-       }
-
-       /* OK to return this tuple */
-       return slot;
+       if (node->pstate == NULL &&
+           node->prefetch_iterator &&
+           node->prefetch_blockno < node->blockno)
+           elog(ERROR,
+                "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d",
+                node->prefetch_blockno, node->blockno);
+
+       /* Adjust the prefetch target */
+       BitmapAdjustPrefetchTarget(node);
    }
 
    /*
@@ -332,13 +319,17 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
 
 /*
  * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
+ *
+ * We keep track of how far the prefetch iterator is ahead of the main
+ * iterator in prefetch_pages. For each block the main iterator returns, we
+ * decrement prefetch_pages.
  */
 static inline void
-BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
-                            BlockNumber blockno)
+BitmapAdjustPrefetchIterator(BitmapHeapScanState *node)
 {
 #ifdef USE_PREFETCH
    ParallelBitmapHeapState *pstate = node->pstate;
+   TBMIterateResult *tbmpre;
 
    if (pstate == NULL)
    {
@@ -351,15 +342,22 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
        }
        else if (prefetch_iterator)
        {
-           /* Do not let the prefetch iterator get behind the main one */
-           TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
-
-           if (tbmpre == NULL || tbmpre->blockno != blockno)
-               elog(ERROR, "prefetch and main iterators are out of sync");
+           tbmpre = tbm_iterate(prefetch_iterator);
+           node->prefetch_blockno = tbmpre ? tbmpre->blockno :
+               InvalidBlockNumber;
        }
        return;
    }
 
+   /*
+    * XXX: There is a known issue with keeping the prefetch and current block
+    * iterators in sync for parallel bitmap table scans. This can lead to
+    * prefetching blocks that have already been read. See the discussion
+    * here:
+    * https://postgr.es/m/20240315211449.en2jcmdqxv5o6tlz%40alap3.anarazel.de
+    * Note that moving the call site of BitmapAdjustPrefetchIterator()
+    * exacerbates the effects of this bug.
+    */
    if (node->prefetch_maximum > 0)
    {
        TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
@@ -384,7 +382,11 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
             * case.
             */
            if (prefetch_iterator)
-               tbm_shared_iterate(prefetch_iterator);
+           {
+               tbmpre = tbm_shared_iterate(prefetch_iterator);
+               node->prefetch_blockno = tbmpre ? tbmpre->blockno :
+                   InvalidBlockNumber;
+           }
        }
    }
 #endif                         /* USE_PREFETCH */
@@ -462,6 +464,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
                    break;
                }
                node->prefetch_pages++;
+               node->prefetch_blockno = tbmpre->blockno;
 
                /*
                 * If we expect not to have to actually read this heap page,
@@ -519,6 +522,8 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
                    break;
                }
 
+               node->prefetch_blockno = tbmpre->blockno;
+
                /* As above, skip prefetch if we expect not to need page */
                skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) &&
                              !tbmpre->recheck &&
@@ -575,17 +580,32 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 {
    PlanState  *outerPlan = outerPlanState(node);
 
-   /* rescan to release any page pin */
-   if (node->ss.ss_currentScanDesc)
+   TableScanDesc scan = node->ss.ss_currentScanDesc;
+
+   if (scan)
+   {
+       /*
+        * End iteration on iterators saved in scan descriptor.
+        */
+       if (scan->st.bitmap.rs_shared_iterator)
+       {
+           tbm_end_shared_iterate(scan->st.bitmap.rs_shared_iterator);
+           scan->st.bitmap.rs_shared_iterator = NULL;
+       }
+
+       if (scan->st.bitmap.rs_iterator)
+       {
+           tbm_end_iterate(scan->st.bitmap.rs_iterator);
+           scan->st.bitmap.rs_iterator = NULL;
+       }
+
+       /* rescan to release any page pin */
        table_rescan(node->ss.ss_currentScanDesc, NULL);
+   }
 
    /* release bitmaps and buffers if any */
-   if (node->tbmiterator)
-       tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
        tbm_end_iterate(node->prefetch_iterator);
-   if (node->shared_tbmiterator)
-       tbm_end_shared_iterate(node->shared_tbmiterator);
    if (node->shared_prefetch_iterator)
        tbm_end_shared_iterate(node->shared_prefetch_iterator);
    if (node->tbm)
@@ -593,13 +613,13 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
    if (node->pvmbuffer != InvalidBuffer)
        ReleaseBuffer(node->pvmbuffer);
    node->tbm = NULL;
-   node->tbmiterator = NULL;
-   node->tbmres = NULL;
    node->prefetch_iterator = NULL;
    node->initialized = false;
-   node->shared_tbmiterator = NULL;
    node->shared_prefetch_iterator = NULL;
    node->pvmbuffer = InvalidBuffer;
+   node->recheck = true;
+   node->blockno = InvalidBlockNumber;
+   node->prefetch_blockno = InvalidBlockNumber;
 
    ExecScanReScan(&node->ss);
 
@@ -653,28 +673,40 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
     */
    ExecEndNode(outerPlanState(node));
 
+   if (scanDesc)
+   {
+       /*
+        * End iteration on iterators saved in scan descriptor.
+        */
+       if (scanDesc->st.bitmap.rs_shared_iterator)
+       {
+           tbm_end_shared_iterate(scanDesc->st.bitmap.rs_shared_iterator);
+           scanDesc->st.bitmap.rs_shared_iterator = NULL;
+       }
+
+       if (scanDesc->st.bitmap.rs_iterator)
+       {
+           tbm_end_iterate(scanDesc->st.bitmap.rs_iterator);
+           scanDesc->st.bitmap.rs_iterator = NULL;
+       }
+
+       /*
+        * close table scan
+        */
+       table_endscan(scanDesc);
+   }
+
    /*
     * release bitmaps and buffers if any
     */
-   if (node->tbmiterator)
-       tbm_end_iterate(node->tbmiterator);
    if (node->prefetch_iterator)
        tbm_end_iterate(node->prefetch_iterator);
    if (node->tbm)
        tbm_free(node->tbm);
-   if (node->shared_tbmiterator)
-       tbm_end_shared_iterate(node->shared_tbmiterator);
    if (node->shared_prefetch_iterator)
        tbm_end_shared_iterate(node->shared_prefetch_iterator);
    if (node->pvmbuffer != InvalidBuffer)
        ReleaseBuffer(node->pvmbuffer);
-
-   /*
-    * close heap scan
-    */
-   if (scanDesc)
-       table_endscan(scanDesc);
-
 }
 
 /* ----------------------------------------------------------------
@@ -707,8 +739,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
    scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
 
    scanstate->tbm = NULL;
-   scanstate->tbmiterator = NULL;
-   scanstate->tbmres = NULL;
    scanstate->pvmbuffer = InvalidBuffer;
 
    /* Zero the statistics counters */
@@ -718,9 +748,11 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
    scanstate->prefetch_pages = 0;
    scanstate->prefetch_target = 0;
    scanstate->initialized = false;
-   scanstate->shared_tbmiterator = NULL;
    scanstate->shared_prefetch_iterator = NULL;
    scanstate->pstate = NULL;
+   scanstate->recheck = true;
+   scanstate->blockno = InvalidBlockNumber;
+   scanstate->prefetch_blockno = InvalidBlockNumber;
 
    /*
     * Miscellaneous initialization
src/include/access/relscan.h
index 114a85dc47cb73c64d8b6670879b627ca359c7ef..e1884acf493cc4ce3041e3b2527d38d6ca87add4 100644 (file)
@@ -25,6 +25,9 @@
 
 struct ParallelTableScanDescData;
 
+struct TBMIterator;
+struct TBMSharedIterator;
+
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
  * which needs to be embedded in the scans of individual AMs.
@@ -37,9 +40,28 @@ typedef struct TableScanDescData
    int         rs_nkeys;       /* number of scan keys */
    struct ScanKeyData *rs_key; /* array of scan key descriptors */
 
-   /* Range of ItemPointers for table_scan_getnextslot_tidrange() to scan. */
-   ItemPointerData rs_mintid;
-   ItemPointerData rs_maxtid;
+   /*
+    * Scan type-specific members
+    */
+   union
+   {
+       /* Iterators for Bitmap Table Scans */
+       struct
+       {
+           struct TBMIterator *rs_iterator;
+           struct TBMSharedIterator *rs_shared_iterator;
+       }           bitmap;
+
+       /*
+        * Range of ItemPointers for table_scan_getnextslot_tidrange() to
+        * scan.
+        */
+       struct
+       {
+           ItemPointerData rs_mintid;
+           ItemPointerData rs_maxtid;
+       }           tidrange;
+   }           st;
 
    /*
     * Information about type and behaviour of the scan, a bitmask of members
src/include/access/tableam.h
index be09d180d45b98f6b888fb174c33bd66227f6e96..adb478a93ca4c28b23643ec0149e136cced86f61 100644 (file)
@@ -36,7 +36,6 @@ extern PGDLLIMPORT bool synchronize_seqscans;
 struct BulkInsertStateData;
 struct IndexInfo;
 struct SampleScanState;
-struct TBMIterateResult;
 struct VacuumParams;
 struct ValidateIndexState;
 
@@ -780,26 +779,29 @@ typedef struct TableAmRoutine
     */
 
    /*
-    * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
-    * of a bitmap table scan. `scan` was started via table_beginscan_bm().
-    * Return false if there are no tuples to be found on the page, true
-    * otherwise.
+    * Prepare to fetch / check / return tuples from `blockno` as part of a
+    * bitmap table scan. `scan` was started via table_beginscan_bm(). Return
+    * false if the bitmap is exhausted and true otherwise.
     *
     * This will typically read and pin the target block, and do the necessary
     * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
-    * make sense to perform tuple visibility checks at this time). For some
-    * AMs it will make more sense to do all the work referencing `tbmres`
-    * contents here, for others it might be better to defer more work to
-    * scan_bitmap_next_tuple.
-    *
-    * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples
-    * on the page have to be returned, otherwise the tuples at offsets in
-    * `tbmres->offsets` need to be returned.
+    * make sense to perform tuple visibility checks at this time).
     *
     * `lossy_pages` and `exact_pages` are EXPLAIN counters that can be
     * incremented by the table AM to indicate whether or not the block's
     * representation in the bitmap is lossy.
     *
+    * `recheck` is set by the table AM to indicate whether or not the tuples
+    * from this block should be rechecked. Tuples from lossy pages will
+    * always need to be rechecked, but some non-lossy pages' tuples may also
+    * require recheck.
+    *
+    * `blockno` is the current block and is set by the table AM. The table AM
+    * is responsible for advancing the main iterator, but the bitmap table
+    * scan code still advances the prefetch iterator. `blockno` is used by
+    * bitmap table scan code to validate that the prefetch block stays ahead
+    * of the current block.
+    *
     * XXX: Currently this may only be implemented if the AM uses md.c as its
     * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
     * blockids directly to the underlying storage. nodeBitmapHeapscan.c
@@ -815,7 +817,8 @@ typedef struct TableAmRoutine
     * scan_bitmap_next_tuple need to exist, or neither.
     */
    bool        (*scan_bitmap_next_block) (TableScanDesc scan,
-                                          struct TBMIterateResult *tbmres,
+                                          BlockNumber *blockno,
+                                          bool *recheck,
                                           uint64 *lossy_pages,
                                           uint64 *exact_pages);
 
@@ -823,15 +826,10 @@ typedef struct TableAmRoutine
     * Fetch the next tuple of a bitmap table scan into `slot` and return true
     * if a visible tuple was found, false otherwise.
     *
-    * For some AMs it will make more sense to do all the work referencing
-    * `tbmres` contents in scan_bitmap_next_block, for others it might be
-    * better to defer more work to this callback.
-    *
     * Optional callback, but either both scan_bitmap_next_block and
     * scan_bitmap_next_tuple need to exist, or neither.
     */
    bool        (*scan_bitmap_next_tuple) (TableScanDesc scan,
-                                          struct TBMIterateResult *tbmres,
                                           TupleTableSlot *slot);
 
    /*
@@ -959,12 +957,17 @@ static inline TableScanDesc
 table_beginscan_bm(Relation rel, Snapshot snapshot,
                   int nkeys, struct ScanKeyData *key, bool need_tuple)
 {
+   TableScanDesc result;
    uint32      flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE;
 
    if (need_tuple)
        flags |= SO_NEED_TUPLES;
 
-   return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+   result = rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key,
+                                        NULL, flags);
+   result->st.bitmap.rs_shared_iterator = NULL;
+   result->st.bitmap.rs_iterator = NULL;
+   return result;
 }
 
 /*
@@ -1955,21 +1958,28 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
  */
 
 /*
- * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
- * a bitmap table scan. `scan` needs to have been started via
- * table_beginscan_bm(). Returns false if there are no tuples to be found on
- * the page, true otherwise.
+ * Prepare to fetch / check / return tuples as part of a bitmap table scan.
+ * `scan` needs to have been started via table_beginscan_bm(). Returns false
+ * if there are no more blocks in the bitmap, true otherwise.
  *
  * `lossy_pages` and `exact_pages` are EXPLAIN counters that can be
  * incremented by the table AM to indicate whether or not the block's
  * representation in the bitmap is lossy.
  *
+ * `recheck` is set by the table AM to indicate whether or not the tuples
+ * from this block should be rechecked.
+ *
+ * `blockno` is set by the table AM to the current block number and is used
+ * by bitmap table scan code to validate that the prefetch block stays ahead
+ * of the current block.
+ *
  * Note, this is an optionally implemented function, therefore should only be
  * used after verifying the presence (at plan time or such).
  */
 static inline bool
 table_scan_bitmap_next_block(TableScanDesc scan,
-                            struct TBMIterateResult *tbmres,
+                            BlockNumber *blockno,
+                            bool *recheck,
                             uint64 *lossy_pages,
                             uint64 *exact_pages)
 {
@@ -1982,7 +1992,7 @@ table_scan_bitmap_next_block(TableScanDesc scan,
        elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding");
 
    return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
-                                                          tbmres,
+                                                          blockno, recheck,
                                                           lossy_pages,
                                                           exact_pages);
 }
@@ -1997,7 +2007,6 @@ table_scan_bitmap_next_block(TableScanDesc scan,
  */
 static inline bool
 table_scan_bitmap_next_tuple(TableScanDesc scan,
-                            struct TBMIterateResult *tbmres,
                             TupleTableSlot *slot)
 {
    /*
@@ -2009,7 +2018,6 @@ table_scan_bitmap_next_tuple(TableScanDesc scan,
        elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding");
 
    return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
-                                                          tbmres,
                                                           slot);
 }
 
src/include/nodes/execnodes.h
index e4698a28c4f0c03b9a4262a781225942d2e51afd..b67d5186a2d2a5eb88b908b9e6aaac8dfb81deb0 100644 (file)
@@ -1833,8 +1833,6 @@ typedef struct SharedBitmapHeapInstrumentation
  *
  *     bitmapqualorig     execution state for bitmapqualorig expressions
  *     tbm                bitmap obtained from child index scan(s)
- *     tbmiterator        iterator for scanning current pages
- *     tbmres             current-page data
  *     pvmbuffer          buffer for visibility-map lookups of prefetched pages
  *     stats              execution statistics
  *     prefetch_iterator  iterator for prefetching ahead of current page
@@ -1842,10 +1840,12 @@ typedef struct SharedBitmapHeapInstrumentation
  *     prefetch_target    current target prefetch distance
  *     prefetch_maximum   maximum value for prefetch_target
 *     initialized        is node ready to iterate
- *     shared_tbmiterator     shared iterator
  *     shared_prefetch_iterator shared iterator for prefetching
  *     pstate             shared state for parallel bitmap scan
  *     sinstrument        statistics for parallel workers
+ *     recheck            do current page's tuples need recheck
+ *     blockno            used to validate pf and current block stay in sync
+ *     prefetch_blockno   used to validate pf stays ahead of current block
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1853,8 +1853,6 @@ typedef struct BitmapHeapScanState
    ScanState   ss;             /* its first field is NodeTag */
    ExprState  *bitmapqualorig;
    TIDBitmap  *tbm;
-   TBMIterator *tbmiterator;
-   TBMIterateResult *tbmres;
    Buffer      pvmbuffer;
    BitmapHeapScanInstrumentation stats;
    TBMIterator *prefetch_iterator;
@@ -1862,10 +1860,12 @@ typedef struct BitmapHeapScanState
    int         prefetch_target;
    int         prefetch_maximum;
    bool        initialized;
-   TBMSharedIterator *shared_tbmiterator;
    TBMSharedIterator *shared_prefetch_iterator;
    ParallelBitmapHeapState *pstate;
    SharedBitmapHeapInstrumentation *sinstrument;
+   bool        recheck;
+   BlockNumber blockno;
+   BlockNumber prefetch_blockno;
 } BitmapHeapScanState;
 
 /* ----------------