TransactionId xid = GetCurrentTransactionId();
HeapTuple heaptup;
Buffer buffer;
+ Buffer vmbuffer = InvalidBuffer;
bool all_visible_cleared = false;
if (relation->rd_rel->relhasoids)
else
heaptup = tup;
- /* Find buffer to insert this tuple into */
+ /*
+ * Find buffer to insert this tuple into. If the page is all visible,
+ * this will also pin the requisite visibility map page.
+ */
buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
- InvalidBuffer, options, bistate);
+ InvalidBuffer, options, bistate,
+ &vmbuffer);
/*
* We're about to do the actual insert -- check for conflict at the
{
all_visible_cleared = true;
PageClearAllVisible(BufferGetPage(buffer));
+ visibilitymap_clear(relation,
+ ItemPointerGetBlockNumber(&(heaptup->t_self)),
+ vmbuffer);
}
/*
END_CRIT_SECTION();
UnlockReleaseBuffer(buffer);
-
- /* Clear the bit in the visibility map if necessary */
- if (all_visible_cleared)
- visibilitymap_clear(relation,
- ItemPointerGetBlockNumber(&(heaptup->t_self)));
+ if (vmbuffer != InvalidBuffer)
+ ReleaseBuffer(vmbuffer);
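Because the hunks above are heavily trimmed, the overall calling contract can be hard to see. The condensed sketch below is an editor's illustration, not part of the patch; the function name is made up and the elided steps are marked. It shows the order of operations heap_insert follows once RelationGetBufferForTuple takes the vmbuffer out-parameter.

#include "access/heapam.h"
#include "access/hio.h"
#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* Condensed sketch of heap_insert's new visibility-map handling. */
static void
insert_with_vm_protocol(Relation relation, HeapTuple heaptup, int options,
                        BulkInsertState bistate)
{
    Buffer      buffer;
    Buffer      vmbuffer = InvalidBuffer;

    /*
     * Returns the heap buffer exclusive-locked; if the chosen page was
     * all-visible, the matching visibility map page is already pinned into
     * vmbuffer, so no I/O is needed while the lock is held.
     */
    buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
                                       InvalidBuffer, options, bistate,
                                       &vmbuffer);

    /* ... start a critical section, put the tuple on the page ... */

    if (PageIsAllVisible(BufferGetPage(buffer)))
    {
        /* Clear the page flag and the VM bit while the page is still locked. */
        PageClearAllVisible(BufferGetPage(buffer));
        visibilitymap_clear(relation,
                            ItemPointerGetBlockNumber(&(heaptup->t_self)),
                            vmbuffer);
    }

    /* ... WAL-log the insert, end the critical section ... */

    UnlockReleaseBuffer(buffer);
    if (vmbuffer != InvalidBuffer)
        ReleaseBuffer(vmbuffer);
}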
/*
* If tuple is cachable, mark it for invalidation from the caches in case
ItemId lp;
HeapTupleData tp;
Page page;
+ BlockNumber block;
Buffer buffer;
+ Buffer vmbuffer = InvalidBuffer;
bool have_tuple_lock = false;
bool iscombo;
bool all_visible_cleared = false;
Assert(ItemPointerIsValid(tid));
- buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+ block = ItemPointerGetBlockNumber(tid);
+ buffer = ReadBuffer(relation, block);
+ page = BufferGetPage(buffer);
+
+ /*
+ * Before locking the buffer, pin the visibility map page if it appears
+ * to be necessary. Since we haven't got the lock yet, someone else might
+ * be in the middle of changing this, so we'll need to recheck after
+ * we have the lock.
+ */
+ if (PageIsAllVisible(page))
+ visibilitymap_pin(relation, block, &vmbuffer);
+
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
- page = BufferGetPage(buffer);
+ /*
+ * If we didn't pin the visibility map page and the page has become all
+ * visible while we were busy locking the buffer, we'll have to unlock and
+ * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
+ * unfortunate, but hopefully shouldn't happen often.
+ */
+ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ visibilitymap_pin(relation, block, &vmbuffer);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ }
+
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
Assert(ItemIdIsNormal(lp));
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+ if (vmbuffer != InvalidBuffer)
+ ReleaseBuffer(vmbuffer);
return result;
}
{
all_visible_cleared = true;
PageClearAllVisible(page);
+ visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
+ vmbuffer);
}
/* store transaction information of xact deleting the tuple */
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ if (vmbuffer != InvalidBuffer)
+ ReleaseBuffer(vmbuffer);
+
/*
* If the tuple has toasted out-of-line attributes, we need to delete
* those items too. We have to do this before releasing the buffer
*/
CacheInvalidateHeapTuple(relation, &tp);
- /* Clear the bit in the visibility map if necessary */
- if (all_visible_cleared)
- visibilitymap_clear(relation, BufferGetBlockNumber(buffer));
-
/* Now we can release the buffer */
ReleaseBuffer(buffer);
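heap_delete and heap_update now share the same pin-then-recheck dance described in the comments above. As a summary, here is a hypothetical helper capturing that pattern; the patch itself open-codes it in both functions rather than adding such a helper.

#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Hypothetical helper capturing the pattern: pin the visibility map page
 * before taking the buffer lock, then recheck once the lock is held, since
 * the speculative check could have raced with someone setting the flag.
 */
static void
lock_heap_buffer_with_vm_pin(Relation relation, Buffer buffer,
                             BlockNumber block, Buffer *vmbuffer)
{
    Page        page = BufferGetPage(buffer);

    /* Speculative, unlocked check; may be stale. */
    if (PageIsAllVisible(page))
        visibilitymap_pin(relation, block, vmbuffer);

    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

    /*
     * If the page became all-visible while we were acquiring the lock, we
     * must not call visibilitymap_pin (which can do I/O) while holding it;
     * drop the lock, pin, and re-lock instead.
     */
    if (*vmbuffer == InvalidBuffer && PageIsAllVisible(page))
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        visibilitymap_pin(relation, block, vmbuffer);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    }
}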
HeapTupleData oldtup;
HeapTuple heaptup;
Page page;
+ BlockNumber block;
Buffer buffer,
- newbuf;
+ newbuf,
+ vmbuffer = InvalidBuffer,
+ vmbuffer_new = InvalidBuffer;
bool need_toast,
already_marked;
Size newtupsize,
*/
hot_attrs = RelationGetIndexAttrBitmap(relation);
- buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
+ block = ItemPointerGetBlockNumber(otid);
+ buffer = ReadBuffer(relation, block);
+ page = BufferGetPage(buffer);
+
+ /*
+ * Before locking the buffer, pin the visibility map page if it appears
+ * to be necessary. Since we haven't got the lock yet, someone else might
+ * be in the middle of changing this, so we'll need to recheck after
+ * we have the lock.
+ */
+ if (PageIsAllVisible(page))
+ visibilitymap_pin(relation, block, &vmbuffer);
+
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
- page = BufferGetPage(buffer);
+ /*
+ * If we didn't pin the visibility map page and the page has become all
+ * visible while we were busy locking the buffer, we'll have to unlock and
+ * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
+ * unfortunate, but hopefully shouldn't happen often.
+ */
+ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ visibilitymap_pin(relation, block, &vmbuffer);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ }
+
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
Assert(ItemIdIsNormal(lp));
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ if (vmbuffer != InvalidBuffer)
+ ReleaseBuffer(vmbuffer);
bms_free(hot_attrs);
return result;
}
{
/* Assume there's no chance to put heaptup on same page. */
newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
- buffer, 0, NULL);
+ buffer, 0, NULL,
+ &vmbuffer_new);
}
else
{
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
- buffer, 0, NULL);
+ buffer, 0, NULL,
+ &vmbuffer_new);
}
else
{
/* Clear bits in visibility map */
if (all_visible_cleared)
- visibilitymap_clear(relation, BufferGetBlockNumber(buffer));
+ visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
+ vmbuffer);
if (all_visible_cleared_new)
- visibilitymap_clear(relation, BufferGetBlockNumber(newbuf));
+ visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
+ vmbuffer_new);
/* Now we can release the buffer(s) */
if (newbuf != buffer)
ReleaseBuffer(newbuf);
ReleaseBuffer(buffer);
+ if (BufferIsValid(vmbuffer_new))
+ ReleaseBuffer(vmbuffer_new);
+ if (BufferIsValid(vmbuffer))
+ ReleaseBuffer(vmbuffer);
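/*
 * Editor's note (not part of the patch): heap_update may touch two heap
 * pages, so it tracks two visibility map buffers.  vmbuffer covers the old
 * tuple's page and is pinned before that page is locked; vmbuffer_new is
 * pinned inside RelationGetBufferForTuple only when the page chosen for the
 * new tuple is currently all-visible.  If the new tuple stays on the original
 * page, that call is never made and vmbuffer_new remains InvalidBuffer.
 * Whichever of the two is valid at the end is released just above.
 */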
/*
* If new tuple is cachable, mark it for invalidation from the caches in
return recptr;
}
+/*
+ * Perform XLogInsert for a heap-visible operation. 'block' is the block
+ * being marked all-visible, and vm_buffer is the buffer containing the
+ * corresponding visibility map block. Both should have already been modified
+ * and dirtied.
+ */
+XLogRecPtr
+log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer)
+{
+ xl_heap_visible xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[2];
+
+ xlrec.node = rnode;
+ xlrec.block = block;
+
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfHeapVisible;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+
+ rdata[1].data = NULL;
+ rdata[1].len = 0;
+ rdata[1].buffer = vm_buffer;
+ rdata[1].buffer_std = false;
+ rdata[1].next = NULL;
+
+ recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata);
+
+ return recptr;
+}
+
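For context, the sketch below is an editor's illustration of how a caller satisfies log_heap_visible's contract; it condenses what visibilitymap_set does later in this patch: modify and dirty the map buffer inside a critical section, then emit the record and stamp the returned LSN on the map page.

#include "access/heapam.h"
#include "access/xlog.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Illustration of the caller contract for log_heap_visible. */
static void
set_bit_and_log(Relation rel, BlockNumber heapBlk, Buffer vmbuf,
                char *map, int mapByte, int mapBit)
{
    Page        page = BufferGetPage(vmbuf);

    START_CRIT_SECTION();

    map[mapByte] |= (1 << mapBit);      /* modify the map page ... */
    MarkBufferDirty(vmbuf);             /* ... and dirty it first */

    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr = log_heap_visible(rel->rd_node, heapBlk, vmbuf);

        /* Keep the map page from reaching disk ahead of its WAL record. */
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }

    END_CRIT_SECTION();
}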
/*
* Perform XLogInsert for a heap-update operation. Caller must already
* have modified the buffer(s) and marked them dirty.
UnlockReleaseBuffer(buffer);
}
+/*
+ * Replay XLOG_HEAP2_VISIBLE record.
+ *
+ * The critical integrity requirement here is that we must never end up with
+ * a situation where the visibility map bit is set, and the page-level
+ * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
+ * page modification would fail to clear the visibility map bit.
+ */
+static void
+heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * Read the heap page, if it still exists. If the heap file has been
+ * dropped or truncated later in recovery, this might fail. In that case,
+ * there's no point in doing anything further, since the visibility map
+ * will have to be cleared out at the same time.
+ */
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block,
+ RBM_NORMAL);
+ if (!BufferIsValid(buffer))
+ return;
+ page = (Page) BufferGetPage(buffer);
+
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * We don't bump the LSN of the heap page when setting the visibility
+ * map bit, because that would generate an unworkable volume of
+ * full-page writes. This exposes us to torn page hazards, but since
+ * we're not inspecting the existing page contents in any way, we
+ * don't care.
+ *
+ * However, all operations that clear the visibility map bit *do* bump
+ * the LSN, and those operations will only be replayed if the XLOG LSN
+ * follows the page LSN. Thus, if the page LSN has advanced past our
+ * XLOG record's LSN, we mustn't mark the page all-visible, because
+ * the subsequent update won't be replayed to clear the flag.
+ */
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ PageSetAllVisible(page);
+ MarkBufferDirty(buffer);
+ }
+
+ /* Done with heap page. */
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Even if we skipped the heap page update due to the LSN interlock, it's
+ * still safe to update the visibility map. Any WAL record that clears
+ * the visibility map bit does so before checking the page LSN, so any
+ * bits that need to be cleared will still be cleared.
+ */
+ if (record->xl_info & XLR_BKP_BLOCK_1)
+ RestoreBkpBlocks(lsn, record, false);
+ else
+ {
+ Relation reln;
+ Buffer vmbuffer = InvalidBuffer;
+
+ reln = CreateFakeRelcacheEntry(xlrec->node);
+ visibilitymap_pin(reln, xlrec->block, &vmbuffer);
+
+ /*
+ * Don't set the bit if replay has already passed this point.
+ *
+ * It might be safe to do this unconditionally; if replay has passed
+ * this point, we'll replay at least as far this time as we did before,
+ * and if this bit needs to be cleared, the record responsible for
+ * doing so will be replayed again and will clear it. For now, out of
+ * an abundance of conservatism, we use the same test here that we did
+ * for the heap page; if this results in a dropped bit, no real harm is
+ * done, and the next VACUUM will fix it.
+ */
+ if (!XLByteLE(lsn, PageGetLSN(BufferGetPage(vmbuffer))))
+ visibilitymap_set(reln, xlrec->block, lsn, vmbuffer);
+
+ ReleaseBuffer(vmbuffer);
+ FreeFakeRelcacheEntry(reln);
+ }
+}
+
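To make the interlock reasoning above concrete, here is a worked scenario (editor's illustration with made-up LSNs, not part of the patch):

/*
 * Worked example of the LSN interlock:
 *
 * 1. VACUUM marks a heap page all-visible; XLOG_HEAP2_VISIBLE is logged at
 *    LSN 0/3000 and replayed on the standby.
 * 2. A DELETE on the same page is logged at LSN 0/5000; its replay clears
 *    PD_ALL_VISIBLE on the heap page (LSN-guarded) and clears the VM bit
 *    unconditionally, via the fake-relcache path shown further down.
 * 3. The heap page reaches disk with LSN 0/5000 and the flag clear, the VM
 *    page does not, and recovery then restarts from an earlier restartpoint,
 *    so both records are replayed again.
 * 4. Re-replaying XLOG_HEAP2_VISIBLE: XLByteLE(0/3000, 0/5000) is true, so
 *    PageSetAllVisible() is skipped.  That is required: the DELETE's
 *    page-level clear will also be skipped by its own LSN check, so a flag
 *    set here would never be cleared, breaking the invariant stated in the
 *    function comment.
 * 5. Setting the VM bit is still harmless: the DELETE's VM clear replays
 *    unconditionally and removes it again.
 */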
static void
heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
{
if (xlrec->all_visible_cleared)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
+ Buffer vmbuffer = InvalidBuffer;
- visibilitymap_clear(reln, blkno);
+ visibilitymap_pin(reln, blkno, &vmbuffer);
+ visibilitymap_clear(reln, blkno, vmbuffer);
+ ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
if (xlrec->all_visible_cleared)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
+ Buffer vmbuffer = InvalidBuffer;
- visibilitymap_clear(reln, blkno);
+ visibilitymap_pin(reln, blkno, &vmbuffer);
+ visibilitymap_clear(reln, blkno, vmbuffer);
+ ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
if (xlrec->all_visible_cleared)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
+ BlockNumber block = ItemPointerGetBlockNumber(&xlrec->target.tid);
+ Buffer vmbuffer = InvalidBuffer;
- visibilitymap_clear(reln,
- ItemPointerGetBlockNumber(&xlrec->target.tid));
+ visibilitymap_pin(reln, block, &vmbuffer);
+ visibilitymap_clear(reln, block, vmbuffer);
+ ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
if (xlrec->new_all_visible_cleared)
{
Relation reln = CreateFakeRelcacheEntry(xlrec->target.node);
+ BlockNumber block = ItemPointerGetBlockNumber(&xlrec->newtid);
+ Buffer vmbuffer = InvalidBuffer;
- visibilitymap_clear(reln, ItemPointerGetBlockNumber(&xlrec->newtid));
+ visibilitymap_pin(reln, block, &vmbuffer);
+ visibilitymap_clear(reln, block, vmbuffer);
+ ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
case XLOG_HEAP2_CLEANUP_INFO:
heap_xlog_cleanup_info(lsn, record);
break;
+ case XLOG_HEAP2_VISIBLE:
+ heap_xlog_visible(lsn, record);
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
appendStringInfo(buf, "cleanup info: remxid %u",
xlrec->latestRemovedXid);
}
+ else if (info == XLOG_HEAP2_VISIBLE)
+ {
+ xl_heap_visible *xlrec = (xl_heap_visible *) rec;
+
+ appendStringInfo(buf, "visible: rel %u/%u/%u; blk %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->block);
+ }
else
appendStringInfo(buf, "UNKNOWN");
}
#include "access/heapam.h"
#include "access/hio.h"
+#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
Buffer otherBuffer, int options,
- struct BulkInsertStateData * bistate)
+ struct BulkInsertStateData * bistate,
+ Buffer *vmbuffer)
{
bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
Buffer buffer = InvalidBuffer;
* Read and exclusive-lock the target block, as well as the other
* block if one was given, taking suitable care with lock ordering and
* the possibility they are the same block.
+ *
+ * If the page-level all-visible flag is set, the caller will need to
+ * clear both that and the corresponding visibility map bit. However,
+ * by the time we return, we'll have x-locked the buffer, and we don't
+ * want to do any I/O while in that state. So we check the flag here
+ * before taking the lock, and pin the visibility map page if it
+ * appears necessary. Checking without the lock creates a risk of
+ * getting the wrong answer, so we'll have to recheck after acquiring
+ * the lock.
*/
if (otherBuffer == InvalidBuffer)
{
/* easy case */
buffer = ReadBufferBI(relation, targetBlock, bistate);
+ if (PageIsAllVisible(BufferGetPage(buffer)))
+ visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else if (otherBlock == targetBlock)
{
/* also easy case */
buffer = otherBuffer;
+ if (PageIsAllVisible(BufferGetPage(buffer)))
+ visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else if (otherBlock < targetBlock)
{
/* lock other buffer first */
buffer = ReadBuffer(relation, targetBlock);
+ if (PageIsAllVisible(BufferGetPage(buffer)))
+ visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
{
/* lock target buffer first */
buffer = ReadBuffer(relation, targetBlock);
+ if (PageIsAllVisible(BufferGetPage(buffer)))
+ visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
}
+ /*
+ * If the page is all visible but we don't have the right visibility
+ * map page pinned, then give up our locks, go get the pin, and
+ * re-lock. This is pretty painful, but hopefully shouldn't happen
+ * often. Note that there's a small possibility that we didn't pin
+ * the page above but still have the correct page pinned anyway, either
+ * because we've already made a previous pass through this loop, or
+ * because the caller passed us the right page to begin with.
+ *
+ * Note also that it's possible that by the time we get the pin and
+ * retake the buffer locks, the visibility map bit will have been
+ * cleared by some other backend anyway. In that case, we'll have done
+ * a bit of extra work for no gain, but there's no real harm done.
+ */
+ if (PageIsAllVisible(BufferGetPage(buffer))
+ && !visibilitymap_pin_ok(targetBlock, *vmbuffer))
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ if (otherBuffer != InvalidBuffer && otherBlock != targetBlock)
+ LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+ visibilitymap_pin(relation, targetBlock, vmbuffer);
+ if (otherBuffer != InvalidBuffer && otherBlock < targetBlock)
+ LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (otherBuffer != InvalidBuffer && otherBlock > targetBlock)
+ LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+ }
+
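/*
 * Editor's note (not part of the patch): the re-lock sequence above preserves
 * the rule that buffer content locks are taken in block-number order, which
 * is what prevents deadlock against another backend doing the same dance.
 * For example, with otherBlock = 3 and targetBlock = 7 the locks are
 * reacquired as
 *
 *     LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);    -- block 3 first
 *     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);         -- then block 7
 *
 * whereas with otherBlock = 9 the target block (7) is locked first and
 * otherBuffer afterwards, matching the order used when the locks were
 * originally taken.
 */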
/*
* Now we can check to see if there's enough free space here. If so,
* we're done.
* src/backend/access/heap/visibilitymap.c
*
* INTERFACE ROUTINES
- * visibilitymap_clear - clear a bit in the visibility map
- * visibilitymap_pin - pin a map page for setting a bit
- * visibilitymap_set - set a bit in a previously pinned page
- * visibilitymap_test - test if a bit is set
+ * visibilitymap_clear - clear a bit in the visibility map
+ * visibilitymap_pin - pin a map page for setting a bit
+ * visibilitymap_pin_ok - check whether correct map page is already pinned
+ * visibilitymap_set - set a bit in a previously pinned page
+ * visibilitymap_test - test if a bit is set
*
* NOTES
*
* It would be nice to use the visibility map to skip visibility checks in
* index scans.
*
- * Currently, the visibility map is not 100% correct all the time.
- * During updates, the bit in the visibility map is cleared after releasing
- * the lock on the heap page. During the window between releasing the lock
- * and clearing the bit in the visibility map, the bit in the visibility map
- * is set, but the new insertion or deletion is not yet visible to other
- * backends.
- *
- * That might actually be OK for the index scans, though. The newly inserted
- * tuple wouldn't have an index pointer yet, so all tuples reachable from an
- * index would still be visible to all other backends, and deletions wouldn't
- * be visible to other backends yet. (But HOT breaks that argument, no?)
- *
- * There's another hole in the way the PD_ALL_VISIBLE flag is set. When
- * vacuum observes that all tuples are visible to all, it sets the flag on
- * the heap page, and also sets the bit in the visibility map. If we then
- * crash, and only the visibility map page was flushed to disk, we'll have
- * a bit set in the visibility map, but the corresponding flag on the heap
- * page is not set. If the heap page is then updated, the updater won't
- * know to clear the bit in the visibility map. (Isn't that prevented by
- * the LSN interlock?)
- *
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+#include "access/heapam.h"
#include "access/visibilitymap.h"
+#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/lmgr.h"
/*
* visibilitymap_clear - clear a bit in visibility map
*
- * Clear a bit in the visibility map, marking that not all tuples are
- * visible to all transactions anymore.
+ * You must pass a buffer containing the correct map page to this function.
+ * Call visibilitymap_pin first to pin the right one. This function doesn't do
+ * any I/O.
*/
void
-visibilitymap_clear(Relation rel, BlockNumber heapBlk)
+visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
int mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
uint8 mask = 1 << mapBit;
- Buffer mapBuffer;
char *map;
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif
- mapBuffer = vm_readbuf(rel, mapBlock, false);
- if (!BufferIsValid(mapBuffer))
- return; /* nothing to do */
+ if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
+ elog(ERROR, "wrong buffer passed to visibilitymap_clear");
- LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
- map = PageGetContents(BufferGetPage(mapBuffer));
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ map = PageGetContents(BufferGetPage(buf));
if (map[mapByte] & mask)
{
map[mapByte] &= ~mask;
- MarkBufferDirty(mapBuffer);
+ MarkBufferDirty(buf);
}
- UnlockReleaseBuffer(mapBuffer);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
*buf = vm_readbuf(rel, mapBlock, true);
}
+/*
+ * visibilitymap_pin_ok - do we already have the correct page pinned?
+ *
+ * On entry, buf should be InvalidBuffer or a valid buffer returned by
+ * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * relation. The return value indicates whether the buffer covers the
+ * given heapBlk.
+ */
+bool
+visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
+{
+ BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+
+ return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
+}
+
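The sketch below is an editor's illustration (not from the patch) of how visibilitymap_pin_ok is meant to be used, condensing the loop in RelationGetBufferForTuple above: a pin check that is safe while the buffer lock is held, with any actual pinning pushed outside the lock.

#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/*
 * Illustration: verify (without I/O) that the pinned map page still covers
 * the block we settled on, and only drop the content lock to re-pin when it
 * does not.
 */
static void
revalidate_vm_pin(Relation rel, Buffer heapBuf, BlockNumber targetBlock,
                  Buffer *vmbuffer)
{
    if (PageIsAllVisible(BufferGetPage(heapBuf)) &&
        !visibilitymap_pin_ok(targetBlock, *vmbuffer))
    {
        /* visibilitymap_pin may read a map page, so shed the lock first. */
        LockBuffer(heapBuf, BUFFER_LOCK_UNLOCK);
        visibilitymap_pin(rel, targetBlock, vmbuffer);
        LockBuffer(heapBuf, BUFFER_LOCK_EXCLUSIVE);
    }
}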
/*
* visibilitymap_set - set a bit on a previously pinned page
*
- * recptr is the LSN of the heap page. The LSN of the visibility map page is
- * advanced to that, to make sure that the visibility map doesn't get flushed
- * to disk before the update to the heap page that made all tuples visible.
+ * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
+ * or InvalidXLogRecPtr in normal running. In recovery, the page LSN is
+ * advanced to the LSN provided; in normal running, we generate a new XLOG
+ * record here and set the page LSN to that value.
*
- * This is an opportunistic function. It does nothing, unless *buf
- * contains the bit for heapBlk. Call visibilitymap_pin first to pin
- * the right map page. This function doesn't do any I/O.
+ * You must pass a buffer containing the correct map page to this function.
+ * Call visibilitymap_pin first to pin the right one. This function doesn't do
+ * any I/O.
*/
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
- Buffer *buf)
+ Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif
+ Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
+
/* Check that we have the right page pinned */
- if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock)
- return;
+ if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
+ elog(ERROR, "wrong buffer passed to visibilitymap_set");
- page = BufferGetPage(*buf);
+ page = BufferGetPage(buf);
map = PageGetContents(page);
- LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
if (!(map[mapByte] & (1 << mapBit)))
{
+ START_CRIT_SECTION();
+
map[mapByte] |= (1 << mapBit);
+ MarkBufferDirty(buf);
- if (XLByteLT(PageGetLSN(page), recptr))
+ if (RelationNeedsWAL(rel))
+ {
+ if (XLogRecPtrIsInvalid(recptr))
+ recptr = log_heap_visible(rel->rd_node, heapBlk, buf);
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- MarkBufferDirty(*buf);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
}
- LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
#include "access/transam.h"
#include "utils/snapmgr.h"
+/* Handy constant for an invalid xlog recptr */
+const XLogRecPtr InvalidXLogRecPtr = {0, 0};
/*
* Single-item cache for results of TransactionLogFetch. It's worth having
static XidStatus cachedFetchXidStatus;
static XLogRecPtr cachedCommitLSN;
-/* Handy constant for an invalid xlog recptr */
-static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
-
/* Local functions */
static XidStatus TransactionLogFetch(TransactionId transactionId);
{
char recoveryPath[MAXPGPATH];
char xlogpath[MAXPGPATH];
- XLogRecPtr InvalidXLogRecPtr = {0, 0};
/*
* We are no longer in archive recovery state.
if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
{
- XLogRecPtr InvalidXLogRecPtr = {0, 0};
-
ereport(DEBUG2,
(errmsg("skipping restartpoint, already performed at %X/%X",
lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
visibilitymap_pin(onerel, blkno, &vmbuffer);
LockBuffer(buf, BUFFER_LOCK_SHARE);
if (PageIsAllVisible(page))
- visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
+ visibilitymap_set(onerel, blkno, InvalidXLogRecPtr,
+ vmbuffer);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
* updating the visibility map, but since this case shouldn't
* happen anyway, don't worry about that.
*/
- visibilitymap_clear(onerel, blkno);
+ visibilitymap_pin(onerel, blkno, &vmbuffer);
+ visibilitymap_clear(onerel, blkno, vmbuffer);
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
visibilitymap_pin(onerel, blkno, &vmbuffer);
LockBuffer(buf, BUFFER_LOCK_SHARE);
if (PageIsAllVisible(page))
- visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
+ visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid,
OffsetNumber *offsets, int offcnt);
+extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block,
+ Buffer vm_buffer);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
BlockNumber blk, Page page);
HeapTuple tuple);
extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
Buffer otherBuffer, int options,
- struct BulkInsertStateData * bistate);
+ struct BulkInsertStateData * bistate,
+ Buffer *vmbuffer);
#endif /* HIO_H */
#define XLOG_HEAP2_CLEAN 0x10
/* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */
#define XLOG_HEAP2_CLEANUP_INFO 0x30
+#define XLOG_HEAP2_VISIBLE 0x40
/*
* All what we need to find changed tuple
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+/* This is what we need to know about setting a visibility map bit */
+typedef struct xl_heap_visible
+{
+ RelFileNode node;
+ BlockNumber block;
+} xl_heap_visible;
+
+#define SizeOfHeapVisible (offsetof(xl_heap_visible, block) + sizeof(BlockNumber))
+
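/*
 * Editor's note (not part of the patch): the record payload is small --
 * SizeOfHeapVisible = offsetof(xl_heap_visible, block) + sizeof(BlockNumber)
 *                   = sizeof(RelFileNode) + sizeof(BlockNumber)
 *                   = 12 + 4 = 16 bytes
 * assuming the usual 4-byte Oids and BlockNumbers with no internal padding.
 */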
extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
TransactionId *latestRemovedXid);
/* in transam/varsup.c */
extern PGDLLIMPORT VariableCache ShmemVariableCache;
+/* in transam/transam.c */
+extern const XLogRecPtr InvalidXLogRecPtr;
+
/*
* prototypes for functions in transam/transam.c
#include "storage/buf.h"
#include "utils/relcache.h"
-extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk);
+extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
+ Buffer vmbuf);
extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
Buffer *vmbuf);
+extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
extern void visibilitymap_set(Relation rel, BlockNumber heapBlk,
- XLogRecPtr recptr, Buffer *vmbuf);
+ XLogRecPtr recptr, Buffer vmbuf);
extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
extern void visibilitymap_truncate(Relation rel, BlockNumber heapblk);
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD066 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD067 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{