Implement prefetching via posix_fadvise() for bitmap index scans. A new
GUC variable effective_io_concurrency controls how many concurrent block
prefetch requests will be issued.

author:    Tom Lane <[email protected]>  Mon, 12 Jan 2009 05:10:45 +0000 (05:10 +0000)
committer: Tom Lane <[email protected]>  Mon, 12 Jan 2009 05:10:45 +0000 (05:10 +0000)

(The best way to handle this for plain index scans is still under debate,
so that part is not applied yet --- tgl)

Greg Stark

15 files changed:
doc/src/sgml/config.sgml
src/backend/executor/nodeBitmapHeapscan.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/file/fd.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/nodes/execnodes.h
src/include/pg_config_manual.h
src/include/storage/buf_internals.h
src/include/storage/bufmgr.h
src/include/storage/fd.h
src/include/storage/smgr.h

index 7085f4934a5f2f5eafd1a7444e706d49dfa2ce0c..4a4be1f53669621f16168debf31d0fa665256982 100644 (file)
@@ -1203,6 +1203,55 @@ SET ENABLE_SEQSCAN TO OFF;
       queries.
      </para>
     </sect2>
+
+    <sect2 id="runtime-config-resource-async-behavior">
+     <title>Asynchronous Behavior</title>
+
+     <variablelist>
+      <varlistentry id="guc-effective-io-concurrency" xreflabel="effective_io_concurrency">
+       <term><varname>effective_io_concurrency</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>effective_io_concurrency</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Sets the number of concurrent disk I/O operations that
+         <productname>PostgreSQL</> expects can be executed
+         simultaneously.  Raising this value will increase the number of I/O
+         operations that any individual <productname>PostgreSQL</> session
+         attempts to initiate in parallel.  The allowed range is 1 to 1000,
+         or zero to disable issuance of asynchronous I/O requests.
+        </para>
+
+        <para>
+         A good starting point for this setting is the number of separate
+         drives comprising a RAID 0 stripe or RAID 1 mirror being used for the
+         database.  (For RAID 5 the parity drive should not be counted.)
+         However, if the database is often busy with multiple queries issued in
+         concurrent sessions, lower values may be sufficient to keep the disk
+         array busy.  A value higher than needed to keep the disks busy will
+         only result in extra CPU overhead.
+        </para>
+
+        <para>
+         For more exotic systems, such as memory-based storage or a RAID array
+         that is limited by bus bandwidth, the correct value might be the
+         number of I/O paths available.  Some experimentation may be needed
+         to find the best value.
+        </para>
+
+        <para>
+         Asynchronous I/O depends on an effective <function>posix_fadvise</>
+         function, which some operating systems lack.  If the function is not
+         present then setting this parameter to anything but zero will result
+         in an error.  On some operating systems the function is present but
+         does not actually do anything.  On such systems setting a nonzero
+         value will add CPU overhead without improving performance.
+        </para>
+       </listitem>
+      </varlistentry>
+     </variablelist>
+    </sect2>
    </sect1>
 
    <sect1 id="runtime-config-wal">
index 9ca39f3ff20be993c9bf23ee4dee512b235d8d02..2ef7b887ddf24ee7790ab710a154f04bd61e4416 100644 (file)
@@ -67,6 +67,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
        TIDBitmap  *tbm;
        TBMIterator *tbmiterator;
        TBMIterateResult *tbmres;
+       TBMIterator *prefetch_iterator;
        OffsetNumber targoffset;
        TupleTableSlot *slot;
 
@@ -81,6 +82,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
        tbm = node->tbm;
        tbmiterator = node->tbmiterator;
        tbmres = node->tbmres;
+       prefetch_iterator = node->prefetch_iterator;
 
        /*
         * Check if we are evaluating PlanQual for tuple of this relation.
@@ -114,6 +116,15 @@ BitmapHeapNext(BitmapHeapScanState *node)
        /*
         * If we haven't yet performed the underlying index scan, do it, and
         * begin the iteration over the bitmap.
+        *
+        * For prefetching, we use *two* iterators, one for the pages we are
+        * actually scanning and another that runs ahead of the first for
+        * prefetching.  node->prefetch_pages tracks exactly how many pages
+        * ahead the prefetch iterator is.  Also, node->prefetch_target tracks
+        * the desired prefetch distance, which starts small and increases up
+        * to the GUC-controlled maximum, target_prefetch_pages.  This is to
+        * avoid doing a lot of prefetching in a scan that stops after a few
+        * tuples because of a LIMIT.
         */
        if (tbm == NULL)
        {
@@ -125,6 +136,15 @@ BitmapHeapNext(BitmapHeapScanState *node)
                node->tbm = tbm;
                node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
                node->tbmres = tbmres = NULL;
+
+#ifdef USE_PREFETCH
+               if (target_prefetch_pages > 0)
+               {
+                       node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
+                       node->prefetch_pages = 0;
+                       node->prefetch_target = -1;
+               }
+#endif /* USE_PREFETCH */
        }
 
        for (;;)
@@ -144,6 +164,22 @@ BitmapHeapNext(BitmapHeapScanState *node)
                                break;
                        }
 
+#ifdef USE_PREFETCH
+                       if (node->prefetch_pages > 0)
+                       {
+                               /* The main iterator has closed the distance by one page */
+                               node->prefetch_pages--;
+                       }
+                       else if (prefetch_iterator)
+                       {
+                               /* Do not let the prefetch iterator get behind the main one */
+                               TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+                               if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
+                                       elog(ERROR, "prefetch and main iterators are out of sync");
+                       }
+#endif /* USE_PREFETCH */
+
                        /*
                         * Ignore any claimed entries past what we think is the end of the
                         * relation.  (This is probably not necessary given that we got at
@@ -165,6 +201,23 @@ BitmapHeapNext(BitmapHeapScanState *node)
                         * Set rs_cindex to first slot to examine
                         */
                        scan->rs_cindex = 0;
+
+#ifdef USE_PREFETCH
+                       /*
+                        * Increase prefetch target if it's not yet at the max.  Note
+                        * that we will increase it to zero after fetching the very
+                        * first page/tuple, then to one after the second tuple is
+                        * fetched, then it doubles as later pages are fetched.
+                        */
+                       if (node->prefetch_target >= target_prefetch_pages)
+                               /* don't increase any further */ ;
+                       else if (node->prefetch_target >= target_prefetch_pages / 2)
+                               node->prefetch_target = target_prefetch_pages;
+                       else if (node->prefetch_target > 0)
+                               node->prefetch_target *= 2;
+                       else
+                               node->prefetch_target++;
+#endif /* USE_PREFETCH */
                }
                else
                {
@@ -172,7 +225,40 @@ BitmapHeapNext(BitmapHeapScanState *node)
                         * Continuing in previously obtained page; advance rs_cindex
                         */
                        scan->rs_cindex++;
+
+#ifdef USE_PREFETCH
+                       /*
+                        * Try to prefetch at least a few pages even before we get to the
+                        * second page if we don't stop reading after the first tuple.
+                        */
+                       if (node->prefetch_target < target_prefetch_pages)
+                               node->prefetch_target++;
+#endif /* USE_PREFETCH */
+               }
+
+#ifdef USE_PREFETCH
+               /*
+                * We issue prefetch requests *after* fetching the current page
+                * to try to avoid having prefetching interfere with the main I/O.
+                */
+               if (prefetch_iterator)
+               {
+                       while (node->prefetch_pages < node->prefetch_target)
+                       {
+                               TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+                               if (tbmpre == NULL)
+                               {
+                                       /* No more pages to prefetch */
+                                       tbm_end_iterate(prefetch_iterator);
+                                       node->prefetch_iterator = prefetch_iterator = NULL;
+                                       break;
+                               }
+                               node->prefetch_pages++;
+                               PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+                       }
                }
+#endif /* USE_PREFETCH */
 
                /*
                 * Out of range?  If so, nothing more to look at on this page
@@ -379,11 +465,14 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt)
 
        if (node->tbmiterator)
                tbm_end_iterate(node->tbmiterator);
+       if (node->prefetch_iterator)
+               tbm_end_iterate(node->prefetch_iterator);
        if (node->tbm)
                tbm_free(node->tbm);
        node->tbm = NULL;
        node->tbmiterator = NULL;
        node->tbmres = NULL;
+       node->prefetch_iterator = NULL;
 
        /*
         * Always rescan the input immediately, to ensure we can pass down any
@@ -429,6 +518,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
         */
        if (node->tbmiterator)
                tbm_end_iterate(node->tbmiterator);
+       if (node->prefetch_iterator)
+               tbm_end_iterate(node->prefetch_iterator);
        if (node->tbm)
                tbm_free(node->tbm);
 
@@ -474,6 +565,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
        scanstate->tbm = NULL;
        scanstate->tbmiterator = NULL;
        scanstate->tbmres = NULL;
+       scanstate->prefetch_iterator = NULL;
+       scanstate->prefetch_pages = 0;
+       scanstate->prefetch_target = 0;
 
        /*
         * Miscellaneous initialization
index 88f318f35aef60035a15b1fabb4ca0af36da4468..bd053d503de04f98d202fb2e386529be251bc9fc 100644 (file)
@@ -65,6 +65,13 @@ bool         zero_damaged_pages = false;
 int                    bgwriter_lru_maxpages = 100;
 double         bgwriter_lru_multiplier = 2.0;
 
+/*
+ * How many buffers PrefetchBuffer callers should try to stay ahead of their
+ * ReadBuffer calls by.  This is maintained by the assign hook for
+ * effective_io_concurrency.  Zero means "never prefetch".
+ */
+int                    target_prefetch_pages = 0;
+
 /* local state for StartBufferIO and related functions */
 static volatile BufferDesc *InProgressBuf = NULL;
 static bool IsForInput;
@@ -95,6 +102,56 @@ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 
 
+/*
+ * PrefetchBuffer -- initiate asynchronous read of a block of a relation
+ *
+ * This is named by analogy to ReadBuffer but doesn't actually allocate a
+ * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
+ * block will not be delayed by the I/O.  Prefetching is optional.
+ * No-op if prefetching isn't compiled in.
+ */
+void
+PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
+{
+#ifdef USE_PREFETCH
+       Assert(RelationIsValid(reln));
+       Assert(BlockNumberIsValid(blockNum));
+
+       /* Open it at the smgr level if not already done */
+       RelationOpenSmgr(reln);
+
+       if (reln->rd_istemp)
+       {
+               /* pass it off to localbuf.c */
+               LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
+       }
+       else
+       {
+               BufferTag       newTag;                 /* identity of requested block */
+               uint32          newHash;                /* hash value for newTag */
+               LWLockId        newPartitionLock;               /* buffer partition lock for it */
+               int                     buf_id;
+
+               /* create a tag so we can lookup the buffer */
+               INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode, forkNum, blockNum);
+
+               /* determine its hash code and partition lock ID */
+               newHash = BufTableHashCode(&newTag);
+               newPartitionLock = BufMappingPartitionLock(newHash);
+
+               /* see if the block is in the buffer pool already */
+               LWLockAcquire(newPartitionLock, LW_SHARED);
+               buf_id = BufTableLookup(&newTag, newHash);
+               LWLockRelease(newPartitionLock);
+
+               /* If not in buffers, initiate prefetch */
+               if (buf_id < 0)
+                       smgrprefetch(reln->rd_smgr, forkNum, blockNum);
+       }
+#endif /* USE_PREFETCH */
+}
+
+
 /*
  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
  *             fork with RBM_NORMAL mode and default strategy.
index 16ad174888f5290e8923929c81d316475c4ffdfc..1d52f6ec6ee86fd1204cb0e2aa3c92927864e921 100644 (file)
@@ -52,6 +52,43 @@ static void InitLocalBuffers(void);
 static Block GetLocalBufferStorage(void);
 
 
+/*
+ * LocalPrefetchBuffer -
+ *       initiate asynchronous read of a block of a relation
+ *
+ * Do PrefetchBuffer's work for temporary relations.
+ * No-op if prefetching isn't compiled in.
+ */
+void
+LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
+                                       BlockNumber blockNum)
+{
+#ifdef USE_PREFETCH
+       BufferTag       newTag;                 /* identity of requested block */
+       LocalBufferLookupEnt *hresult;
+
+       INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum);
+
+       /* Initialize local buffers if first request in this session */
+       if (LocalBufHash == NULL)
+               InitLocalBuffers();
+
+       /* See if the desired buffer already exists */
+       hresult = (LocalBufferLookupEnt *)
+               hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL);
+
+       if (hresult)
+       {
+               /* Yes, so nothing to do */
+               return;
+       }
+
+       /* Not in buffers, so initiate prefetch */
+       smgrprefetch(smgr, forkNum, blockNum);
+#endif /* USE_PREFETCH */
+}
+
+
 /*
  * LocalBufferAlloc -
  *       Find or create a local buffer for the given page of the given relation.
index 356a99c2fd9cda3279011a6e498c5059406d83dd..5eb8f5422f5a34bd9ad901d3bd0bcecb1fb646bf 100644 (file)
@@ -1029,6 +1029,42 @@ FileClose(File file)
        FreeVfd(file);
 }
 
+/*
+ * FilePrefetch - initiate asynchronous read of a given range of the file.
+ * The logical seek position is unaffected.
+ *
+ * Currently the only implementation of this function is using posix_fadvise
+ * which is the simplest standardized interface that accomplishes this.
+ * We could add an implementation using libaio in the future; but note that
+ * this API is inappropriate for libaio, which wants to have a buffer provided
+ * to read into.
+ */
+int
+FilePrefetch(File file, off_t offset, int amount)
+{
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
+       int                     returnCode;
+
+       Assert(FileIsValid(file));
+       
+       DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
+                          file, VfdCache[file].fileName,
+                          (int64) offset, amount));
+
+       returnCode = FileAccess(file);
+       if (returnCode < 0)
+               return returnCode;
+
+       returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
+                                                          POSIX_FADV_WILLNEED);
+
+       return returnCode;
+#else
+       Assert(FileIsValid(file));
+       return 0;
+#endif
+}
+
 int
 FileRead(File file, char *buffer, int amount)
 {
index c78d5bdeb6c02453c587dc09d8d9a21553d2d7d3..1b407c923d04253d89ffe4764b7a2b740be02010 100644 (file)
@@ -550,6 +550,26 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
        }
 }
 
+/*
+ *     mdprefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+void
+mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+#ifdef USE_PREFETCH
+       off_t           seekpos;
+       MdfdVec    *v;
+
+       v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
+
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+       Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+       (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
+#endif /* USE_PREFETCH */
+}
+
+
 /*
  *     mdread() -- Read the specified block from a relation.
  */
index f923479598dae6e75f37b7ac07c24e0540183660..e1149f1c1e76b10692ce4a37e2b56f3cee7a5fed 100644 (file)
@@ -48,6 +48,8 @@ typedef struct f_smgr
                                                                bool isRedo);
        void            (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
                                                        BlockNumber blocknum, char *buffer, bool isTemp);
+       void            (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
+                                                                 BlockNumber blocknum);
        void            (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
                                                          BlockNumber blocknum, char *buffer);
        void            (*smgr_write) (SMgrRelation reln, ForkNumber forknum, 
@@ -65,7 +67,7 @@ typedef struct f_smgr
 static const f_smgr smgrsw[] = {
        /* magnetic disk */
        {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
-               mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
+               mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
                mdpreckpt, mdsync, mdpostckpt
        }
 };
@@ -375,6 +377,15 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                                                                           buffer, isTemp);
 }
 
+/*
+ *     smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
+ */
+void
+smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+       (*(smgrsw[reln->smgr_which].smgr_prefetch)) (reln, forknum, blocknum);
+}
+
 /*
  *     smgrread() -- read a particular block from a relation into the supplied
  *                               buffer.
index 36fdd8210abb7f5aeecf00e8d89893be66cb0a17..1feae63e4e6331a318e6646e12177e46291ab74f 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <ctype.h>
 #include <float.h>
+#include <math.h>
 #include <limits.h>
 #include <unistd.h>
 #include <sys/stat.h>
@@ -163,8 +164,9 @@ static bool assign_tcp_keepalives_count(int newval, bool doit, GucSource source)
 static const char *show_tcp_keepalives_idle(void);
 static const char *show_tcp_keepalives_interval(void);
 static const char *show_tcp_keepalives_count(void);
-static bool assign_autovacuum_max_workers(int newval, bool doit, GucSource source);
 static bool assign_maxconnections(int newval, bool doit, GucSource source);
+static bool assign_autovacuum_max_workers(int newval, bool doit, GucSource source);
+static bool assign_effective_io_concurrency(int newval, bool doit, GucSource source);
 static const char *assign_pgstat_temp_directory(const char *newval, bool doit, GucSource source);
 
 static char *config_enum_get_options(struct config_enum *record, 
@@ -413,6 +415,7 @@ static int  segment_size;
 static int     wal_block_size;
 static int     wal_segment_size;
 static bool integer_datetimes;
+static int     effective_io_concurrency;
 
 /* should be static, but commands/variable.c needs to get at these */
 char      *role_string;
@@ -1700,6 +1703,20 @@ static struct config_int ConfigureNamesInt[] =
                100, 0, 1000, NULL, NULL
        },
 
+       {
+               {"effective_io_concurrency", PGC_USERSET, RESOURCES,
+                       gettext_noop("Number of simultaneous requests that can be handled efficiently by the disk subsystem."),
+                       gettext_noop("For RAID arrays, this should be approximately the number of drive spindles in the array.")
+               },
+               &effective_io_concurrency,
+#ifdef USE_PREFETCH
+               1, 0, 1000,
+#else
+               0, 0, 0,
+#endif
+               assign_effective_io_concurrency, NULL
+       },
+
        {
                {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE,
                        gettext_noop("Automatic log file rotation will occur after N minutes."),
@@ -7587,6 +7604,61 @@ assign_autovacuum_max_workers(int newval, bool doit, GucSource source)
        return true;
 }
 
+static bool
+assign_effective_io_concurrency(int newval, bool doit, GucSource source)
+{
+#ifdef USE_PREFETCH
+       double          new_prefetch_pages = 0.0;
+       int                     i;
+
+       /*----------
+        * The user-visible GUC parameter is the number of drives (spindles),
+        * which we need to translate to a number-of-pages-to-prefetch target.
+        *
+        * The expected number of prefetch pages needed to keep N drives busy is:
+        *
+        * drives |   I/O requests
+        * -------+----------------
+        *      1 |   1
+        *      2 |   2/1 + 2/2 = 3
+        *      3 |   3/1 + 3/2 + 3/3 = 5 1/2
+        *      4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+        *      n |   n * H(n)
+        *
+        * This is called the "coupon collector problem" and H(n) is called the
+        * harmonic series.  This could be approximated by n * ln(n), but for
+        * reasonable numbers of drives we might as well just compute the series.
+        *
+        * Alternatively we could set the target to the number of pages necessary
+        * so that the expected number of active spindles is some arbitrary
+        * percentage of the total.  This sounds the same but is actually slightly
+        * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
+        * that desired fraction.
+        *
+        * Experimental results show that both of these formulas aren't aggressive
+        * enough, but we don't really have any better proposals.
+        *
+        * Note that if newval = 0 (disabled), we must set target = 0.
+        *----------
+        */
+
+       for (i = 1; i <= newval; i++)
+               new_prefetch_pages += (double) newval / (double) i;
+
+       /* This range check shouldn't fail, but let's be paranoid */
+       if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+       {
+               if (doit)
+                       target_prefetch_pages = (int) rint(new_prefetch_pages);
+               return true;
+       }
+       else
+               return false;
+#else
+       return true;
+#endif /* USE_PREFETCH */
+}
+
 static const char *
 assign_pgstat_temp_directory(const char *newval, bool doit, GucSource source)
 {
index ffa5055b76889b56bcdfd09a81f1fc29ee8af975..977e13e0aff54e700567b211cf9d734524453294 100644 (file)
 #bgwriter_lru_maxpages = 100           # 0-1000 max buffers written/round
 #bgwriter_lru_multiplier = 2.0         # 0-10.0 multipler on buffers scanned/round
 
+# - Asynchronous Behavior -
+
+#effective_io_concurrency = 1          # 1-1000, or 0 to disable prefetching
+
 
 #------------------------------------------------------------------------------
 # WRITE AHEAD LOG
index dd661d73f291171058fe6b575ac84da912c04a91..1e658790b4e72bcd6522b5b65ebde2d0f9f7d27a 100644 (file)
@@ -1154,6 +1154,9 @@ typedef struct BitmapIndexScanState
  *             tbm                                bitmap obtained from child index scan(s)
  *             tbmiterator                iterator for scanning current pages
  *             tbmres                     current-page data
+ *             prefetch_iterator  iterator for prefetching ahead of current page
+ *             prefetch_pages     # pages prefetch iterator is ahead of current
+ *             prefetch_target    target prefetch distance
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1163,6 +1166,9 @@ typedef struct BitmapHeapScanState
        TIDBitmap  *tbm;
        TBMIterator *tbmiterator;
        TBMIterateResult *tbmres;
+       TBMIterator *prefetch_iterator;
+       int                     prefetch_pages;
+       int                     prefetch_target;
 } BitmapHeapScanState;
 
 /* ----------------
index 1df610daf73bb5d0369b2296950f20317eb6ac78..b68bfe22b4210a2a8f7198d59d791df2507574f5 100644 (file)
 #define USE_POSIX_FADVISE
 #endif
 
+/*
+ * USE_PREFETCH code should be compiled only if we have a way to implement
+ * prefetching.  (This is decoupled from USE_POSIX_FADVISE because there
+ * might in future be support for alternative low-level prefetch APIs.)
+ */
+#ifdef USE_POSIX_FADVISE
+#define USE_PREFETCH
+#endif
+
 /*
  * This is the default directory in which AF_UNIX socket files are
  * placed.     Caution: changing this risks breaking your existing client
index 739e9986efa4b8f6a4c1b21f9984296ab02b97aa..3991b47e458560c5269b79ae3bdd050831c85aae 100644 (file)
@@ -208,7 +208,9 @@ extern int  BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
 
 /* localbuf.c */
-extern BufferDesc *LocalBufferAlloc(SMgrRelation reln, ForkNumber forkNum,
+extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
+                                                               BlockNumber blockNum);
+extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
                                 BlockNumber blockNum, bool *foundPtr);
 extern void MarkLocalBufferDirty(Buffer buffer);
 extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
index 1a5ec4e05059d498c46e1cf84bed092d11bf861a..42766e99f7c66dfa5cc8b96c8918333ae1f9390b 100644 (file)
@@ -47,6 +47,7 @@ extern PGDLLIMPORT int NBuffers;
 extern bool zero_damaged_pages;
 extern int     bgwriter_lru_maxpages;
 extern double bgwriter_lru_multiplier;
+extern int     target_prefetch_pages;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
@@ -152,6 +153,8 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 /*
  * s for functions in bufmgr.c
  */
+extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
+                                                  BlockNumber blockNum);
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
 extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
                                                                 BlockNumber blockNum, ReadBufferMode mode,
index 4e7a1031a5105b6210c4aba76d5afca9ea213120..462f6d129192c4cbe2866b433106aec5c7368aba 100644 (file)
@@ -62,6 +62,7 @@ extern int    max_files_per_process;
 extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
 extern File OpenTemporaryFile(bool interXact);
 extern void FileClose(File file);
+extern int     FilePrefetch(File file, off_t offset, int amount);
 extern int     FileRead(File file, char *buffer, int amount);
 extern int     FileWrite(File file, char *buffer, int amount);
 extern int     FileSync(File file);
index 6a764802ca0acf3ab209dc13403c20e5b8524d04..22a4391abd019522558f59c0a3fc94ff14c9d06f 100644 (file)
@@ -70,6 +70,8 @@ extern void smgrdounlink(SMgrRelation reln, ForkNumber forknum,
                                                 bool isTemp, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum, 
                                           BlockNumber blocknum, char *buffer, bool isTemp);
+extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
+                                                BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
                                         BlockNumber blocknum, char *buffer);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
@@ -93,6 +95,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileNode rnode, ForkNumber forknum, bool isRedo);
 extern void mdextend(SMgrRelation reln, ForkNumber forknum,
                                         BlockNumber blocknum, char *buffer, bool isTemp);
+extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
+                                          BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                   char *buffer);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,