/* Do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                  bloomBuildCallback, (void *) &buildstate);
+                                  bloomBuildCallback, (void *) &buildstate,
+                                  NULL);
 
    /*
     * There could be some items in the cached page.  Flush this page if
 
 
         <para>
          When changing this value, consider also adjusting
-         <xref linkend="guc-max-parallel-workers"/> and
+         <xref linkend="guc-max-parallel-workers"/>,
+         <xref linkend="guc-max-parallel-workers-maintenance"/>, and
          <xref linkend="guc-max-parallel-workers-per-gather"/>.
         </para>
        </listitem>
       </varlistentry>
 
+      <varlistentry id="guc-max-parallel-workers-maintenance" xreflabel="max_parallel_maintenance_workers">
+       <term><varname>max_parallel_maintenance_workers</varname> (<type>integer</type>)
+       <indexterm>
+        <primary><varname>max_parallel_maintenance_workers</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Sets the maximum number of parallel workers that can be
+         started by a single utility command.  Currently, the only
+         parallel utility command that supports the use of parallel
+         workers is <command>CREATE INDEX</command>, and only when
+         building a B-tree index.  Parallel workers are taken from the
+         pool of processes established by <xref
+         linkend="guc-max-worker-processes"/>, limited by <xref
+         linkend="guc-max-parallel-workers"/>.  Note that the requested
+         number of workers may not actually be available at runtime.
+         If this occurs, the utility operation will run with fewer
+         workers than expected.  The default value is 2.  Setting this
+         value to 0 disables the use of parallel workers by utility
+         commands.
+        </para>
+
+        <para>
+         Note that parallel utility commands should not consume
+         substantially more memory than equivalent non-parallel
+         operations.  This strategy differs from that of parallel
+         query, where resource limits generally apply per worker
+         process.  Parallel utility commands treat the resource limit
+         <varname>maintenance_work_mem</varname> as a limit to be applied to
+         the entire utility command, regardless of the number of
+         parallel worker processes.  However, parallel utility
+         commands may still consume substantially more CPU resources
+         and I/O bandwidth.
+        </para>
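+
+        <para>
+         As a minimal illustration (the table and index names are
+         hypothetical), the following disables parallel workers for a
+         single index build in the current session:
+        </para>
+<programlisting>
+SET max_parallel_maintenance_workers = 0;
+CREATE INDEX measurements_ts_idx ON measurements (ts);  -- hypothetical names
+</programlisting>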
+       </listitem>
+      </varlistentry>
+
       <varlistentry id="guc-max-parallel-workers" xreflabel="max_parallel_workers">
        <term><varname>max_parallel_workers</varname> (<type>integer</type>)
        <indexterm>
        <listitem>
         <para>
          Sets the maximum number of workers that the system can support for
-         parallel queries.  The default value is 8.  When increasing or
+         parallel operations.  The default value is 8.  When increasing or
          decreasing this value, consider also adjusting
+         <xref linkend="guc-max-parallel-workers-maintenance"/> and
          <xref linkend="guc-max-parallel-workers-per-gather"/>.
          Also, note that a setting for this value which is higher than
          <xref linkend="guc-max-worker-processes"/> will have no effect,
 
          <entry>Waiting in an extension.</entry>
         </row>
         <row>
-         <entry morerows="32"><literal>IPC</literal></entry>
+         <entry morerows="33"><literal>IPC</literal></entry>
          <entry><literal>BgWorkerShutdown</literal></entry>
          <entry>Waiting for background worker to shut down.</entry>
         </row>
          <entry><literal>ParallelBitmapScan</literal></entry>
          <entry>Waiting for parallel bitmap scan to become initialized.</entry>
         </row>
+        <row>
+         <entry><literal>ParallelCreateIndexScan</literal></entry>
+         <entry>Waiting for parallel <command>CREATE INDEX</command> workers to finish heap scan.</entry>
+        </row>
         <row>
          <entry><literal>ProcArrayGroupUpdate</literal></entry>
          <entry>Waiting for group leader to clear transaction id at transaction end.</entry>
     </row>
     <row>
      <entry><literal>sort-start</literal></entry>
-     <entry><literal>(int, bool, int, int, bool)</literal></entry>
+     <entry><literal>(int, bool, int, int, bool, int)</literal></entry>
      <entry>Probe that fires when a sort operation is started.
       arg0 indicates heap, index or datum sort.
       arg1 is true for unique-value enforcement.
       arg2 is the number of key columns.
       arg3 is the number of kilobytes of work memory allowed.
-      arg4 is true if random access to the sort result is required.</entry>
+      arg4 is true if random access to the sort result is required.
+      arg5 indicates serial when <literal>0</literal>, parallel worker when
+      <literal>1</literal>, or parallel leader when <literal>2</literal>.</entry>
     </row>
     <row>
      <entry><literal>sort-done</literal></entry>
 
    which would drive the machine into swapping.
   </para>
 
+  <para>
+   <productname>PostgreSQL</productname> can build indexes while
+   leveraging multiple CPUs in order to process the table rows faster.
+   This feature is known as <firstterm>parallel index
+   build</firstterm>.  For index methods that support building indexes
+   in parallel (currently, only B-tree),
+   <varname>maintenance_work_mem</varname> specifies the maximum
+   amount of memory that can be used by each index build operation as
+   a whole, regardless of how many worker processes were started.
+   Generally, a cost model automatically determines how many worker
+   processes should be requested, if any.
+  </para>
+
+  <para>
+   Parallel index builds may benefit from increasing
+   <varname>maintenance_work_mem</varname> where an equivalent serial
+   index build will see little or no benefit.  Note that
+   <varname>maintenance_work_mem</varname> may influence the number of
+   worker processes requested, since parallel workers must have at
+   least a <literal>32MB</literal> share of the total
+   <varname>maintenance_work_mem</varname> budget.  There must also be
+   a remaining <literal>32MB</literal> share for the leader process.
+   Increasing <xref linkend="guc-max-parallel-workers-maintenance"/>
+   may allow more workers to be used, which will reduce the time
+   needed for index creation, so long as the index build is not
+   already I/O bound.  Of course, there should also be sufficient
+   CPU capacity that would otherwise lie idle.
+  </para>
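+
+  <para>
+   As a rough illustration only (the table and index names here are
+   hypothetical), a session might raise both limits before building a
+   large index:
+  </para>
+
+<programlisting>
+SET maintenance_work_mem = '1GB';
+SET max_parallel_maintenance_workers = 4;
+CREATE INDEX measurements_ts_idx ON measurements (ts);  -- hypothetical names
+</programlisting>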
+
+  <para>
+   Setting a value for <literal>parallel_workers</literal> via <xref
+   linkend="sql-altertable"/> directly controls how many parallel
+   worker processes will be requested by a <command>CREATE
+   INDEX</command> against the table.  This bypasses the cost model
+   completely, and prevents <varname>maintenance_work_mem</varname>
+   from affecting how many parallel workers are requested.  Setting
+   <literal>parallel_workers</literal> to 0 via <command>ALTER
+   TABLE</command> will disable parallel index builds on the table in
+   all cases.
+  </para>
+
+  <tip>
+   <para>
+    You might want to reset <literal>parallel_workers</literal> after
+    setting it as part of tuning an index build.  This avoids
+    inadvertent changes to query plans, since
+    <literal>parallel_workers</literal> affects
+    <emphasis>all</emphasis> parallel table scans.
+   </para>
+  </tip>
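+
+  <para>
+   For example, to request a fixed number of workers for one index build on
+   a hypothetical table and then restore the default behavior (a minimal
+   sketch; the names are not from any real schema):
+  </para>
+
+<programlisting>
+ALTER TABLE measurements SET (parallel_workers = 4);
+CREATE INDEX measurements_ts_idx ON measurements (ts);  -- hypothetical names
+ALTER TABLE measurements RESET (parallel_workers);
+</programlisting>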
+
+  <para>
+   While <command>CREATE INDEX</command> with the
+   <literal>CONCURRENTLY</literal> option supports parallel builds
+   without special restrictions, only the first table scan is actually
+   performed in parallel.
+  </para>
+
   <para>
    Use <xref linkend="sql-dropindex"/>
    to remove an index.
 
       This sets the number of workers that should be used to assist a parallel
       scan of this table.  If not set, the system will determine a value based
       on the relation size.  The actual number of workers chosen by the planner
-      may be less, for example due to
-      the setting of <xref linkend="guc-max-worker-processes"/>.
+      or by utility statements that use parallel scans may be less, for example
+      due to the setting of <xref linkend="guc-max-worker-processes"/>.
      </para>
     </listitem>
    </varlistentry>
 
     * heap blocks in physical order.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
-                                  brinbuildCallback, (void *) state);
+                                  brinbuildCallback, (void *) state, NULL);
 
    /* process the final batch */
    form_and_insert_tuple(state);
    state->bs_currRangeStart = heapBlk;
    IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, true,
                            heapBlk, scanNumBlks,
-                           brinbuildCallback, (void *) state);
+                           brinbuildCallback, (void *) state, NULL);
 
    /*
     * Now we update the values obtained by the scan with the placeholder
 
     * prefers to receive tuples in TID order.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
-                                  ginBuildCallback, (void *) &buildstate);
+                                  ginBuildCallback, (void *) &buildstate, NULL);
 
    /* dump remaining entries to the index */
    oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
 
     * Do the heap scan.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                  gistBuildCallback, (void *) &buildstate);
+                                  gistBuildCallback, (void *) &buildstate, NULL);
 
    /*
     * If buffering was used, flush out all the tuples that are still in the
 
 
    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                  hashbuildCallback, (void *) &buildstate);
+                                  hashbuildCallback, (void *) &buildstate, NULL);
 
    if (buildstate.spool)
    {
 
                                                   hspool->low_mask,
                                                   hspool->max_buckets,
                                                   maintenance_work_mem,
+                                                  NULL,
                                                   false);
 
    return hspool;
 
    SpinLockInit(&target->phs_mutex);
    target->phs_startblock = InvalidBlockNumber;
    pg_atomic_init_u64(&target->phs_nallocated, 0);
-   SerializeSnapshot(snapshot, target->phs_snapshot_data);
+   if (IsMVCCSnapshot(snapshot))
+   {
+       SerializeSnapshot(snapshot, target->phs_snapshot_data);
+       target->phs_snapshot_any = false;
+   }
+   else
+   {
+       Assert(snapshot == SnapshotAny);
+       target->phs_snapshot_any = true;
+   }
 }
 
 /* ----------------
    Snapshot    snapshot;
 
    Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
-   snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
-   RegisterSnapshot(snapshot);
+
+   if (!parallel_scan->phs_snapshot_any)
+   {
+       /* Snapshot was serialized -- restore it */
+       snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
+       RegisterSnapshot(snapshot);
+   }
+   else
+   {
+       /* SnapshotAny passed by caller (not serialized) */
+       snapshot = SnapshotAny;
+   }
 
    return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
-                                  true, true, true, false, false, true);
+                                  true, true, true, false, false,
+                                  !parallel_scan->phs_snapshot_any);
 }
 
 /* ----------------
 
 #include "access/nbtree.h"
 #include "access/relscan.h"
 #include "access/xlog.h"
-#include "catalog/index.h"
 #include "commands/vacuum.h"
+#include "nodes/execnodes.h"
 #include "pgstat.h"
 #include "storage/condition_variable.h"
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
 #include "storage/smgr.h"
-#include "tcop/tcopprot.h"     /* pgrminclude ignore */
 #include "utils/builtins.h"
 #include "utils/index_selfuncs.h"
 #include "utils/memutils.h"
 
 
-/* Working state for btbuild and its callback */
-typedef struct
-{
-   bool        isUnique;
-   bool        haveDead;
-   Relation    heapRel;
-   BTSpool    *spool;
-
-   /*
-    * spool2 is needed only when the index is a unique index. Dead tuples are
-    * put into spool2 instead of spool in order to avoid uniqueness check.
-    */
-   BTSpool    *spool2;
-   double      indtuples;
-} BTBuildState;
-
 /* Working state needed by btvacuumpage */
 typedef struct
 {
 typedef struct BTParallelScanDescData *BTParallelScanDesc;
 
 
-static void btbuildCallback(Relation index,
-               HeapTuple htup,
-               Datum *values,
-               bool *isnull,
-               bool tupleIsAlive,
-               void *state);
 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid);
    PG_RETURN_POINTER(amroutine);
 }
 
-/*
- * btbuild() -- build a new btree index.
- */
-IndexBuildResult *
-btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
-{
-   IndexBuildResult *result;
-   double      reltuples;
-   BTBuildState buildstate;
-
-   buildstate.isUnique = indexInfo->ii_Unique;
-   buildstate.haveDead = false;
-   buildstate.heapRel = heap;
-   buildstate.spool = NULL;
-   buildstate.spool2 = NULL;
-   buildstate.indtuples = 0;
-
-#ifdef BTREE_BUILD_STATS
-   if (log_btree_build_stats)
-       ResetUsage();
-#endif                         /* BTREE_BUILD_STATS */
-
-   /*
-    * We expect to be called exactly once for any index relation. If that's
-    * not the case, big trouble's what we have.
-    */
-   if (RelationGetNumberOfBlocks(index) != 0)
-       elog(ERROR, "index \"%s\" already contains data",
-            RelationGetRelationName(index));
-
-   buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
-
-   /*
-    * If building a unique index, put dead tuples in a second spool to keep
-    * them out of the uniqueness check.
-    */
-   if (indexInfo->ii_Unique)
-       buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
-
-   /* do the heap scan */
-   reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                  btbuildCallback, (void *) &buildstate);
-
-   /* okay, all heap tuples are indexed */
-   if (buildstate.spool2 && !buildstate.haveDead)
-   {
-       /* spool2 turns out to be unnecessary */
-       _bt_spooldestroy(buildstate.spool2);
-       buildstate.spool2 = NULL;
-   }
-
-   /*
-    * Finish the build by (1) completing the sort of the spool file, (2)
-    * inserting the sorted tuples into btree pages and (3) building the upper
-    * levels.
-    */
-   _bt_leafbuild(buildstate.spool, buildstate.spool2);
-   _bt_spooldestroy(buildstate.spool);
-   if (buildstate.spool2)
-       _bt_spooldestroy(buildstate.spool2);
-
-#ifdef BTREE_BUILD_STATS
-   if (log_btree_build_stats)
-   {
-       ShowUsage("BTREE BUILD STATS");
-       ResetUsage();
-   }
-#endif                         /* BTREE_BUILD_STATS */
-
-   /*
-    * Return statistics
-    */
-   result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-
-   result->heap_tuples = reltuples;
-   result->index_tuples = buildstate.indtuples;
-
-   return result;
-}
-
-/*
- * Per-tuple callback from IndexBuildHeapScan
- */
-static void
-btbuildCallback(Relation index,
-               HeapTuple htup,
-               Datum *values,
-               bool *isnull,
-               bool tupleIsAlive,
-               void *state)
-{
-   BTBuildState *buildstate = (BTBuildState *) state;
-
-   /*
-    * insert the index tuple into the appropriate spool file for subsequent
-    * processing
-    */
-   if (tupleIsAlive || buildstate->spool2 == NULL)
-       _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
-   else
-   {
-       /* dead tuples are put into spool2 */
-       buildstate->haveDead = true;
-       _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
-   }
-
-   buildstate->indtuples += 1;
-}
-
 /*
  * btbuildempty() -- build an empty btree index in the initialization fork
  */
 
 #include "postgres.h"
 
 #include "access/nbtree.h"
+#include "access/parallel.h"
+#include "access/relscan.h"
+#include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "catalog/index.h"
 #include "miscadmin.h"
+#include "pgstat.h"
 #include "storage/smgr.h"
-#include "tcop/tcopprot.h"
+#include "tcop/tcopprot.h"     /* pgrminclude ignore */
 #include "utils/rel.h"
 #include "utils/sortsupport.h"
 #include "utils/tuplesort.h"
 
 
+/* Magic numbers for parallel state sharing */
+#define PARALLEL_KEY_BTREE_SHARED      UINT64CONST(0xA000000000000001)
+#define PARALLEL_KEY_TUPLESORT         UINT64CONST(0xA000000000000002)
+#define PARALLEL_KEY_TUPLESORT_SPOOL2  UINT64CONST(0xA000000000000003)
+
+/*
+ * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
+ * parallel index builds.  This may be useful as a debugging aid.
+#undef DISABLE_LEADER_PARTICIPATION
+ */
+
 /*
  * Status record for spooling/sorting phase.  (Note we may have two of
  * these due to the special requirements for uniqueness-checking with
  * dead tuples.)
  */
-struct BTSpool
+typedef struct BTSpool
 {
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    heap;
    Relation    index;
    bool        isunique;
-};
+} BTSpool;
+
+/*
+ * Status for index builds performed in parallel.  This is allocated in a
+ * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
+ * entry, private to tuplesort.c but allocated by this module on its behalf.
+ */
+typedef struct BTShared
+{
+   /*
+    * These fields are not modified during the sort.  They primarily exist
+    * for the benefit of worker processes that need to create BTSpool state
+    * corresponding to that used by the leader.
+    */
+   Oid         heaprelid;
+   Oid         indexrelid;
+   bool        isunique;
+   bool        isconcurrent;
+   int         scantuplesortstates;
+
+   /*
+    * workersdonecv is used to monitor the progress of workers.  All parallel
+    * participants must indicate that they are done before leader can use
+    * mutable state that workers maintain during scan (and before leader can
+    * proceed to tuplesort_performsort()).
+    */
+   ConditionVariable workersdonecv;
+
+   /*
+    * mutex protects all fields before heapdesc.
+    *
+    * These fields contain status information of interest to B-Tree index
+    * builds that must work just the same when an index is built in parallel.
+    */
+   slock_t     mutex;
+
+   /*
+    * Mutable state that is maintained by workers, and reported back to
+    * leader at end of parallel scan.
+    *
+    * nparticipantsdone is number of worker processes finished.
+    *
+    * reltuples is the total number of input heap tuples.
+    *
+    * havedead indicates if RECENTLY_DEAD tuples were encountered during
+    * build.
+    *
+    * indtuples is the total number of tuples that made it into the index.
+    *
+    * brokenhotchain indicates if any worker detected a broken HOT chain
+    * during build.
+    */
+   int         nparticipantsdone;
+   double      reltuples;
+   bool        havedead;
+   double      indtuples;
+   bool        brokenhotchain;
+
+   /*
+    * This variable-sized field must come last.
+    *
+    * See _bt_parallel_estimate_shared().
+    */
+   ParallelHeapScanDescData heapdesc;
+} BTShared;
+
+/*
+ * Status for leader in parallel index build.
+ */
+typedef struct BTLeader
+{
+   /* parallel context itself */
+   ParallelContext *pcxt;
+
+   /*
+    * nparticipanttuplesorts is the exact number of worker processes
+    * successfully launched, plus one leader process if it participates as a
+    * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
+    * participating as a worker).
+    */
+   int         nparticipanttuplesorts;
+
+   /*
+    * Leader process convenience pointers to shared state (leader avoids TOC
+    * lookups).
+    *
+    * btshared is the shared state for entire build.  sharedsort is the
+    * shared, tuplesort-managed state passed to each process tuplesort.
+    * sharedsort2 is the corresponding btspool2 shared state, used only when
+    * building unique indexes.  snapshot is the snapshot used by the scan iff
+    * an MVCC snapshot is required.
+    */
+   BTShared   *btshared;
+   Sharedsort *sharedsort;
+   Sharedsort *sharedsort2;
+   Snapshot    snapshot;
+} BTLeader;
+
+/*
+ * Working state for btbuild and its callback.
+ *
+ * When parallel CREATE INDEX is used, there is a BTBuildState for each
+ * participant.
+ */
+typedef struct BTBuildState
+{
+   bool        isunique;
+   bool        havedead;
+   Relation    heap;
+   BTSpool    *spool;
+
+   /*
+    * spool2 is needed only when the index is a unique index. Dead tuples are
+    * put into spool2 instead of spool in order to avoid uniqueness check.
+    */
+   BTSpool    *spool2;
+   double      indtuples;
+
+   /*
+    * btleader is only present when a parallel index build is performed, and
+    * only in the leader process. (Actually, only the leader has a
+    * BTBuildState.  Workers have their own spool and spool2, though.)
+    */
+   BTLeader   *btleader;
+} BTBuildState;
 
 /*
  * Status record for a btree page being built.  We have one of these
 } BTWriteState;
 
 
+static double _bt_spools_heapscan(Relation heap, Relation index,
+                   BTBuildState *buildstate, IndexInfo *indexInfo);
+static void _bt_spooldestroy(BTSpool *btspool);
+static void _bt_spool(BTSpool *btspool, ItemPointer self,
+         Datum *values, bool *isnull);
+static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
+static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
+                  bool *isnull, bool tupleIsAlive, void *state);
 static Page _bt_blnewpage(uint32 level);
 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
 static void _bt_slideleft(Page page);
 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
 static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);
+static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
+                  int request);
+static void _bt_end_parallel(BTLeader *btleader);
+static Size _bt_parallel_estimate_shared(Snapshot snapshot);
+static double _bt_parallel_heapscan(BTBuildState *buildstate,
+                     bool *brokenhotchain);
+static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
+static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
+                          BTShared *btshared, Sharedsort *sharedsort,
+                          Sharedsort *sharedsort2, int sortmem);
 
 
 /*
- * Interface routines
+ * btbuild() -- build a new btree index.
  */
+IndexBuildResult *
+btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+   IndexBuildResult *result;
+   BTBuildState buildstate;
+   double      reltuples;
+
+#ifdef BTREE_BUILD_STATS
+   if (log_btree_build_stats)
+       ResetUsage();
+#endif                         /* BTREE_BUILD_STATS */
+
+   buildstate.isunique = indexInfo->ii_Unique;
+   buildstate.havedead = false;
+   buildstate.heap = heap;
+   buildstate.spool = NULL;
+   buildstate.spool2 = NULL;
+   buildstate.indtuples = 0;
+   buildstate.btleader = NULL;
+
+   /*
+    * We expect to be called exactly once for any index relation. If that's
+    * not the case, big trouble's what we have.
+    */
+   if (RelationGetNumberOfBlocks(index) != 0)
+       elog(ERROR, "index \"%s\" already contains data",
+            RelationGetRelationName(index));
+
+   reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
+
+   /*
+    * Finish the build by (1) completing the sort of the spool file, (2)
+    * inserting the sorted tuples into btree pages and (3) building the upper
+    * levels.  Finally, it may also be necessary to end use of parallelism.
+    */
+   _bt_leafbuild(buildstate.spool, buildstate.spool2);
+   _bt_spooldestroy(buildstate.spool);
+   if (buildstate.spool2)
+       _bt_spooldestroy(buildstate.spool2);
+   if (buildstate.btleader)
+       _bt_end_parallel(buildstate.btleader);
+
+   result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+   result->heap_tuples = reltuples;
+   result->index_tuples = buildstate.indtuples;
+
+#ifdef BTREE_BUILD_STATS
+   if (log_btree_build_stats)
+   {
+       ShowUsage("BTREE BUILD STATS");
+       ResetUsage();
+   }
+#endif                         /* BTREE_BUILD_STATS */
 
+   return result;
+}
 
 /*
- * create and initialize a spool structure
+ * Create and initialize one or two spool structures, and save them in caller's
+ * buildstate argument.  May also fill in fields within indexInfo used by index
+ * builds.
+ *
+ * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
+ * routine encapsulates all aspects of managing parallelism.  Caller need only
+ * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
+ *
+ * Returns the total number of heap tuples scanned.
  */
-BTSpool *
-_bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
+static double
+_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
+                   IndexInfo *indexInfo)
 {
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
-   int         btKbytes;
+   SortCoordinate coordinate = NULL;
+   double      reltuples = 0;
 
+   /*
+    * We size the sort area as maintenance_work_mem rather than work_mem to
+    * speed index creation.  This should be OK since a single backend can't
+    * run multiple index creations in parallel (see also: notes on
+    * parallelism and maintenance_work_mem below).
+    */
    btspool->heap = heap;
    btspool->index = index;
-   btspool->isunique = isunique;
+   btspool->isunique = indexInfo->ii_Unique;
+
+   /* Save as primary spool */
+   buildstate->spool = btspool;
+
+   /* Attempt to launch parallel worker scan when required */
+   if (indexInfo->ii_ParallelWorkers > 0)
+       _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
+                          indexInfo->ii_ParallelWorkers);
 
    /*
-    * We size the sort area as maintenance_work_mem rather than work_mem to
-    * speed index creation.  This should be OK since a single backend can't
-    * run multiple index creations in parallel.  Note that creation of a
-    * unique index actually requires two BTSpool objects.  We expect that the
-    * second one (for dead tuples) won't get very full, so we give it only
-    * work_mem.
+    * If parallel build requested and at least one worker process was
+    * successfully launched, set up coordination state
     */
-   btKbytes = isdead ? work_mem : maintenance_work_mem;
-   btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
-                                                    btKbytes, false);
+   if (buildstate->btleader)
+   {
+       coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
+       coordinate->isWorker = false;
+       coordinate->nParticipants =
+           buildstate->btleader->nparticipanttuplesorts;
+       coordinate->sharedsort = buildstate->btleader->sharedsort;
+   }
 
-   return btspool;
+   /*
+    * Begin serial/leader tuplesort.
+    *
+    * In cases where parallelism is involved, the leader receives the same
+    * share of maintenance_work_mem as a serial sort (it is generally treated
+    * in the same way as a serial sort once we return).  Parallel worker
+    * Tuplesortstates will have received only a fraction of
+    * maintenance_work_mem, though.
+    *
+    * We rely on the lifetime of the Leader Tuplesortstate almost not
+    * overlapping with any worker Tuplesortstate's lifetime.  There may be
+    * some small overlap, but that's okay because we rely on leader
+    * Tuplesortstate only allocating a small, fixed amount of memory here.
+    * When its tuplesort_performsort() is called (by our caller), and
+    * significant amounts of memory are likely to be used, all workers must
+    * have already freed almost all memory held by their Tuplesortstates
+    * (they are about to go away completely, too).  The overall effect is
+    * that maintenance_work_mem always represents an absolute high watermark
+    * on the amount of memory used by a CREATE INDEX operation, regardless of
+    * the use of parallelism or any other factor.
+    */
+   buildstate->spool->sortstate =
+       tuplesort_begin_index_btree(heap, index, buildstate->isunique,
+                                   maintenance_work_mem, coordinate,
+                                   false);
+
+   /*
+    * If building a unique index, put dead tuples in a second spool to keep
+    * them out of the uniqueness check.  We expect that the second spool (for
+    * dead tuples) won't get very full, so we give it only work_mem.
+    */
+   if (indexInfo->ii_Unique)
+   {
+       BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
+       SortCoordinate coordinate2 = NULL;
+
+       /* Initialize secondary spool */
+       btspool2->heap = heap;
+       btspool2->index = index;
+       btspool2->isunique = false;
+       /* Save as secondary spool */
+       buildstate->spool2 = btspool2;
+
+       if (buildstate->btleader)
+       {
+           /*
+            * Set up non-private state that is passed to
+            * tuplesort_begin_index_btree() about the basic high level
+            * coordination of a parallel sort.
+            */
+           coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
+           coordinate2->isWorker = false;
+           coordinate2->nParticipants =
+               buildstate->btleader->nparticipanttuplesorts;
+           coordinate2->sharedsort = buildstate->btleader->sharedsort2;
+       }
+
+       /*
+        * We expect that the second one (for dead tuples) won't get very
+        * full, so we give it only work_mem
+        */
+       buildstate->spool2->sortstate =
+           tuplesort_begin_index_btree(heap, index, false, work_mem,
+                                       coordinate2, false);
+   }
+
+   /* Fill spool using either serial or parallel heap scan */
+   if (!buildstate->btleader)
+       reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+                                      _bt_build_callback, (void *) buildstate,
+                                      NULL);
+   else
+       reltuples = _bt_parallel_heapscan(buildstate,
+                                         &indexInfo->ii_BrokenHotChain);
+
+   /* okay, all heap tuples are spooled */
+   if (buildstate->spool2 && !buildstate->havedead)
+   {
+       /* spool2 turns out to be unnecessary */
+       _bt_spooldestroy(buildstate->spool2);
+       buildstate->spool2 = NULL;
+   }
+
+   return reltuples;
 }
 
 /*
  * clean up a spool structure and its substructures.
  */
-void
+static void
 _bt_spooldestroy(BTSpool *btspool)
 {
    tuplesort_end(btspool->sortstate);
 /*
  * spool an index entry into the sort file.
  */
-void
+static void
 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
 {
    tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
  * given a spool loaded by successive calls to _bt_spool,
  * create an entire btree.
  */
-void
+static void
 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
 {
    BTWriteState wstate;
    _bt_load(&wstate, btspool, btspool2);
 }
 
-
 /*
- * Internal routines.
+ * Per-tuple callback from IndexBuildHeapScan
  */
+static void
+_bt_build_callback(Relation index,
+                  HeapTuple htup,
+                  Datum *values,
+                  bool *isnull,
+                  bool tupleIsAlive,
+                  void *state)
+{
+   BTBuildState *buildstate = (BTBuildState *) state;
 
+   /*
+    * insert the index tuple into the appropriate spool file for subsequent
+    * processing
+    */
+   if (tupleIsAlive || buildstate->spool2 == NULL)
+       _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
+   else
+   {
+       /* dead tuples are put into spool2 */
+       buildstate->havedead = true;
+       _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
+   }
+
+   buildstate->indtuples += 1;
+}
 
 /*
  * allocate workspace for a new, clean btree page, not linked to any siblings.
        smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
    }
 }
+
+/*
+ * Create parallel context, and launch workers for leader.
+ *
+ * buildstate argument should be initialized (with the exception of the
+ * tuplesort state in spools, which may later be created based on shared
+ * state initially set up here).
+ *
+ * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
+ *
+ * request is the target number of parallel worker processes to launch.
+ *
+ * Sets buildstate's BTLeader, which caller must use to shut down parallel
+ * mode by passing it to _bt_end_parallel() at the very end of its index
+ * build.  If not even a single worker process can be launched, this is
+ * never set, and caller should proceed with a serial index build.
+ */
+static void
+_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
+{
+   ParallelContext *pcxt;
+   int         scantuplesortstates;
+   Snapshot    snapshot;
+   Size        estbtshared;
+   Size        estsort;
+   BTShared   *btshared;
+   Sharedsort *sharedsort;
+   Sharedsort *sharedsort2;
+   BTSpool    *btspool = buildstate->spool;
+   BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
+   bool        leaderparticipates = true;
+
+#ifdef DISABLE_LEADER_PARTICIPATION
+   leaderparticipates = false;
+#endif
+
+   /*
+    * Enter parallel mode, and create context for parallel build of btree
+    * index
+    */
+   EnterParallelMode();
+   Assert(request > 0);
+   pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
+                                request, true);
+   scantuplesortstates = leaderparticipates ? request + 1 : request;
+
+   /*
+    * Prepare for scan of the base relation.  In a normal index build, we use
+    * SnapshotAny because we must retrieve all tuples and do our own time
+    * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
+    * concurrent build, we take a regular MVCC snapshot and index whatever's
+    * live according to that.
+    */
+   if (!isconcurrent)
+       snapshot = SnapshotAny;
+   else
+       snapshot = RegisterSnapshot(GetTransactionSnapshot());
+
+   /*
+    * Estimate size for at least two keys -- our own
+    * PARALLEL_KEY_BTREE_SHARED workspace, and PARALLEL_KEY_TUPLESORT
+    * tuplesort workspace
+    */
+   estbtshared = _bt_parallel_estimate_shared(snapshot);
+   shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
+   estsort = tuplesort_estimate_shared(scantuplesortstates);
+   shm_toc_estimate_chunk(&pcxt->estimator, estsort);
+
+   /*
+    * Unique case requires a second spool, and so we may have to account for
+    * a third shared workspace -- PARALLEL_KEY_TUPLESORT_SPOOL2
+    */
+   if (!btspool->isunique)
+       shm_toc_estimate_keys(&pcxt->estimator, 2);
+   else
+   {
+       shm_toc_estimate_chunk(&pcxt->estimator, estsort);
+       shm_toc_estimate_keys(&pcxt->estimator, 3);
+   }
+
+   /* Everyone's had a chance to ask for space, so now create the DSM */
+   InitializeParallelDSM(pcxt);
+
+   /* Store shared build state, for which we reserved space */
+   btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
+   /* Initialize immutable state */
+   btshared->heaprelid = RelationGetRelid(btspool->heap);
+   btshared->indexrelid = RelationGetRelid(btspool->index);
+   btshared->isunique = btspool->isunique;
+   btshared->isconcurrent = isconcurrent;
+   btshared->scantuplesortstates = scantuplesortstates;
+   ConditionVariableInit(&btshared->workersdonecv);
+   SpinLockInit(&btshared->mutex);
+   /* Initialize mutable state */
+   btshared->nparticipantsdone = 0;
+   btshared->reltuples = 0.0;
+   btshared->havedead = false;
+   btshared->indtuples = 0.0;
+   btshared->brokenhotchain = false;
+   heap_parallelscan_initialize(&btshared->heapdesc, btspool->heap, snapshot);
+
+   /*
+    * Store shared tuplesort-private state, for which we reserved space.
+    * Then, initialize opaque state using tuplesort routine.
+    */
+   sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
+   tuplesort_initialize_shared(sharedsort, scantuplesortstates,
+                               pcxt->seg);
+
+   shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
+   shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
+
+   /* Unique case requires a second spool, and associated shared state */
+   if (!btspool->isunique)
+       sharedsort2 = NULL;
+   else
+   {
+       /*
+        * Store additional shared tuplesort-private state, for which we
+        * reserved space.  Then, initialize opaque state using tuplesort
+        * routine.
+        */
+       sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
+       tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
+                                   pcxt->seg);
+
+       shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
+   }
+
+   /* Launch workers, saving status for leader/caller */
+   LaunchParallelWorkers(pcxt);
+   btleader->pcxt = pcxt;
+   btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
+   if (leaderparticipates)
+       btleader->nparticipanttuplesorts++;
+   btleader->btshared = btshared;
+   btleader->sharedsort = sharedsort;
+   btleader->sharedsort2 = sharedsort2;
+   btleader->snapshot = snapshot;
+
+   /* If no workers were successfully launched, back out (do serial build) */
+   if (pcxt->nworkers_launched == 0)
+   {
+       _bt_end_parallel(btleader);
+       return;
+   }
+
+   /* Save leader state now that it's clear build will be parallel */
+   buildstate->btleader = btleader;
+
+   /* Join heap scan ourselves */
+   if (leaderparticipates)
+       _bt_leader_participate_as_worker(buildstate);
+
+   /*
+    * Caller needs to wait for all launched workers when we return.  Make
+    * sure that the failure-to-start case will not hang forever.
+    */
+   WaitForParallelWorkersToAttach(pcxt);
+}
+
+/*
+ * Shut down workers, destroy parallel context, and end parallel mode.
+ */
+static void
+_bt_end_parallel(BTLeader *btleader)
+{
+   /* Shutdown worker processes */
+   WaitForParallelWorkersToFinish(btleader->pcxt);
+   /* Free last reference to MVCC snapshot, if one was used */
+   if (IsMVCCSnapshot(btleader->snapshot))
+       UnregisterSnapshot(btleader->snapshot);
+   DestroyParallelContext(btleader->pcxt);
+   ExitParallelMode();
+}
+
+/*
+ * Returns size of shared memory required to store state for a parallel
+ * btree index build based on the snapshot its parallel scan will use.
+ */
+static Size
+_bt_parallel_estimate_shared(Snapshot snapshot)
+{
+   if (!IsMVCCSnapshot(snapshot))
+   {
+       Assert(snapshot == SnapshotAny);
+       return sizeof(BTShared);
+   }
+
+   return add_size(offsetof(BTShared, heapdesc) +
+                   offsetof(ParallelHeapScanDescData, phs_snapshot_data),
+                   EstimateSnapshotSpace(snapshot));
+}
+
+/*
+ * Within leader, wait for end of heap scan.
+ *
+ * When called, parallel heap scan started by _bt_begin_parallel() will
+ * already be underway within worker processes (when leader participates
+ * as a worker, we should end up here just as workers are finishing).
+ *
+ * Fills in fields needed for ambuild statistics, and lets caller set
+ * field indicating that some worker encountered a broken HOT chain.
+ *
+ * Returns the total number of heap tuples scanned.
+ */
+static double
+_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
+{
+   BTShared   *btshared = buildstate->btleader->btshared;
+   int         nparticipanttuplesorts;
+   double      reltuples;
+
+   nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
+   for (;;)
+   {
+       SpinLockAcquire(&btshared->mutex);
+       if (btshared->nparticipantsdone == nparticipanttuplesorts)
+       {
+           buildstate->havedead = btshared->havedead;
+           buildstate->indtuples = btshared->indtuples;
+           *brokenhotchain = btshared->brokenhotchain;
+           reltuples = btshared->reltuples;
+           SpinLockRelease(&btshared->mutex);
+           break;
+       }
+       SpinLockRelease(&btshared->mutex);
+
+       ConditionVariableSleep(&btshared->workersdonecv,
+                              WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
+   }
+
+   ConditionVariableCancelSleep();
+
+   return reltuples;
+}
+
+/*
+ * Within leader, participate as a parallel worker.
+ */
+static void
+_bt_leader_participate_as_worker(BTBuildState *buildstate)
+{
+   BTLeader   *btleader = buildstate->btleader;
+   BTSpool    *leaderworker;
+   BTSpool    *leaderworker2;
+   int         sortmem;
+
+   /* Allocate memory and initialize private spool */
+   leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
+   leaderworker->heap = buildstate->spool->heap;
+   leaderworker->index = buildstate->spool->index;
+   leaderworker->isunique = buildstate->spool->isunique;
+
+   /* Initialize second spool, if required */
+   if (!btleader->btshared->isunique)
+       leaderworker2 = NULL;
+   else
+   {
+       /* Allocate memory for worker's own private secondary spool */
+       leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
+
+       /* Initialize worker's own secondary spool */
+       leaderworker2->heap = leaderworker->heap;
+       leaderworker2->index = leaderworker->index;
+       leaderworker2->isunique = false;
+   }
+
+   /*
+    * Might as well use the reliable figure when doling out
+    * maintenance_work_mem (when fewer workers than requested were launched,
+    * this will be somewhat higher than it is for other workers).
+    */
+   sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
+
+   /* Perform work common to all participants */
+   _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
+                              btleader->sharedsort, btleader->sharedsort2,
+                              sortmem);
+
+#ifdef BTREE_BUILD_STATS
+   if (log_btree_build_stats)
+   {
+       ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
+       ResetUsage();
+   }
+#endif                         /* BTREE_BUILD_STATS */
+}
+
+/*
+ * Perform work within a launched parallel process.
+ */
+void
+_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
+{
+   BTSpool    *btspool;
+   BTSpool    *btspool2;
+   BTShared   *btshared;
+   Sharedsort *sharedsort;
+   Sharedsort *sharedsort2;
+   Relation    heapRel;
+   Relation    indexRel;
+   LOCKMODE    heapLockmode;
+   LOCKMODE    indexLockmode;
+   int         sortmem;
+
+#ifdef BTREE_BUILD_STATS
+   if (log_btree_build_stats)
+       ResetUsage();
+#endif                         /* BTREE_BUILD_STATS */
+
+   /* Look up shared state */
+   btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
+
+   /* Open relations using lock modes known to be obtained by index.c */
+   if (!btshared->isconcurrent)
+   {
+       heapLockmode = ShareLock;
+       indexLockmode = AccessExclusiveLock;
+   }
+   else
+   {
+       heapLockmode = ShareUpdateExclusiveLock;
+       indexLockmode = RowExclusiveLock;
+   }
+
+   /* Open relations within worker */
+   heapRel = heap_open(btshared->heaprelid, heapLockmode);
+   indexRel = index_open(btshared->indexrelid, indexLockmode);
+
+   /* Initialize worker's own spool */
+   btspool = (BTSpool *) palloc0(sizeof(BTSpool));
+   btspool->heap = heapRel;
+   btspool->index = indexRel;
+   btspool->isunique = btshared->isunique;
+
+   /* Look up shared state private to tuplesort.c */
+   sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
+   tuplesort_attach_shared(sharedsort, seg);
+   if (!btshared->isunique)
+   {
+       btspool2 = NULL;
+       sharedsort2 = NULL;
+   }
+   else
+   {
+       /* Allocate memory for worker's own private secondary spool */
+       btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
+
+       /* Initialize worker's own secondary spool */
+       btspool2->heap = btspool->heap;
+       btspool2->index = btspool->index;
+       btspool2->isunique = false;
+       /* Look up shared state private to tuplesort.c */
+       sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
+       tuplesort_attach_shared(sharedsort2, seg);
+   }
+
+   /* Perform sorting of spool, and possibly a spool2 */
+   sortmem = maintenance_work_mem / btshared->scantuplesortstates;
+   _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
+                              sharedsort2, sortmem);
+
+#ifdef BTREE_BUILD_STATS
+   if (log_btree_build_stats)
+   {
+       ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
+       ResetUsage();
+   }
+#endif                         /* BTREE_BUILD_STATS */
+
+   index_close(indexRel, indexLockmode);
+   heap_close(heapRel, heapLockmode);
+}
+
+/*
+ * Perform a worker's portion of a parallel sort.
+ *
+ * This generates a tuplesort for passed btspool, and a second tuplesort
+ * state if a second btspool is needed (i.e. for unique index builds).  All
+ * other spool fields should already be set when this is called.
+ *
+ * sortmem is the amount of working memory to use within each worker,
+ * expressed in KBs.
+ *
+ * When this returns, workers are done, and need only release resources.
+ */
+static void
+_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
+                          BTShared *btshared, Sharedsort *sharedsort,
+                          Sharedsort *sharedsort2, int sortmem)
+{
+   SortCoordinate coordinate;
+   BTBuildState buildstate;
+   HeapScanDesc scan;
+   double      reltuples;
+   IndexInfo  *indexInfo;
+
+   /* Initialize local tuplesort coordination state */
+   coordinate = palloc0(sizeof(SortCoordinateData));
+   coordinate->isWorker = true;
+   coordinate->nParticipants = -1;
+   coordinate->sharedsort = sharedsort;
+
+   /* Begin "partial" tuplesort */
+   btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
+                                                    btspool->index,
+                                                    btspool->isunique,
+                                                    sortmem, coordinate,
+                                                    false);
+
+   /*
+    * Just as with serial case, there may be a second spool.  If so, a
+    * second, dedicated spool2 partial tuplesort is required.
+    */
+   if (btspool2)
+   {
+       SortCoordinate coordinate2;
+
+       /*
+        * We expect that the second one (for dead tuples) won't get very
+        * full, so we give it only work_mem (unless sortmem is less for
+        * worker).  Worker processes are generally permitted to allocate
+        * work_mem independently.
+        */
+       coordinate2 = palloc0(sizeof(SortCoordinateData));
+       coordinate2->isWorker = true;
+       coordinate2->nParticipants = -1;
+       coordinate2->sharedsort = sharedsort2;
+       btspool2->sortstate =
+           tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
+                                       Min(sortmem, work_mem), coordinate2,
+                                       false);
+   }
+
+   /* Fill in buildstate for _bt_build_callback() */
+   buildstate.isunique = btshared->isunique;
+   buildstate.havedead = false;
+   buildstate.heap = btspool->heap;
+   buildstate.spool = btspool;
+   buildstate.spool2 = btspool2;
+   buildstate.indtuples = 0;
+   buildstate.btleader = NULL;
+
+   /* Join parallel scan */
+   indexInfo = BuildIndexInfo(btspool->index);
+   indexInfo->ii_Concurrent = btshared->isconcurrent;
+   scan = heap_beginscan_parallel(btspool->heap, &btshared->heapdesc);
+   reltuples = IndexBuildHeapScan(btspool->heap, btspool->index, indexInfo,
+                                  true, _bt_build_callback,
+                                  (void *) &buildstate, scan);
+
+   /*
+    * Execute this worker's part of the sort.
+    *
+    * Unlike leader and serial cases, we cannot avoid calling
+    * tuplesort_performsort() for spool2 if it ends up containing no dead
+    * tuples (this is disallowed for workers by tuplesort).
+    */
+   tuplesort_performsort(btspool->sortstate);
+   if (btspool2)
+       tuplesort_performsort(btspool2->sortstate);
+
+   /*
+    * Done.  Record ambuild statistics, and whether we encountered a broken
+    * HOT chain.
+    */
+   SpinLockAcquire(&btshared->mutex);
+   btshared->nparticipantsdone++;
+   btshared->reltuples += reltuples;
+   if (buildstate.havedead)
+       btshared->havedead = true;
+   btshared->indtuples += buildstate.indtuples;
+   if (indexInfo->ii_BrokenHotChain)
+       btshared->brokenhotchain = true;
+   SpinLockRelease(&btshared->mutex);
+
+   /* Notify leader */
+   ConditionVariableSignal(&btshared->workersdonecv);
+
+   /* We can end tuplesorts immediately */
+   tuplesort_end(btspool->sortstate);
+   if (btspool2)
+       tuplesort_end(btspool2->sortstate);
+}
 
                                              ALLOCSET_DEFAULT_SIZES);
 
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                  spgistBuildCallback, (void *) &buildstate);
+                                  spgistBuildCallback, (void *) &buildstate,
+                                  NULL);
 
    MemoryContextDelete(buildstate.tmpCtx);
 
 
 
 #include "postgres.h"
 
+#include "access/nbtree.h"
 #include "access/parallel.h"
 #include "access/session.h"
 #include "access/xact.h"
 {
    {
        "ParallelQueryMain", ParallelQueryMain
+   },
+   {
+       "_bt_parallel_build_main", _bt_parallel_build_main
    }
 };
 
  */
 ParallelContext *
 CreateParallelContext(const char *library_name, const char *function_name,
-                     int nworkers)
+                     int nworkers, bool serializable_okay)
 {
    MemoryContext oldcontext;
    ParallelContext *pcxt;
    /*
     * If we are running under serializable isolation, we can't use parallel
     * workers, at least not until somebody enhances that mechanism to be
-    * parallel-aware.
+    * parallel-aware.  Utility statement callers may ask us to ignore this
+    * restriction because they're always able to safely ignore the fact that
+    * SIREAD locks do not work with parallelism.
     */
-   if (IsolationIsSerializable())
+   if (IsolationIsSerializable() && !serializable_okay)
        nworkers = 0;
 
    /* We might be running in a short-lived memory context. */
 
        heap = heap_open(ILHead->il_heap, NoLock);
        ind = index_open(ILHead->il_ind, NoLock);
 
-       index_build(heap, ind, ILHead->il_info, false, false);
+       index_build(heap, ind, ILHead->il_info, false, false, false);
 
        index_close(ind, NoLock);
        heap_close(heap, NoLock);
 
 
        /* Initialize the index and rebuild */
        /* Note: we do not need to re-establish pkey setting */
-       index_build(heapRelation, currentIndex, indexInfo, false, true);
+       index_build(heapRelation, currentIndex, indexInfo, false, true, false);
 
        /* We're done with this index */
        index_close(currentIndex, NoLock);
 
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
+#include "optimizer/planner.h"
 #include "parser/parser.h"
 #include "rewrite/rewriteManip.h"
 #include "storage/bufmgr.h"
    Assert(indexRelationId == RelationGetRelid(indexRelation));
 
    /*
-    * Obtain exclusive lock on it.  Although no other backends can see it
+    * Obtain exclusive lock on it.  Although no other transactions can see it
     * until we commit, this prevents deadlock-risk complaints from lock
     * manager in cases such as CLUSTER.
     */
    }
    else
    {
-       index_build(heapRelation, indexRelation, indexInfo, isprimary, false);
+       index_build(heapRelation, indexRelation, indexInfo, isprimary, false,
+                   true);
    }
 
    /*
    /* initialize index-build state to default */
    ii->ii_Concurrent = false;
    ii->ii_BrokenHotChain = false;
+   ii->ii_ParallelWorkers = 0;
 
    /* set up for possible use by index AM */
    ii->ii_Am = index->rd_rel->relam;
  *
  * isprimary tells whether to mark the index as a primary-key index.
  * isreindex indicates we are recreating a previously-existing index.
+ * parallel indicates if parallelism may be useful.
  *
  * Note: when reindexing an existing index, isprimary can be false even if
  * the index is a PK; it's already properly marked and need not be re-marked.
            Relation indexRelation,
            IndexInfo *indexInfo,
            bool isprimary,
-           bool isreindex)
+           bool isreindex,
+           bool parallel)
 {
    IndexBuildResult *stats;
    Oid         save_userid;
    Assert(PointerIsValid(indexRelation->rd_amroutine->ambuild));
    Assert(PointerIsValid(indexRelation->rd_amroutine->ambuildempty));
 
-   ereport(DEBUG1,
-           (errmsg("building index \"%s\" on table \"%s\"",
-                   RelationGetRelationName(indexRelation),
-                   RelationGetRelationName(heapRelation))));
+   /*
+    * Determine worker process details for parallel CREATE INDEX.  Currently,
+    * only btree has support for parallel builds.
+    *
+    * Note that planner considers parallel safety for us.
+    */
+   if (parallel && IsNormalProcessingMode() &&
+       indexRelation->rd_rel->relam == BTREE_AM_OID)
+       indexInfo->ii_ParallelWorkers =
+           plan_create_index_workers(RelationGetRelid(heapRelation),
+                                     RelationGetRelid(indexRelation));
+
+   if (indexInfo->ii_ParallelWorkers == 0)
+       ereport(DEBUG1,
+               (errmsg("building index \"%s\" on table \"%s\" serially",
+                       RelationGetRelationName(indexRelation),
+                       RelationGetRelationName(heapRelation))));
+   else
+       ereport(DEBUG1,
+               (errmsg_plural("building index \"%s\" on table \"%s\" with request for %d parallel worker",
+                              "building index \"%s\" on table \"%s\" with request for %d parallel workers",
+                              indexInfo->ii_ParallelWorkers,
+                              RelationGetRelationName(indexRelation),
+                              RelationGetRelationName(heapRelation),
+                              indexInfo->ii_ParallelWorkers)));
 
    /*
     * Switch to the table owner's userid, so that any index functions are run
                   IndexInfo *indexInfo,
                   bool allow_sync,
                   IndexBuildCallback callback,
-                  void *callback_state)
+                  void *callback_state,
+                  HeapScanDesc scan)
 {
    return IndexBuildHeapRangeScan(heapRelation, indexRelation,
                                   indexInfo, allow_sync,
                                   false,
                                   0, InvalidBlockNumber,
-                                  callback, callback_state);
+                                  callback, callback_state, scan);
 }
 
 /*
                        BlockNumber start_blockno,
                        BlockNumber numblocks,
                        IndexBuildCallback callback,
-                       void *callback_state)
+                       void *callback_state,
+                       HeapScanDesc scan)
 {
    bool        is_system_catalog;
    bool        checking_uniqueness;
-   HeapScanDesc scan;
    HeapTuple   heapTuple;
    Datum       values[INDEX_MAX_KEYS];
    bool        isnull[INDEX_MAX_KEYS];
    EState     *estate;
    ExprContext *econtext;
    Snapshot    snapshot;
+   bool        need_unregister_snapshot = false;
    TransactionId OldestXmin;
    BlockNumber root_blkno = InvalidBlockNumber;
    OffsetNumber root_offsets[MaxHeapTuplesPerPage];
     * concurrent build, or during bootstrap, we take a regular MVCC snapshot
     * and index whatever's live according to that.
     */
-   if (IsBootstrapProcessingMode() || indexInfo->ii_Concurrent)
-   {
-       snapshot = RegisterSnapshot(GetTransactionSnapshot());
-       OldestXmin = InvalidTransactionId;  /* not used */
+   OldestXmin = InvalidTransactionId;
+
+   /* okay to ignore lazy VACUUMs here */
+   if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
+       OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM);
 
-       /* "any visible" mode is not compatible with this */
-       Assert(!anyvisible);
+   if (!scan)
+   {
+       /*
+        * Serial index build.
+        *
+        * Must begin our own heap scan in this case.  We may also need to
+        * register a snapshot whose lifetime is under our direct control.
+        */
+       if (!TransactionIdIsValid(OldestXmin))
+       {
+           snapshot = RegisterSnapshot(GetTransactionSnapshot());
+           need_unregister_snapshot = true;
+       }
+       else
+           snapshot = SnapshotAny;
+
+       scan = heap_beginscan_strat(heapRelation,   /* relation */
+                                   snapshot,   /* snapshot */
+                                   0,  /* number of keys */
+                                   NULL,   /* scan key */
+                                   true,   /* buffer access strategy OK */
+                                   allow_sync);    /* syncscan OK? */
    }
    else
    {
-       snapshot = SnapshotAny;
-       /* okay to ignore lazy VACUUMs here */
-       OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM);
+       /*
+        * Parallel index build.
+        *
+        * Parallel case never registers/unregisters own snapshot.  Snapshot
+        * is taken from parallel heap scan, and is SnapshotAny or an MVCC
+        * snapshot, based on same criteria as serial case.
+        */
+       Assert(!IsBootstrapProcessingMode());
+       Assert(allow_sync);
+       snapshot = scan->rs_snapshot;
    }
 
-   scan = heap_beginscan_strat(heapRelation,   /* relation */
-                               snapshot,   /* snapshot */
-                               0,  /* number of keys */
-                               NULL,   /* scan key */
-                               true,   /* buffer access strategy OK */
-                               allow_sync);    /* syncscan OK? */
+   /*
+    * Must call GetOldestXmin() with SnapshotAny.  Should never call
+    * GetOldestXmin() with MVCC snapshot. (It's especially worth checking
+    * this for parallel builds, since ambuild routines that support parallel
+    * builds must work these details out for themselves.)
+    */
+   Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
+   Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
+          !TransactionIdIsValid(OldestXmin));
+   Assert(snapshot == SnapshotAny || !anyvisible);
 
    /* set our scan endpoints */
    if (!allow_sync)
 
    heap_endscan(scan);
 
-   /* we can now forget our snapshot, if set */
-   if (IsBootstrapProcessingMode() || indexInfo->ii_Concurrent)
+   /* we can now forget our snapshot, if set and registered by us */
+   if (need_unregister_snapshot)
        UnregisterSnapshot(snapshot);
 
    ExecDropSingleTupleTableSlot(slot);
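
The new scan argument lets a parallel-aware ambuild routine start its own heap scan (typically a parallel scan shared among participants) and hand it in, instead of having IndexBuildHeapRangeScan begin a serial scan itself. A rough sketch of how a participant might do that, assuming the existing heap_parallelscan_initialize()/heap_beginscan_parallel() APIs; "pscan", "mycallback" and "mystate" are illustrative names, not part of the patch:

    /* Sketch only.  pscan is a ParallelHeapScanDesc living in shared memory. */
    HeapScanDesc scan;
    double       reltuples;

    /* Leader, before launching workers (non-concurrent build uses SnapshotAny): */
    heap_parallelscan_initialize(pscan, heapRelation, SnapshotAny);

    /* Each participant (leader included): */
    scan = heap_beginscan_parallel(heapRelation, pscan);
    reltuples = IndexBuildHeapScan(heapRelation, indexRelation, indexInfo,
                                   true,            /* allow_sync */
                                   mycallback, (void *) mystate,
                                   scan);           /* caller-supplied scan */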
    state.tuplesort = tuplesort_begin_datum(INT8OID, Int8LessOperator,
                                            InvalidOid, false,
                                            maintenance_work_mem,
-                                           false);
+                                           NULL, false);
    state.htups = state.itups = state.tups_inserted = 0;
 
    (void) index_bulk_delete(&ivinfo, NULL,
 
        /* Initialize the index and rebuild */
        /* Note: we do not need to re-establish pkey setting */
-       index_build(heapRelation, iRel, indexInfo, false, true);
+       index_build(heapRelation, iRel, indexInfo, false, true, true);
    }
    PG_CATCH();
    {
 static void
 ResetReindexProcessing(void)
 {
-   if (IsInParallelMode())
-       elog(ERROR, "cannot modify reindex state during a parallel operation");
+   /* This may be called in leader error path */
    currentlyReindexedHeap = InvalidOid;
    currentlyReindexedIndex = InvalidOid;
 }
 
    indexInfo->ii_ReadyForInserts = true;
    indexInfo->ii_Concurrent = false;
    indexInfo->ii_BrokenHotChain = false;
+   indexInfo->ii_ParallelWorkers = 0;
    indexInfo->ii_Am = BTREE_AM_OID;
    indexInfo->ii_AmCache = NULL;
    indexInfo->ii_Context = CurrentMemoryContext;
 
    /* Set up sorting if wanted */
    if (use_sort)
        tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
-                                           maintenance_work_mem, false);
+                                           maintenance_work_mem,
+                                           NULL, false);
    else
        tuplesort = NULL;
 
 
     * this will typically require the caller to have already locked the
     * relation.  To avoid lock upgrade hazards, that lock should be at least
     * as strong as the one we take here.
+    *
+    * NB: If the lock strength here ever changes, code that is run by
+    * parallel workers under the control of certain particular ambuild
+    * functions will need to be updated, too.
     */
    lockmode = stmt->concurrent ? ShareUpdateExclusiveLock : ShareLock;
    rel = heap_open(relationId, lockmode);
    indexInfo->ii_ReadyForInserts = !stmt->concurrent;
    indexInfo->ii_Concurrent = stmt->concurrent;
    indexInfo->ii_BrokenHotChain = false;
+   indexInfo->ii_ParallelWorkers = 0;
    indexInfo->ii_Am = accessMethodId;
    indexInfo->ii_AmCache = NULL;
    indexInfo->ii_Context = CurrentMemoryContext;
    indexInfo->ii_BrokenHotChain = false;
 
    /* Now build the index */
-   index_build(rel, indexRelation, indexInfo, stmt->primary, false);
+   index_build(rel, indexRelation, indexInfo, stmt->primary, false, true);
 
    /* Close both the relations, but keep the locks */
    heap_close(rel, NoLock);
 
    pstmt_data = ExecSerializePlan(planstate->plan, estate);
 
    /* Create a parallel context. */
-   pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers);
+   pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers, false);
    pei->pcxt = pcxt;
 
    /*
 
                                                  sortnode->collations,
                                                  sortnode->nullsFirst,
                                                  work_mem,
-                                                 false);
+                                                 NULL, false);
    }
 
    aggstate->current_phase = newphase;
                                      pertrans->sortOperators[0],
                                      pertrans->sortCollations[0],
                                      pertrans->sortNullsFirst[0],
-                                     work_mem, false);
+                                     work_mem, NULL, false);
        }
        else
            pertrans->sortstates[aggstate->current_set] =
                                     pertrans->sortOperators,
                                     pertrans->sortCollations,
                                     pertrans->sortNullsFirst,
-                                    work_mem, false);
+                                    work_mem, NULL, false);
    }
 
    /*
 
                                              plannode->collations,
                                              plannode->nullsFirst,
                                              work_mem,
-                                             node->randomAccess);
+                                             NULL, node->randomAccess);
        if (node->bounded)
            tuplesort_set_bound(tuplesortstate, node->bound);
        node->tuplesortstate = (void *) tuplesortstate;
 
 {
    int         parallel_workers;
 
-   parallel_workers = compute_parallel_worker(rel, rel->pages, -1);
+   parallel_workers = compute_parallel_worker(rel, rel->pages, -1,
+                                              max_parallel_workers_per_gather);
 
    /* If any limit was set to zero, the user doesn't want a parallel scan. */
    if (parallel_workers <= 0)
    pages_fetched = compute_bitmap_pages(root, rel, bitmapqual, 1.0,
                                         NULL, NULL);
 
-   parallel_workers = compute_parallel_worker(rel, pages_fetched, -1);
+   parallel_workers = compute_parallel_worker(rel, pages_fetched, -1,
+                                              max_parallel_workers_per_gather);
 
    if (parallel_workers <= 0)
        return;
  *
  * "index_pages" is the number of pages from the index that we expect to scan, or
  * -1 if we don't expect to scan any.
+ *
+ * "max_workers" is caller's limit on the number of workers.  This typically
+ * comes from a GUC.
  */
 int
-compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages)
+compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages,
+                       int max_workers)
 {
    int         parallel_workers = 0;
 
        }
    }
 
-   /*
-    * In no case use more than max_parallel_workers_per_gather workers.
-    */
-   parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather);
+   /* In no case use more than caller supplied maximum number of workers */
+   parallel_workers = Min(parallel_workers, max_workers);
 
    return parallel_workers;
 }
 
         * order.
         */
        path->path.parallel_workers = compute_parallel_worker(baserel,
-                                                             rand_heap_pages, index_pages);
+                                                             rand_heap_pages,
+                                                             index_pages,
+                                                             max_parallel_workers_per_gather);
 
        /*
         * Fall out if workers can't be assigned for parallel scan, because in
 
    return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
 }
 
+/*
+ * plan_create_index_workers
+ *     Use the planner to decide how many parallel worker processes
+ *     CREATE INDEX should request for use
+ *
+ * tableOid is the table on which the index is to be built.  indexOid is the
+ * OID of an index to be created or reindexed (which must be a btree index).
+ *
+ * Return value is the number of parallel worker processes to request.  A
+ * return value of 0 means a parallel build should not be attempted (it may
+ * not even be safe, e.g. with non-parallel-safe index expressions).  Note
+ * that this does not include the leader participating as a worker (the
+ * value is always a number of parallel worker processes).
+ *
+ * Note: caller had better already hold some type of lock on the table and
+ * index.
+ */
+int
+plan_create_index_workers(Oid tableOid, Oid indexOid)
+{
+   PlannerInfo *root;
+   Query      *query;
+   PlannerGlobal *glob;
+   RangeTblEntry *rte;
+   Relation    heap;
+   Relation    index;
+   RelOptInfo *rel;
+   int         parallel_workers;
+   BlockNumber heap_blocks;
+   double      reltuples;
+   double      allvisfrac;
+
+   /* Return immediately when parallelism disabled */
+   if (max_parallel_maintenance_workers == 0)
+       return 0;
+
+   /* Set up largely-dummy planner state */
+   query = makeNode(Query);
+   query->commandType = CMD_SELECT;
+
+   glob = makeNode(PlannerGlobal);
+
+   root = makeNode(PlannerInfo);
+   root->parse = query;
+   root->glob = glob;
+   root->query_level = 1;
+   root->planner_cxt = CurrentMemoryContext;
+   root->wt_param_id = -1;
+
+   /*
+    * Build a minimal RTE.
+    *
+    * Set the target's table to be an inheritance parent.  This is a kludge
+    * that prevents problems within get_relation_info(), which does not
+    * expect that any IndexOptInfo is currently undergoing REINDEX.
+    */
+   rte = makeNode(RangeTblEntry);
+   rte->rtekind = RTE_RELATION;
+   rte->relid = tableOid;
+   rte->relkind = RELKIND_RELATION;    /* Don't be too picky. */
+   rte->lateral = false;
+   rte->inh = true;
+   rte->inFromCl = true;
+   query->rtable = list_make1(rte);
+
+   /* Set up RTE/RelOptInfo arrays */
+   setup_simple_rel_arrays(root);
+
+   /* Build RelOptInfo */
+   rel = build_simple_rel(root, 1, NULL);
+
+   heap = heap_open(tableOid, NoLock);
+   index = index_open(indexOid, NoLock);
+
+   /*
+    * Determine if it's safe to proceed.
+    *
+    * Currently, parallel workers can't access the leader's temporary tables.
+    * Furthermore, any index predicate or index expressions must be parallel
+    * safe.
+    */
+   if (heap->rd_rel->relpersistence == RELPERSISTENCE_TEMP ||
+       !is_parallel_safe(root, (Node *) RelationGetIndexExpressions(index)) ||
+       !is_parallel_safe(root, (Node *) RelationGetIndexPredicate(index)))
+   {
+       parallel_workers = 0;
+       goto done;
+   }
+
+   /*
+    * If parallel_workers storage parameter is set for the table, accept that
+    * as the number of parallel worker processes to launch (though still cap
+    * at max_parallel_maintenance_workers).  Note that we deliberately do not
+    * consider any other factor (e.g., memory use by workers) when
+    * parallel_workers is set.
+    */
+   if (rel->rel_parallel_workers != -1)
+   {
+       parallel_workers = Min(rel->rel_parallel_workers,
+                              max_parallel_maintenance_workers);
+       goto done;
+   }
+
+   /*
+    * Estimate heap relation size ourselves, since rel->pages cannot be
+    * trusted (heap RTE was marked as inheritance parent)
+    */
+   estimate_rel_size(heap, NULL, &heap_blocks, &reltuples, &allvisfrac);
+
+   /*
+    * Determine number of workers to scan the heap relation using generic
+    * model
+    */
+   parallel_workers = compute_parallel_worker(rel, heap_blocks, -1,
+                                              max_parallel_maintenance_workers);
+
+   /*
+    * Cap workers based on available maintenance_work_mem as needed.
+    *
+    * Note that each tuplesort participant receives an even share of the
+    * total maintenance_work_mem budget.  Aim to leave participants
+    * (including the leader as a participant) with no less than 32MB of
+    * memory.  This leaves cases where maintenance_work_mem is set to 64MB
+    * immediately past the threshold of being capable of launching a single
+    * parallel worker to sort.
+    */
+   while (parallel_workers > 0 &&
+          maintenance_work_mem / (parallel_workers + 1) < 32768L)
+       parallel_workers--;
+
+done:
+   index_close(index, NoLock);
+   heap_close(heap, NoLock);
+
+   return parallel_workers;
+}
+
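
To make the maintenance_work_mem cap above concrete: the GUC is measured in kilobytes, so 32768L is 32MB. With maintenance_work_mem = 65536 (64MB) and two workers initially proposed, 65536 / 3 = 21845KB falls below 32MB, so the request drops to one worker; 65536 / 2 = 32768KB then satisfies the threshold. A standalone illustration of the same loop (not PostgreSQL code):

    #include <stdio.h>

    int
    main(void)
    {
        long    maintenance_work_mem = 65536;   /* KB, i.e. 64MB */
        int     parallel_workers = 2;           /* from compute_parallel_worker() */

        /* Same capping loop as in plan_create_index_workers() */
        while (parallel_workers > 0 &&
               maintenance_work_mem / (parallel_workers + 1) < 32768L)
            parallel_workers--;

        printf("workers requested: %d\n", parallel_workers);    /* prints 1 */
        return 0;
    }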
 /*
  * get_partitioned_child_rels
  *     Returns a list of the RT indexes of the partitioned child relations
 
        case WAIT_EVENT_PARALLEL_BITMAP_SCAN:
            event_name = "ParallelBitmapScan";
            break;
+       case WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN:
+           event_name = "ParallelCreateIndexScan";
+           break;
        case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
            event_name = "ProcArrayGroupUpdate";
            break;
 
  * Open a file that was previously created in another backend (or this one)
  * with BufFileCreateShared in the same SharedFileSet using the same name.
  * The backend that created the file must have called BufFileClose() or
- * BufFileExport() to make sure that it is ready to be opened by other
+ * BufFileExportShared() to make sure that it is ready to be opened by other
  * backends and render it read-only.
  */
 BufFile *
 }
 
 #endif
+
+/*
+ * Return the current file size.  Counts any holes left behind by
+ * BufFileViewAppend as part of the size.
+ */
+off_t
+BufFileSize(BufFile *file)
+{
+   return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) +
+       FileGetSize(file->files[file->numFiles - 1]);
+}
+
+/*
+ * Append the contents of source file (managed within shared fileset) to
+ * end of target file (managed within same shared fileset).
+ *
+ * Note that operation subsumes ownership of underlying resources from
+ * "source".  Caller should never call BufFileClose against source having
+ * called here first.  Resource owners for source and target must match,
+ * too.
+ *
+ * This operation works by manipulating lists of segment files, so the
+ * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
+ * boundary, typically creating empty holes before the boundary.  These
+ * areas do not contain any interesting data, and cannot be read from by
+ * caller.
+ *
+ * Returns the block number within target where the contents of source
+ * begins.  Caller should apply this as an offset when working off block
+ * positions that are in terms of the original BufFile space.
+ */
+long
+BufFileAppend(BufFile *target, BufFile *source)
+{
+   long        startBlock = target->numFiles * BUFFILE_SEG_SIZE;
+   int         newNumFiles = target->numFiles + source->numFiles;
+   int         i;
+
+   Assert(target->fileset != NULL);
+   Assert(source->readOnly);
+   Assert(!source->dirty);
+   Assert(source->fileset != NULL);
+
+   if (target->resowner != source->resowner)
+       elog(ERROR, "could not append BufFile with non-matching resource owner");
+
+   target->files = (File *)
+       repalloc(target->files, sizeof(File) * newNumFiles);
+   target->offsets = (off_t *)
+       repalloc(target->offsets, sizeof(off_t) * newNumFiles);
+   for (i = target->numFiles; i < newNumFiles; i++)
+   {
+       target->files[i] = source->files[i - target->numFiles];
+       target->offsets[i] = 0L;
+   }
+   target->numFiles = newNumFiles;
+
+   return startBlock;
+}
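
A hedged sketch of how a leader might use the two new buffile.c entry points together: BufFileSize() to learn how much a worker wrote, and BufFileAppend() to glue the worker's file onto its own, remembering the returned block offset for later reads. Variable names here are illustrative only.

    /* Illustrative only.  "target" and "source" are BufFiles in the same
     * SharedFileSet; source has already been exported read-only. */
    off_t   worker_bytes;
    long    offset_blocks;

    worker_bytes = BufFileSize(source);
    offset_blocks = BufFileAppend(target, source);

    /*
     * Block N of the worker's original file is now block offset_blocks + N
     * within "target"; block numbers recorded by the worker must be adjusted
     * by offset_blocks when reading them back.
     */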
 
    return VfdCache[file].fileMode;
 }
 
+/*
+ * FileGetSize - returns the size of file
+ */
+off_t
+FileGetSize(File file)
+{
+   Assert(FileIsValid(file));
+   return VfdCache[file].fileSize;
+}
+
 /*
  * Make room for another allocatedDescs[] array entry if needed and possible.
  * Returns true if an array element is available.
 
                                                   qstate->sortCollations,
                                                   qstate->sortNullsFirsts,
                                                   work_mem,
+                                                  NULL,
                                                   qstate->rescan_needed);
    else
        osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
                                                    qstate->sortCollation,
                                                    qstate->sortNullsFirst,
                                                    work_mem,
+                                                   NULL,
                                                    qstate->rescan_needed);
 
    osastate->number_of_rows = 0;
 
 bool       allowSystemTableMods = false;
 int            work_mem = 1024;
 int            maintenance_work_mem = 16384;
+int            max_parallel_maintenance_workers = 2;
 
 /*
  * Primary determinants of sizes of shared-memory structures.
 
        check_autovacuum_max_workers, NULL, NULL
    },
 
+   {
+       {"max_parallel_maintenance_workers", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
+           gettext_noop("Sets the maximum number of parallel processes per maintenance operation."),
+           NULL
+       },
+       &max_parallel_maintenance_workers,
+       2, 0, 1024,
+       NULL, NULL, NULL
+   },
+
    {
        {"max_parallel_workers_per_gather", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
            gettext_noop("Sets the maximum number of parallel processes per executor node."),
 
 
 #effective_io_concurrency = 1      # 1-1000; 0 disables prefetching
 #max_worker_processes = 8      # (change requires restart)
+#max_parallel_maintenance_workers = 2  # taken from max_parallel_workers
 #max_parallel_workers_per_gather = 2   # taken from max_parallel_workers
 #parallel_leader_participation = on
 #max_parallel_workers = 8      # maximum number of max_worker_processes that
-                   # can be used in parallel queries
+                   # can be used in parallel operations
 #old_snapshot_threshold = -1       # 1min-60d; -1 disables; 0 is immediate
                    # (change requires restart)
 #backend_flush_after = 0       # measured in pages, 0 disables
 
    probe query__done(const char *);
    probe statement__status(const char *);
 
-   probe sort__start(int, bool, int, int, bool);
+   probe sort__start(int, bool, int, int, bool, int);
    probe sort__done(bool, long);
 
    probe buffer__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool);
 
  * care that all calls for a single LogicalTapeSet are made in the same
  * palloc context.
  *
+ * To support parallel sort operations involving coordinated callers to
+ * tuplesort.c routines across multiple workers, it is necessary to
+ * concatenate each worker BufFile/tapeset into one single logical tapeset
+ * managed by the leader.  Workers should have produced one final
+ * materialized tape (their entire output) by the time this happens in the
+ * leader.
+ * There will always be the same number of runs as input tapes, and the same
+ * number of input tapes as participants (worker Tuplesortstates).
+ *
  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
 #include "postgres.h"
 
 #include "storage/buffile.h"
+#include "utils/builtins.h"
 #include "utils/logtape.h"
 #include "utils/memutils.h"
 
     * a frozen tape.  (When reading from an unfrozen tape, we use a larger
     * read buffer that holds multiple blocks, so the "current" block is
     * ambiguous.)
+    *
+    * When concatenation of worker tape BufFiles is performed, an offset to
+    * the first block in the unified BufFile space is applied during reads.
     */
    long        firstBlockNumber;
    long        curBlockNumber;
    long        nextBlockNumber;
+   long        offsetBlockNumber;
 
    /*
     * Buffer for current data block(s).
     */
    char       *buffer;         /* physical buffer (separately palloc'd) */
    int         buffer_size;    /* allocated size of the buffer */
+   int         max_size;       /* highest useful, safe buffer_size */
    int         pos;            /* next read/write position in buffer */
    int         nbytes;         /* total # of valid bytes in buffer */
 } LogicalTape;
     * by ltsGetFreeBlock(), and it is always greater than or equal to
     * nBlocksWritten.  Blocks between nBlocksAllocated and nBlocksWritten are
     * blocks that have been allocated for a tape, but have not been written
-    * to the underlying file yet.
+    * to the underlying file yet.  nHoleBlocks tracks the total number of
+    * blocks that are in unused holes between worker spaces following BufFile
+    * concatenation.
     */
    long        nBlocksAllocated;   /* # of blocks allocated */
    long        nBlocksWritten; /* # of blocks used in underlying file */
+   long        nHoleBlocks;    /* # of "hole" blocks left */
 
    /*
     * We store the numbers of recycled-and-available blocks in freeBlocks[].
 static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static long ltsGetFreeBlock(LogicalTapeSet *lts);
 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
+static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
+                    SharedFileSet *fileset);
 
 
 /*
     * previous tape isn't flushed to disk until the end of the sort, so you
     * get one-block hole, where the last block of the previous tape will
     * later go.
+    *
+    * Note that BufFile concatenation can leave "holes" in BufFile between
+    * worker-owned block ranges.  These are tracked for reporting purposes
+    * only.  We never read from nor write to these hole blocks, and so they
+    * are not considered here.
     */
    while (blocknum > lts->nBlocksWritten)
    {
    do
    {
        char       *thisbuf = lt->buffer + lt->nbytes;
+       long        datablocknum = lt->nextBlockNumber;
 
        /* Fetch next block number */
-       if (lt->nextBlockNumber == -1L)
+       if (datablocknum == -1L)
            break;              /* EOF */
+       /* Apply worker offset, needed for leader tapesets */
+       datablocknum += lt->offsetBlockNumber;
 
        /* Read the block */
-       ltsReadBlock(lts, lt->nextBlockNumber, (void *) thisbuf);
+       ltsReadBlock(lts, datablocknum, (void *) thisbuf);
        if (!lt->frozen)
-           ltsReleaseBlock(lts, lt->nextBlockNumber);
+           ltsReleaseBlock(lts, datablocknum);
        lt->curBlockNumber = lt->nextBlockNumber;
 
        lt->nbytes += TapeBlockGetNBytes(thisbuf);
        lts->blocksSorted = false;
 }
 
+/*
+ * Claim ownership of a set of logical tapes from existing shared BufFiles.
+ *
+ * Caller should be leader process.  Though tapes are marked as frozen in
+ * workers, they are not frozen when opened within leader, since unfrozen tapes
+ * use a larger read buffer. (Frozen tapes have smaller read buffer, optimized
+ * for random access.)
+ */
+static void
+ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
+                    SharedFileSet *fileset)
+{
+   LogicalTape *lt = NULL;
+   long        tapeblocks;
+   long        nphysicalblocks = 0L;
+   int         i;
+
+   /* Should have at least one worker tape, plus leader's tape */
+   Assert(lts->nTapes >= 2);
+
+   /*
+    * Build concatenated view of all BufFiles, remembering the block number
+    * where each source file begins.  No changes are needed for leader/last
+    * tape.
+    */
+   for (i = 0; i < lts->nTapes - 1; i++)
+   {
+       char        filename[MAXPGPATH];
+       BufFile    *file;
+
+       lt = &lts->tapes[i];
+
+       pg_itoa(i, filename);
+       file = BufFileOpenShared(fileset, filename);
+
+       /*
+        * Stash first BufFile, and concatenate subsequent BufFiles to that.
+        * Store block offset into each tape as we go.
+        */
+       lt->firstBlockNumber = shared[i].firstblocknumber;
+       if (i == 0)
+       {
+           lts->pfile = file;
+           lt->offsetBlockNumber = 0L;
+       }
+       else
+       {
+           lt->offsetBlockNumber = BufFileAppend(lts->pfile, file);
+       }
+       /* Don't allocate more for read buffer than could possibly help */
+       lt->max_size = Min(MaxAllocSize, shared[i].buffilesize);
+       tapeblocks = shared[i].buffilesize / BLCKSZ;
+       nphysicalblocks += tapeblocks;
+   }
+
+   /*
+    * Set # of allocated blocks, as well as # blocks written.  Use extent of
+    * new BufFile space (from 0 to end of last worker's tape space) for this.
+    * Allocated/written blocks should include space used by holes left
+    * between concatenated BufFiles.
+    */
+   lts->nBlocksAllocated = lt->offsetBlockNumber + tapeblocks;
+   lts->nBlocksWritten = lts->nBlocksAllocated;
+
+   /*
+    * Compute number of hole blocks so that we can later work backwards, and
+    * instrument number of physical blocks.  We don't simply use physical
+    * blocks directly for instrumentation because this would break if we ever
+    * subsequently wrote to worker tape.
+    *
+    * Working backwards like this keeps our options open.  If shared BufFiles
+    * ever support being written to post-export, logtape.c can automatically
+    * take advantage of that.  We'd then support writing to the leader tape
+    * while recycling space from worker tapes, because the leader tape has a
+    * zero offset (write routines won't need to have extra logic to apply an
+    * offset).
+    *
+    * The only thing that currently prevents writing to the leader tape from
+    * working is the fact that BufFiles opened using BufFileOpenShared() are
+    * read-only by definition, but that could be changed if it seemed
+    * worthwhile.  For now, writing to the leader tape will raise a "Bad file
+    * descriptor" error, so tuplesort must avoid writing to the leader tape
+    * altogether.
+    */
+   lts->nHoleBlocks = lts->nBlocksAllocated - nphysicalblocks;
+}
+
 /*
  * Create a set of logical tapes in a temporary underlying file.
  *
- * Each tape is initialized in write state.
+ * Each tape is initialized in write state.  Serial callers pass ntapes,
+ * NULL argument for shared, and -1 for worker.  Parallel worker callers
+ * pass ntapes, a shared file handle, NULL shared argument, and their own
+ * worker number.  Leader callers, which claim shared worker tapes here,
+ * must supply non-sentinel values for all arguments except worker number,
+ * which should be -1.
+ *
+ * Leader caller is passing back an array of metadata each worker captured
+ * when LogicalTapeFreeze() was called for their final result tapes.  Passed
+ * tapes array is actually sized ntapes - 1, because it includes only
+ * worker tapes, whereas leader requires its own leader tape.  Note that we
+ * rely on the assumption that reclaimed worker tapes will only be read
+ * from once by leader, and never written to again (tapes are initialized
+ * for writing, but that's only to be consistent).  Leader may not write to
+ * its own tape purely due to a restriction in the shared buffile
+ * infrastructure that may be lifted in the future.
  */
 LogicalTapeSet *
-LogicalTapeSetCreate(int ntapes)
+LogicalTapeSetCreate(int ntapes, TapeShare *shared, SharedFileSet *fileset,
+                    int worker)
 {
    LogicalTapeSet *lts;
    LogicalTape *lt;
    Assert(ntapes > 0);
    lts = (LogicalTapeSet *) palloc(offsetof(LogicalTapeSet, tapes) +
                                    ntapes * sizeof(LogicalTape));
-   lts->pfile = BufFileCreateTemp(false);
    lts->nBlocksAllocated = 0L;
    lts->nBlocksWritten = 0L;
+   lts->nHoleBlocks = 0L;
    lts->forgetFreeSpace = false;
    lts->blocksSorted = true;   /* a zero-length array is sorted ... */
    lts->freeBlocksLen = 32;    /* reasonable initial guess */
        lt->dirty = false;
        lt->firstBlockNumber = -1L;
        lt->curBlockNumber = -1L;
+       lt->nextBlockNumber = -1L;
+       lt->offsetBlockNumber = 0L;
        lt->buffer = NULL;
        lt->buffer_size = 0;
+       /* palloc() larger than MaxAllocSize would fail */
+       lt->max_size = MaxAllocSize;
        lt->pos = 0;
        lt->nbytes = 0;
    }
+
+   /*
+    * Create temp BufFile storage as required.
+    *
+    * Leader concatenates worker tapes, which requires special adjustment to
+    * final tapeset data.  Things are simpler for the worker case and the
+    * serial case, though.  They are generally very similar -- workers use a
+    * shared fileset, whereas serial sorts use a conventional serial BufFile.
+    */
+   if (shared)
+       ltsConcatWorkerTapes(lts, shared, fileset);
+   else if (fileset)
+   {
+       char        filename[MAXPGPATH];
+
+       pg_itoa(worker, filename);
+       lts->pfile = BufFileCreateShared(fileset, filename);
+   }
+   else
+       lts->pfile = BufFileCreateTemp(false);
+
    return lts;
 }
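
The header comment above describes three calling patterns for the revised LogicalTapeSetCreate(). As a sketch, using only the signature introduced here, the three cases would look roughly like this; "shared", "worker" and "nworkers" are assumed to come from the caller's parallel-sort state:

    /* Serial sort: private temp file, no shared state. */
    lts = LogicalTapeSetCreate(maxTapes, NULL, NULL, -1);

    /* Parallel worker: write into the shared fileset under its own number. */
    lts = LogicalTapeSetCreate(maxTapes, NULL, &shared->fileset, worker);

    /* Leader: claim the nworkers frozen worker tapes, plus its own tape. */
    lts = LogicalTapeSetCreate(nworkers + 1, shared->tapes,
                               &shared->fileset, -1);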
 
    Assert(tapenum >= 0 && tapenum < lts->nTapes);
    lt = &lts->tapes[tapenum];
    Assert(lt->writing);
+   Assert(lt->offsetBlockNumber == 0L);
 
    /* Allocate data buffer and first block on first write */
    if (lt->buffer == NULL)
        if (buffer_size < BLCKSZ)
            buffer_size = BLCKSZ;
 
-       /*
-        * palloc() larger than MaxAllocSize would fail (a multi-gigabyte
-        * buffer is unlikely to be helpful, anyway)
-        */
-       if (buffer_size > MaxAllocSize)
-           buffer_size = MaxAllocSize;
+       /* palloc() larger than max_size is unlikely to be helpful */
+       if (buffer_size > lt->max_size)
+           buffer_size = lt->max_size;
 
        /* round down to BLCKSZ boundary */
        buffer_size -= buffer_size % BLCKSZ;
  * tape is rewound (after rewind is too late!).  It performs a rewind
  * and switch to read mode "for free".  An immediately following rewind-
  * for-read call is OK but not necessary.
+ *
+ * share output argument is set with details of storage used for tape after
+ * freezing, which may be passed to LogicalTapeSetCreate within leader
+ * process later.  This metadata is only of interest to worker callers
+ * freezing their final output for leader (single materialized tape).
+ * Serial sorts should set share to NULL.
  */
 void
-LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
+LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
 {
    LogicalTape *lt;
 
    Assert(tapenum >= 0 && tapenum < lts->nTapes);
    lt = &lts->tapes[tapenum];
    Assert(lt->writing);
+   Assert(lt->offsetBlockNumber == 0L);
 
    /*
     * Completion of a write phase.  Flush last partial data block, and rewind
    else
        lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
    lt->nbytes = TapeBlockGetNBytes(lt->buffer);
+
+   /* Handle extra steps when caller is to share its tapeset */
+   if (share)
+   {
+       BufFileExportShared(lts->pfile);
+       share->firstblocknumber = lt->firstBlockNumber;
+       share->buffilesize = BufFileSize(lts->pfile);
+   }
 }
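
On the worker side, the new share argument is how the metadata later consumed by ltsConcatWorkerTapes() gets captured. A sketch of a worker freezing its single result tape and publishing it for the leader; "state" and "shared" stand in for the worker's tuplesort and Sharedsort state, and the finished-worker bookkeeping follows the Sharedsort comments below:

    /* Illustrative worker-side use of the new third argument. */
    TapeShare   output;

    LogicalTapeFreeze(state->tapeset, state->result_tape, &output);

    SpinLockAcquire(&shared->mutex);
    shared->tapes[state->worker] = output;  /* leader reads this later */
    shared->workersFinished++;              /* one more finished run */
    SpinLockRelease(&shared->mutex);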
 
 /*
 
    Assert(tapenum >= 0 && tapenum < lts->nTapes);
    lt = &lts->tapes[tapenum];
+   Assert(lt->offsetBlockNumber == 0L);
 
    /* With a larger buffer, 'pos' wouldn't be the same as offset within page */
    Assert(lt->buffer_size == BLCKSZ);
 long
 LogicalTapeSetBlocks(LogicalTapeSet *lts)
 {
-   return lts->nBlocksAllocated;
+   return lts->nBlocksAllocated - lts->nHoleBlocks;
 }
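
A worked illustration of the hole accounting, assuming the default 8kB BLCKSZ and 1GB BufFile segments (so one segment is 131072 blocks): if worker 0 wrote 3 blocks and worker 1 wrote 5, worker 1's tape is appended at the next segment boundary, leaving a hole that nHoleBlocks subtracts back out of the instrumentation. Standalone arithmetic sketch:

    #include <stdio.h>

    int
    main(void)
    {
        long    seg_blocks = (1024L * 1024 * 1024) / 8192;  /* 131072 */
        long    worker0_blocks = 3;
        long    worker1_blocks = 5;

        long    offset = seg_blocks;                    /* BufFileAppend result */
        long    allocated = offset + worker1_blocks;    /* 131077 */
        long    holes = allocated - (worker0_blocks + worker1_blocks);

        /* LogicalTapeSetBlocks() reports allocated - holes */
        printf("%ld\n", allocated - holes);             /* prints 8 */
        return 0;
    }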
 
  * above.  Nonetheless, with large workMem we can have many tapes (but not
  * too many -- see the comments in tuplesort_merge_order).
  *
+ * This module supports parallel sorting.  Parallel sorts involve coordination
+ * among one or more worker processes, and a leader process, each with its own
+ * tuplesort state.  The leader process (or, more accurately, the
+ * Tuplesortstate associated with a leader process) creates a full tapeset
+ * consisting of worker tapes with one run to merge; a run for every
+ * worker process.  This is then merged.  Worker processes are guaranteed to
+ * produce exactly one output run from their partial input.
+ *
  *
  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
 #define DATUM_SORT     2
 #define CLUSTER_SORT   3
 
+/* Sort parallel code from state for sort__start probes */
+#define PARALLEL_SORT(state)   ((state)->shared == NULL ? 0 : \
+                                (state)->worker >= 0 ? 1 : 2)
+
 /* GUC variables */
 #ifdef TRACE_SORT
 bool       trace_sort = false;
    int         markpos_offset; /* saved "current", or offset in tape block */
    bool        markpos_eof;    /* saved "eof_reached" */
 
+   /*
+    * These variables are used during parallel sorting.
+    *
+    * worker is our worker identifier.  Follows the general convention that
+    * -1 value relates to a leader tuplesort, and values >= 0 worker
+    * tuplesorts. (-1 can also be a serial tuplesort.)
+    *
+    * shared is mutable shared memory state, which is used to coordinate
+    * parallel sorts.
+    *
+    * nParticipants is the number of worker Tuplesortstates known by the
+    * leader to have actually been launched, which implies that they must
+    * finish a run leader can merge.  Typically includes a worker state held
+    * by the leader process itself.  Set in the leader Tuplesortstate only.
+    */
+   int         worker;
+   Sharedsort *shared;
+   int         nParticipants;
+
    /*
     * The sortKeys variable is used by every case other than the hash index
     * case; it is set by tuplesort_begin_xxx.  tupDesc is only used by the
 #endif
 };
 
+/*
+ * Private mutable state of tuplesort-parallel-operation.  This is allocated
+ * in shared memory.
+ */
+struct Sharedsort
+{
+   /* mutex protects all fields prior to tapes */
+   slock_t     mutex;
+
+   /*
+    * currentWorker generates ordinal identifier numbers for parallel sort
+    * workers.  These start from 0, and are always gapless.
+    *
+    * Workers increment workersFinished to indicate having finished.  If this
+    * is equal to state.nParticipants within the leader, leader is ready to
+    * merge worker runs.
+    */
+   int         currentWorker;
+   int         workersFinished;
+
+   /* Temporary file space */
+   SharedFileSet fileset;
+
+   /* Size of tapes flexible array */
+   int         nTapes;
+
+   /*
+    * Tapes array used by workers to report back information needed by the
+    * leader to concatenate all worker tapes into one for merging
+    */
+   TapeShare   tapes[FLEXIBLE_ARRAY_MEMBER];
+};
+
 /*
  * Is the given tuple allocated from the slab memory arena?
  */
 #define LACKMEM(state)     ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
 #define USEMEM(state,amt)  ((state)->availMem -= (amt))
 #define FREEMEM(state,amt) ((state)->availMem += (amt))
+#define SERIAL(state)      ((state)->shared == NULL)
+#define WORKER(state)      ((state)->shared && (state)->worker != -1)
+#define LEADER(state)      ((state)->shared && (state)->worker == -1)
 
 /*
  * NOTES about on-tape representation of tuples:
    } while(0)
 
 
-static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
+static Tuplesortstate *tuplesort_begin_common(int workMem,
+                      SortCoordinate coordinate,
+                      bool randomAccess);
 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
 static bool consider_abort_common(Tuplesortstate *state);
-static void inittapes(Tuplesortstate *state);
+static void inittapes(Tuplesortstate *state, bool mergeruns);
+static void inittapestate(Tuplesortstate *state, int maxTapes);
 static void selectnewtape(Tuplesortstate *state);
 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
 static void mergeruns(Tuplesortstate *state);
               SortTuple *stup);
 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
              int tapenum, unsigned int len);
+static int worker_get_identifier(Tuplesortstate *state);
+static void worker_freeze_result_tape(Tuplesortstate *state);
+static void worker_nomergeruns(Tuplesortstate *state);
+static void leader_takeover_tapes(Tuplesortstate *state);
 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
 
 /*
  */
 
 static Tuplesortstate *
-tuplesort_begin_common(int workMem, bool randomAccess)
+tuplesort_begin_common(int workMem, SortCoordinate coordinate,
+                      bool randomAccess)
 {
    Tuplesortstate *state;
    MemoryContext sortcontext;
    MemoryContext tuplecontext;
    MemoryContext oldcontext;
 
+   /* See leader_takeover_tapes() remarks on randomAccess support */
+   if (coordinate && randomAccess)
+       elog(ERROR, "random access disallowed under parallel sort");
+
    /*
     * Create a working memory context for this sort operation. All data
     * needed by the sort will live inside this context.
    state->bounded = false;
    state->tuples = true;
    state->boundUsed = false;
-   state->allowedMem = workMem * (int64) 1024;
+
+   /*
+    * workMem is forced to be at least 64KB, the current minimum valid value
+    * for the work_mem GUC.  This is a defense against parallel sort callers
+    * that divide out memory among many workers in a way that leaves each
+    * with very little memory.
+    */
+   state->allowedMem = Max(workMem, 64) * (int64) 1024;
    state->availMem = state->allowedMem;
    state->sortcontext = sortcontext;
    state->tuplecontext = tuplecontext;
 
    state->result_tape = -1;    /* flag that result tape has not been formed */
 
+   /*
+    * Initialize parallel-related state based on coordination information
+    * from caller
+    */
+   if (!coordinate)
+   {
+       /* Serial sort */
+       state->shared = NULL;
+       state->worker = -1;
+       state->nParticipants = -1;
+   }
+   else if (coordinate->isWorker)
+   {
+       /* Parallel worker produces exactly one final run from all input */
+       state->shared = coordinate->sharedsort;
+       state->worker = worker_get_identifier(state);
+       state->nParticipants = -1;
+   }
+   else
+   {
+       /* Parallel leader state only used for final merge */
+       state->shared = coordinate->sharedsort;
+       state->worker = -1;
+       state->nParticipants = coordinate->nParticipants;
+       Assert(state->nParticipants >= 1);
+   }
+
    MemoryContextSwitchTo(oldcontext);
 
    return state;
                     int nkeys, AttrNumber *attNums,
                     Oid *sortOperators, Oid *sortCollations,
                     bool *nullsFirstFlags,
-                    int workMem, bool randomAccess)
+                    int workMem, SortCoordinate coordinate, bool randomAccess)
 {
-   Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+   Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
+                                                  randomAccess);
    MemoryContext oldcontext;
    int         i;
 
                                false,  /* no unique check */
                                nkeys,
                                workMem,
-                               randomAccess);
+                               randomAccess,
+                               PARALLEL_SORT(state));
 
    state->comparetup = comparetup_heap;
    state->copytup = copytup_heap;
 Tuplesortstate *
 tuplesort_begin_cluster(TupleDesc tupDesc,
                        Relation indexRel,
-                       int workMem, bool randomAccess)
+                       int workMem,
+                       SortCoordinate coordinate, bool randomAccess)
 {
-   Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+   Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
+                                                  randomAccess);
    ScanKey     indexScanKey;
    MemoryContext oldcontext;
    int         i;
                                false,  /* no unique check */
                                state->nKeys,
                                workMem,
-                               randomAccess);
+                               randomAccess,
+                               PARALLEL_SORT(state));
 
    state->comparetup = comparetup_cluster;
    state->copytup = copytup_cluster;
 tuplesort_begin_index_btree(Relation heapRel,
                            Relation indexRel,
                            bool enforceUnique,
-                           int workMem, bool randomAccess)
+                           int workMem,
+                           SortCoordinate coordinate,
+                           bool randomAccess)
 {
-   Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+   Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
+                                                  randomAccess);
    ScanKey     indexScanKey;
    MemoryContext oldcontext;
    int         i;
                                enforceUnique,
                                state->nKeys,
                                workMem,
-                               randomAccess);
+                               randomAccess,
+                               PARALLEL_SORT(state));
 
    state->comparetup = comparetup_index_btree;
    state->copytup = copytup_index;
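
For callers, the only visible change in these begin routines is the extra SortCoordinate argument. A hedged sketch of how a parallel CREATE INDEX worker might fill one in before starting its sort; the struct tag SortCoordinateData and the surrounding setup are assumptions here (only the fields used by tuplesort_begin_common() above are taken from the patch), and workMem is this participant's share of maintenance_work_mem:

    /* Illustrative only. */
    SortCoordinateData coordinate;
    Tuplesortstate *sortstate;

    coordinate.isWorker = true;
    coordinate.nParticipants = -1;      /* only the leader sets this */
    coordinate.sharedsort = sharedsort; /* Sharedsort from shared memory */

    sortstate = tuplesort_begin_index_btree(heapRel, indexRel,
                                            enforceUnique,
                                            workMem, &coordinate,
                                            false); /* no randomAccess */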
                           uint32 high_mask,
                           uint32 low_mask,
                           uint32 max_buckets,
-                          int workMem, bool randomAccess)
+                          int workMem,
+                          SortCoordinate coordinate,
+                          bool randomAccess)
 {
-   Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+   Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
+                                                  randomAccess);
    MemoryContext oldcontext;
 
    oldcontext = MemoryContextSwitchTo(state->sortcontext);
 
 Tuplesortstate *
 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
-                     bool nullsFirstFlag,
-                     int workMem, bool randomAccess)
+                     bool nullsFirstFlag, int workMem,
+                     SortCoordinate coordinate, bool randomAccess)
 {
-   Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+   Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
+                                                  randomAccess);
    MemoryContext oldcontext;
    int16       typlen;
    bool        typbyval;
                                false,  /* no unique check */
                                1,
                                workMem,
-                               randomAccess);
+                               randomAccess,
+                               PARALLEL_SORT(state));
 
    state->comparetup = comparetup_datum;
    state->copytup = copytup_datum;
  * delayed calls at the moment.)
  *
  * This is a hint only. The tuplesort may still return more tuples than
- * requested.
+ * requested.  Parallel leader tuplesorts will always ignore the hint.
  */
 void
 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
    Assert(state->status == TSS_INITIAL);
    Assert(state->memtupcount == 0);
    Assert(!state->bounded);
+   Assert(!WORKER(state));
 
 #ifdef DEBUG_BOUNDED_SORT
    /* Honor GUC setting that disables the feature (for easy testing) */
        return;
 #endif
 
+   /* Parallel leader ignores hint */
+   if (LEADER(state))
+       return;
+
    /* We want to be able to compute bound * 2, so limit the setting */
    if (bound > (int64) (INT_MAX / 2))
        return;
    if (trace_sort)
    {
        if (state->tapeset)
-           elog(LOG, "external sort ended, %ld disk blocks used: %s",
-                spaceUsed, pg_rusage_show(&state->ru_start));
+           elog(LOG, "%s of %d ended, %ld disk blocks used: %s",
+                SERIAL(state) ? "external sort" : "parallel external sort",
+                state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
        else
-           elog(LOG, "internal sort ended, %ld KB used: %s",
-                spaceUsed, pg_rusage_show(&state->ru_start));
+           elog(LOG, "%s of %d ended, %ld KB used: %s",
+                SERIAL(state) ? "internal sort" : "unperformed parallel sort",
+                state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
    }
 
    TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
 static void
 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
 {
+   Assert(!LEADER(state));
+
    switch (state->status)
    {
        case TSS_INITIAL:
            /*
             * Nope; time to switch to tape-based operation.
             */
-           inittapes(state);
+           inittapes(state, true);
 
            /*
             * Dump all tuples.
 
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "performsort starting: %s",
-            pg_rusage_show(&state->ru_start));
+       elog(LOG, "performsort of %d starting: %s",
+            state->worker, pg_rusage_show(&state->ru_start));
 #endif
 
    switch (state->status)
 
            /*
             * We were able to accumulate all the tuples within the allowed
-            * amount of memory.  Just qsort 'em and we're done.
+            * amount of memory, or the leader is ready to take over worker
+            * tapes.
             */
-           tuplesort_sort_memtuples(state);
+           if (SERIAL(state))
+           {
+               /* Just qsort 'em and we're done */
+               tuplesort_sort_memtuples(state);
+               state->status = TSS_SORTEDINMEM;
+           }
+           else if (WORKER(state))
+           {
+               /*
+                * Parallel workers must still dump out tuples to tape.  No
+                * merge is required to produce a single output run, though.
+                */
+               inittapes(state, false);
+               dumptuples(state, true);
+               worker_nomergeruns(state);
+               state->status = TSS_SORTEDONTAPE;
+           }
+           else
+           {
+               /*
+                * Leader will take over worker tapes and merge worker runs.
+                * Note that mergeruns sets the correct state->status.
+                */
+               leader_takeover_tapes(state);
+               mergeruns(state);
+           }
            state->current = 0;
            state->eof_reached = false;
+           state->markpos_block = 0L;
            state->markpos_offset = 0;
            state->markpos_eof = false;
-           state->status = TSS_SORTEDINMEM;
            break;
 
        case TSS_BOUNDED:
            /*
             * Finish tape-based sort.  First, flush all tuples remaining in
             * memory out to tape; then merge until we have a single remaining
-            * run (or, if !randomAccess, one run per tape). Note that
-            * mergeruns sets the correct state->status.
+            * run (or, if !randomAccess and !WORKER(), one run per tape).
+            * Note that mergeruns sets the correct state->status.
             */
            dumptuples(state, true);
            mergeruns(state);
    if (trace_sort)
    {
        if (state->status == TSS_FINALMERGE)
-           elog(LOG, "performsort done (except %d-way final merge): %s",
-                state->activeTapes,
+           elog(LOG, "performsort of %d done (except %d-way final merge): %s",
+                state->worker, state->activeTapes,
                 pg_rusage_show(&state->ru_start));
        else
-           elog(LOG, "performsort done: %s",
-                pg_rusage_show(&state->ru_start));
+           elog(LOG, "performsort of %d done: %s",
+                state->worker, pg_rusage_show(&state->ru_start));
    }
 #endif
 
    unsigned int tuplen;
    size_t      nmoved;
 
+   Assert(!WORKER(state));
+
    switch (state->status)
    {
        case TSS_SORTEDINMEM:
     */
    Assert(forward);
    Assert(ntuples >= 0);
+   Assert(!WORKER(state));
 
    switch (state->status)
    {
 /*
  * inittapes - initialize for tape sorting.
  *
- * This is called only if we have found we don't have room to sort in memory.
+ * This is called only if we have found we won't sort in memory.
  */
 static void
-inittapes(Tuplesortstate *state)
+inittapes(Tuplesortstate *state, bool mergeruns)
 {
    int         maxTapes,
                j;
-   int64       tapeSpace;
 
-   /* Compute number of tapes to use: merge order plus 1 */
-   maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
+   Assert(!LEADER(state));
 
-   state->maxTapes = maxTapes;
-   state->tapeRange = maxTapes - 1;
+   if (mergeruns)
+   {
+       /* Compute number of tapes to use: merge order plus 1 */
+       maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
+   }
+   else
+   {
+       /* Workers can sometimes produce a single run, output without merge */
+       Assert(WORKER(state));
+       maxTapes = MINORDER + 1;
+   }
 
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "switching to external sort with %d tapes: %s",
-            maxTapes, pg_rusage_show(&state->ru_start));
+       elog(LOG, "%d switching to external sort with %d tapes: %s",
+            state->worker, maxTapes, pg_rusage_show(&state->ru_start));
 #endif
 
-   /*
-    * Decrease availMem to reflect the space needed for tape buffers, when
-    * writing the initial runs; but don't decrease it to the point that we
-    * have no room for tuples.  (That case is only likely to occur if sorting
-    * pass-by-value Datums; in all other scenarios the memtuples[] array is
-    * unlikely to occupy more than half of allowedMem.  In the pass-by-value
-    * case it's not important to account for tuple space, so we don't care if
-    * LACKMEM becomes inaccurate.)
-    */
-   tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
-
-   if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
-       USEMEM(state, tapeSpace);
-
-   /*
-    * Make sure that the temp file(s) underlying the tape set are created in
-    * suitable temp tablespaces.
-    */
-   PrepareTempTablespaces();
-
-   /*
-    * Create the tape set and allocate the per-tape data arrays.
-    */
-   state->tapeset = LogicalTapeSetCreate(maxTapes);
-
-   state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
-   state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
-   state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
-   state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
-   state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
+   /* Create the tape set and allocate the per-tape data arrays */
+   inittapestate(state, maxTapes);
+   state->tapeset =
+       LogicalTapeSetCreate(maxTapes, NULL,
+                            state->shared ? &state->shared->fileset : NULL,
+                            state->worker);
 
    state->currentRun = 0;
 
    state->status = TSS_BUILDRUNS;
 }
 
+/*
+ * inittapestate - initialize generic tape management state
+ */
+static void
+inittapestate(Tuplesortstate *state, int maxTapes)
+{
+   int64       tapeSpace;
+
+   /*
+    * Decrease availMem to reflect the space needed for tape buffers; but
+    * don't decrease it to the point that we have no room for tuples. (That
+    * case is only likely to occur if sorting pass-by-value Datums; in all
+    * other scenarios the memtuples[] array is unlikely to occupy more than
+    * half of allowedMem.  In the pass-by-value case it's not important to
+    * account for tuple space, so we don't care if LACKMEM becomes
+    * inaccurate.)
+    */
+   tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
+
+   if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
+       USEMEM(state, tapeSpace);
+
+   /*
+    * Make sure that the temp file(s) underlying the tape set are created in
+    * suitable temp tablespaces.  For parallel sorts, this should have been
+    * called already, but it doesn't matter if it is called a second time.
+    */
+   PrepareTempTablespaces();
+
+   state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
+   state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
+   state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
+   state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
+   state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
+
+   /* Record # of tapes allocated (for duration of sort) */
+   state->maxTapes = maxTapes;
+   /* Record maximum # of tapes usable as inputs when merging */
+   state->tapeRange = maxTapes - 1;
+}
+
 /*
  * selectnewtape -- select new tape for new initial run.
  *
     */
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
-            (state->availMem) / 1024, numInputTapes);
+       elog(LOG, "%d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
+            state->worker, state->availMem / 1024, numInputTapes);
 #endif
 
    state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
         * pass remains.  If we don't have to produce a materialized sorted
         * tape, we can stop at this point and do the final merge on-the-fly.
         */
-       if (!state->randomAccess)
+       if (!state->randomAccess && !WORKER(state))
        {
            bool        allOneRun = true;
 
     * a waste of cycles anyway...
     */
    state->result_tape = state->tp_tapenum[state->tapeRange];
-   LogicalTapeFreeze(state->tapeset, state->result_tape);
+   if (!WORKER(state))
+       LogicalTapeFreeze(state->tapeset, state->result_tape, NULL);
+   else
+       worker_freeze_result_tape(state);
    state->status = TSS_SORTEDONTAPE;
 
    /* Release the read buffers of all the other tapes, by rewinding them. */
 
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
-            pg_rusage_show(&state->ru_start));
+       elog(LOG, "%d finished %d-way merge step: %s", state->worker,
+            state->activeTapes, pg_rusage_show(&state->ru_start));
 #endif
 }
 
 
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "starting quicksort of run %d: %s",
-            state->currentRun, pg_rusage_show(&state->ru_start));
+       elog(LOG, "%d starting quicksort of run %d: %s",
+            state->worker, state->currentRun,
+            pg_rusage_show(&state->ru_start));
 #endif
 
    /*
 
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "finished quicksort of run %d: %s",
-            state->currentRun, pg_rusage_show(&state->ru_start));
+       elog(LOG, "%d finished quicksort of run %d: %s",
+            state->worker, state->currentRun,
+            pg_rusage_show(&state->ru_start));
 #endif
 
    memtupwrite = state->memtupcount;
 
 #ifdef TRACE_SORT
    if (trace_sort)
-       elog(LOG, "finished writing run %d to tape %d: %s",
-            state->currentRun, state->destTape,
+       elog(LOG, "%d finished writing run %d to tape %d: %s",
+            state->worker, state->currentRun, state->destTape,
             pg_rusage_show(&state->ru_start));
 #endif
 
    Assert(state->status == TSS_INITIAL);
    Assert(state->bounded);
    Assert(tupcount >= state->bound);
+   Assert(SERIAL(state));
 
    /* Reverse sort direction so largest entry will be at root */
    reversedirection(state);
    Assert(state->status == TSS_BOUNDED);
    Assert(state->bounded);
    Assert(tupcount == state->bound);
+   Assert(SERIAL(state));
 
    /*
     * We can unheapify in place because each delete-top call will remove the
 static void
 tuplesort_sort_memtuples(Tuplesortstate *state)
 {
+   Assert(!LEADER(state));
+
    if (state->memtupcount > 1)
    {
        /* Can we use the single-key sort function? */
                             &tuplen, sizeof(tuplen));
 }
 
+/*
+ * Parallel sort routines
+ */
+
+/*
+ * tuplesort_estimate_shared - estimate required shared memory allocation
+ *
+ * nWorkers is an estimate of the number of workers (it's the number that
+ * will be requested).
+ */
+Size
+tuplesort_estimate_shared(int nWorkers)
+{
+   Size        tapesSize;
+
+   Assert(nWorkers > 0);
+
+   /* Make sure that BufFile shared state is MAXALIGN'd */
+   tapesSize = mul_size(sizeof(TapeShare), nWorkers);
+   tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes)));
+
+   return tapesSize;
+}
+
+/*
+ * tuplesort_initialize_shared - initialize shared tuplesort state
+ *
+ * Must be called from leader process before workers are launched, to
+ * establish state needed up-front for worker tuplesortstates.  nWorkers
+ * should match the argument passed to tuplesort_estimate_shared().
+ */
+void
+tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
+{
+   int         i;
+
+   Assert(nWorkers > 0);
+
+   SpinLockInit(&shared->mutex);
+   shared->currentWorker = 0;
+   shared->workersFinished = 0;
+   SharedFileSetInit(&shared->fileset, seg);
+   shared->nTapes = nWorkers;
+   for (i = 0; i < nWorkers; i++)
+   {
+       shared->tapes[i].firstblocknumber = 0L;
+       shared->tapes[i].buffilesize = 0;
+   }
+}
+
+/*
+ * tuplesort_attach_shared - attach to shared tuplesort state
+ *
+ * Must be called by all worker processes.
+ */
+void
+tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
+{
+   /* Attach to SharedFileSet */
+   SharedFileSetAttach(&shared->fileset, seg);
+}
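
A minimal editorial sketch of how the three routines above fit together (not
part of the patch): the leader sizes and initializes the shared state before
launching workers, and each worker attaches from inside its parallel main
function, which receives "seg" and "toc".  The variables "pcxt" (a
ParallelContext), "nworkers", and the TOC key PARALLEL_KEY_TUPLESORT are
assumptions supplied by the calling code.

    /* Leader, before launching workers: size and initialize shared state */
    Size        estsort = tuplesort_estimate_shared(nworkers);
    Sharedsort *sharedsort;

    shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    shm_toc_estimate_keys(&pcxt->estimator, 1);
    InitializeParallelDSM(pcxt);

    sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    tuplesort_initialize_shared(sharedsort, nworkers, pcxt->seg);
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

    /* Worker, inside its parallel main function: look up and attach */
    sharedsort = (Sharedsort *) shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    tuplesort_attach_shared(sharedsort, seg);
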
+
+/*
+ * worker_get_identifier - Assign and return ordinal identifier for worker
+ *
+ * The order in which these are assigned is not well defined, and should not
+ * matter; worker numbers across parallel sort participants need only be
+ * distinct and gapless.  logtape.c requires this.
+ *
+ * Note that the identifiers assigned from here have no relation to
+ * ParallelWorkerNumber, to avoid making any assumption about the
+ * caller's requirements.  However, we do follow the ParallelWorkerNumber
+ * convention of representing a non-worker with worker number -1.  This
+ * includes the leader, as well as serial Tuplesort processes.
+ */
+static int
+worker_get_identifier(Tuplesortstate *state)
+{
+   Sharedsort *shared = state->shared;
+   int         worker;
+
+   Assert(WORKER(state));
+
+   SpinLockAcquire(&shared->mutex);
+   worker = shared->currentWorker++;
+   SpinLockRelease(&shared->mutex);
+
+   return worker;
+}
+
+/*
+ * worker_freeze_result_tape - freeze worker's result tape for leader
+ *
+ * Workers call this just after their result tape has been determined,
+ * instead of calling LogicalTapeFreeze() directly, because workers require
+ * a few additional steps beyond the similar serial TSS_SORTEDONTAPE
+ * external sort case.  Those extra steps, which also happen here, involve
+ * freeing now-unneeded resources and advertising to the leader that the
+ * worker's input run is available for its merge.
+ *
+ * There should only be one final output run for each worker, consisting of
+ * all tuples that were originally input into that worker.
+ */
+static void
+worker_freeze_result_tape(Tuplesortstate *state)
+{
+   Sharedsort *shared = state->shared;
+   TapeShare   output;
+
+   Assert(WORKER(state));
+   Assert(state->result_tape != -1);
+   Assert(state->memtupcount == 0);
+
+   /*
+    * Free most remaining memory, in case the caller is sensitive to our
+    * holding on to it.  memtuples may not be a tiny merge heap at this point.
+    */
+   pfree(state->memtuples);
+   /* Be tidy */
+   state->memtuples = NULL;
+   state->memtupsize = 0;
+
+   /*
+    * The parallel worker's result tape metadata must be stored in shared
+    * memory for the leader.
+    */
+   LogicalTapeFreeze(state->tapeset, state->result_tape, &output);
+
+   /* Store properties of output tape, and update finished worker count */
+   SpinLockAcquire(&shared->mutex);
+   shared->tapes[state->worker] = output;
+   shared->workersFinished++;
+   SpinLockRelease(&shared->mutex);
+}
+
+/*
+ * worker_nomergeruns - dump memtuples in worker, without merging
+ *
+ * This is called in a worker, as an alternative to mergeruns(), when no
+ * merging is required.
+ */
+static void
+worker_nomergeruns(Tuplesortstate *state)
+{
+   Assert(WORKER(state));
+   Assert(state->result_tape == -1);
+
+   state->result_tape = state->tp_tapenum[state->destTape];
+   worker_freeze_result_tape(state);
+}
+
+/*
+ * leader_takeover_tapes - create tapeset for leader from worker tapes
+ *
+ * So far, the leader Tuplesortstate has performed no actual sorting.  By
+ * now, all sorting has occurred in workers, all of which must have already
+ * returned from tuplesort_performsort().
+ *
+ * When this returns, the leader process is left in a state that is
+ * virtually indistinguishable from having generated the runs itself, as a
+ * serial external sort would have.
+ */
+static void
+leader_takeover_tapes(Tuplesortstate *state)
+{
+   Sharedsort *shared = state->shared;
+   int         nParticipants = state->nParticipants;
+   int         workersFinished;
+   int         j;
+
+   Assert(LEADER(state));
+   Assert(nParticipants >= 1);
+
+   SpinLockAcquire(&shared->mutex);
+   workersFinished = shared->workersFinished;
+   SpinLockRelease(&shared->mutex);
+
+   if (nParticipants != workersFinished)
+       elog(ERROR, "cannot take over tapes before all workers finish");
+
+   /*
+    * Create the tapeset from worker tapes, including a leader-owned tape at
+    * the end.  Parallel workers are far more expensive than logical tapes,
+    * so the number of tapes allocated here should never be excessive.
+    *
+    * We still have a leader tape, though it's not possible to write to it
+    * due to restrictions in the shared fileset infrastructure used by
+    * logtape.c.  It will never be written to in practice because
+    * randomAccess is disallowed for parallel sorts.
+    */
+   inittapestate(state, nParticipants + 1);
+   state->tapeset = LogicalTapeSetCreate(nParticipants + 1, shared->tapes,
+                                         &shared->fileset, state->worker);
+
+   /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */
+   state->currentRun = nParticipants;
+
+   /*
+    * Initialize variables of Algorithm D to be consistent with runs from
+    * workers having been generated in the leader.
+    *
+    * There will always be exactly one run per worker, and exactly one input
+    * tape per run, because workers always output exactly one run, even when
+    * there were no input tuples for them to sort.
+    */
+   for (j = 0; j < state->maxTapes; j++)
+   {
+       /* One real run; no dummy runs for worker tapes */
+       state->tp_fib[j] = 1;
+       state->tp_runs[j] = 1;
+       state->tp_dummy[j] = 0;
+       state->tp_tapenum[j] = j;
+   }
+   /* Leader tape gets one dummy run, and no real runs */
+   state->tp_fib[state->tapeRange] = 0;
+   state->tp_runs[state->tapeRange] = 0;
+   state->tp_dummy[state->tapeRange] = 1;
+
+   state->Level = 1;
+   state->destTape = 0;
+
+   state->status = TSS_BUILDRUNS;
+}
+
 /*
  * Convenience routine to free a tuple previously loaded into sort memory
  */
 
 #include "catalog/pg_index.h"
 #include "lib/stringinfo.h"
 #include "storage/bufmgr.h"
+#include "storage/shm_toc.h"
 
 /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
 typedef uint16 BTCycleId;
 /*
  * external entry points for btree, in nbtree.c
  */
-extern IndexBuildResult *btbuild(Relation heap, Relation index,
-       struct IndexInfo *indexInfo);
 extern void btbuildempty(Relation index);
 extern bool btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
 /*
  * prototypes for functions in nbtsort.c
  */
-typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
-
-extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
-             bool isunique, bool isdead);
-extern void _bt_spooldestroy(BTSpool *btspool);
-extern void _bt_spool(BTSpool *btspool, ItemPointer self,
-         Datum *values, bool *isnull);
-extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
+extern IndexBuildResult *btbuild(Relation heap, Relation index,
+       struct IndexInfo *indexInfo);
+extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
 
 #endif                         /* NBTREE_H */
 
 
 #define        IsParallelWorker()      (ParallelWorkerNumber >= 0)
 
-extern ParallelContext *CreateParallelContext(const char *library_name, const char *function_name, int nworkers);
+extern ParallelContext *CreateParallelContext(const char *library_name,
+                     const char *function_name, int nworkers,
+                     bool serializable_okay);
 extern void InitializeParallelDSM(ParallelContext *pcxt);
 extern void ReinitializeParallelDSM(ParallelContext *pcxt);
 extern void LaunchParallelWorkers(ParallelContext *pcxt);
 
    BlockNumber phs_startblock; /* starting block number */
    pg_atomic_uint64 phs_nallocated;    /* number of blocks allocated to
                                         * workers so far. */
+   bool        phs_snapshot_any;   /* SnapshotAny, not phs_snapshot_data? */
    char        phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER];
 }          ParallelHeapScanDescData;
 
 
            Relation indexRelation,
            IndexInfo *indexInfo,
            bool isprimary,
-           bool isreindex);
+           bool isreindex,
+           bool parallel);
 
 extern double IndexBuildHeapScan(Relation heapRelation,
                   Relation indexRelation,
                   IndexInfo *indexInfo,
                   bool allow_sync,
                   IndexBuildCallback callback,
-                  void *callback_state);
+                  void *callback_state,
+                  HeapScanDesc scan);
 extern double IndexBuildHeapRangeScan(Relation heapRelation,
                        Relation indexRelation,
                        IndexInfo *indexInfo,
                        BlockNumber start_blockno,
                        BlockNumber end_blockno,
                        IndexBuildCallback callback,
-                       void *callback_state);
+                       void *callback_state,
+                       HeapScanDesc scan);
 
 extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
 
 
 extern PGDLLIMPORT bool allowSystemTableMods;
 extern PGDLLIMPORT int work_mem;
 extern PGDLLIMPORT int maintenance_work_mem;
+extern PGDLLIMPORT int max_parallel_maintenance_workers;
 
 extern int VacuumCostPageHit;
 extern int VacuumCostPageMiss;
 
  *     ReadyForInserts     is it valid for inserts?
  *     Concurrent          are we doing a concurrent index build?
  *     BrokenHotChain      did we detect any broken HOT chains?
+ *     ParallelWorkers     # of workers requested (excludes leader)
  *     AmCache             private cache area for index AM
  *     Context             memory context holding this IndexInfo
  *
- * ii_Concurrent and ii_BrokenHotChain are used only during index build;
- * they're conventionally set to false otherwise.
+ * ii_Concurrent, ii_BrokenHotChain, and ii_ParallelWorkers are used only
+ * during index build; they're conventionally zeroed otherwise.
  * ----------------
  */
 typedef struct IndexInfo
    bool        ii_ReadyForInserts;
    bool        ii_Concurrent;
    bool        ii_BrokenHotChain;
+   int         ii_ParallelWorkers;
    Oid         ii_Am;
    void       *ii_AmCache;
    MemoryContext ii_Context;
 
 
 extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel);
 extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages,
-                       double index_pages);
+                       double index_pages, int max_workers);
 extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel,
                            Path *bitmapqual);
 extern void generate_partition_wise_join_paths(PlannerInfo *root,
 
 extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr);
 
 extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid);
+extern int plan_create_index_workers(Oid tableOid, Oid indexOid);
 
 extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti,
                           bool *part_cols_updated);
 
    WAIT_EVENT_MQ_SEND,
    WAIT_EVENT_PARALLEL_FINISH,
    WAIT_EVENT_PARALLEL_BITMAP_SCAN,
+   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN,
    WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
    WAIT_EVENT_CLOG_GROUP_UPDATE,
    WAIT_EVENT_REPLICATION_ORIGIN_DROP,
 
 extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
 extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
 extern int BufFileSeekBlock(BufFile *file, long blknum);
+extern off_t BufFileSize(BufFile *file);
+extern long BufFileAppend(BufFile *target, BufFile *source);
 
 extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
 extern void BufFileExportShared(BufFile *file);
 
 extern int FileGetRawDesc(File file);
 extern int FileGetRawFlags(File file);
 extern mode_t FileGetRawMode(File file);
+extern off_t FileGetSize(File file);
 
 /* Operations used for sharing named temporary files */
 extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
 
 #ifndef LOGTAPE_H
 #define LOGTAPE_H
 
+#include "storage/sharedfileset.h"
+
 /* LogicalTapeSet is an opaque type whose details are not known outside logtape.c. */
 
 typedef struct LogicalTapeSet LogicalTapeSet;
 
+/*
+ * The approach tuplesort.c takes to parallel external sorts is that each
+ * worker, whose state is almost the same as that of an independent serial
+ * sort, is made to produce a final materialized tape of sorted output in all
+ * cases.  This tape is frozen, just like in any case requiring a final
+ * materialized tape.  The one difference is that freezing also exports the
+ * underlying shared fileset BufFile for sharing.  When this happens, freezing
+ * produces TapeShare metadata for the worker, which is passed along through
+ * shared memory to the leader.
+ *
+ * The leader process can then pass an array of TapeShare metadata (one per
+ * worker participant) to LogicalTapeSetCreate(), alongside a handle to a
+ * shared fileset, which is sufficient to construct a new logical tapeset that
+ * consists of each of the tapes materialized by workers.
+ *
+ * Note that while logtape.c does create an empty leader tape at the end of the
+ * tapeset in the leader case, it can never be written to due to a restriction
+ * in the shared buffile infrastructure.
+ */
+typedef struct TapeShare
+{
+   /*
+    * firstblocknumber is the first block that should be read from the
+    * materialized tape.
+    *
+    * buffilesize is the size of the associated BufFile following freezing.
+    */
+   long        firstblocknumber;
+   off_t       buffilesize;
+} TapeShare;
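
A brief editorial sketch of the handoff described above (not part of the
patch; it mirrors worker_freeze_result_tape() and leader_takeover_tapes() in
tuplesort.c, so "state", "shared", and "nworkers" stand in for that code's
local state): the worker freezes its result tape and captures TapeShare
metadata, and the leader later hands the collected array back to logtape.c.

    /* Worker: freeze result tape, exporting its BufFile and capturing metadata */
    TapeShare   share;

    LogicalTapeFreeze(state->tapeset, state->result_tape, &share);
    /* ... "share" is then copied into shared memory for the leader ... */

    /* Leader: build a single tapeset over all worker tapes plus its own tape */
    state->tapeset = LogicalTapeSetCreate(nworkers + 1, shared->tapes,
                                          &shared->fileset, state->worker);
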
+
 /*
  * prototypes for functions in logtape.c
  */
 
-extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes);
+extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes, TapeShare *shared,
+                    SharedFileSet *fileset, int worker);
 extern void LogicalTapeSetClose(LogicalTapeSet *lts);
 extern void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts);
 extern size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
 extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum,
                         size_t buffer_size);
 extern void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum);
-extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum);
+extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum,
+                 TapeShare *share);
 extern size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
                     size_t size);
 extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
 
  * if necessary).  It works efficiently for both small and large amounts
  * of data.  Small amounts are sorted in-memory using qsort().  Large
  * amounts are sorted using temporary files and a standard external sort
- * algorithm.
+ * algorithm.  Parallel sorts use a variant of this external sort
+ * algorithm, and are typically only used for large amounts of data.
  *
  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
 #include "access/itup.h"
 #include "executor/tuptable.h"
 #include "fmgr.h"
+#include "storage/dsm.h"
 #include "utils/relcache.h"
 
 
-/* Tuplesortstate is an opaque type whose details are not known outside
- * tuplesort.c.
+/*
+ * Tuplesortstate and Sharedsort are opaque types whose details are not
+ * known outside tuplesort.c.
  */
 typedef struct Tuplesortstate Tuplesortstate;
+typedef struct Sharedsort Sharedsort;
+
+/*
+ * Tuplesort parallel coordination state, allocated by each participant in
+ * local memory.  Participant caller initializes everything.  See usage notes
+ * below.
+ */
+typedef struct SortCoordinateData
+{
+   /* Worker process?  If not, must be leader. */
+   bool        isWorker;
+
+   /*
+    * Number of participants known to have been launched, as passed in by
+    * the leader process (workers set this to -1).  The count includes the
+    * leader's own worker-style Tuplesortstate, if it participates as one.
+    */
+   int         nParticipants;
+
+   /* Private opaque state (points to shared memory) */
+   Sharedsort *sharedsort;
+} SortCoordinateData;
+
+typedef struct SortCoordinateData *SortCoordinate;
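
A minimal editorial sketch of filling in this structure in a worker, assuming
"sharedsort" points at the Sharedsort allocated in the DSM segment; the
leader would instead set isWorker to false and pass the number of launched
participants.

    SortCoordinate coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));

    coordinate->isWorker = true;         /* false in the leader */
    coordinate->nParticipants = -1;      /* leader passes # of launched participants */
    coordinate->sharedsort = sharedsort; /* Sharedsort in shared memory */
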
 
 /*
  * Data structures for reporting sort statistics.  Note that
  * sorting HeapTuples and two more for sorting IndexTuples.  Yet another
  * API supports sorting bare Datums.
  *
+ * Serial sort callers should pass NULL for their coordinate argument.
+ *
  * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
  * preserve the system columns (tuple identity and transaction visibility
  * info).  The sort keys are specified by column numbers within the tuples
  *
  * The "index_hash" API is similar to index_btree, but the tuples are
  * actually sorted by their hash codes not the raw data.
+ *
+ * Parallel sort callers are required to coordinate multiple tuplesort states
+ * in a leader process and one or more worker processes.  The leader process
+ * must launch workers, and have each perform an independent "partial"
+ * tuplesort, typically fed by the parallel heap interface.  The leader later
+ * produces the final output (internally, it merges runs output by workers).
+ *
+ * Callers must do the following to perform a sort in parallel using multiple
+ * worker processes:
+ *
+ * 1. Request tuplesort-private shared memory for n workers.  Use
+ *    tuplesort_estimate_shared() to get the required size.
+ * 2. Have leader process initialize allocated shared memory using
+ *    tuplesort_initialize_shared().  Launch workers.
+ * 3. Initialize a coordinate argument within both the leader process, and
+ *    for each worker process.  This has a pointer to the shared
+ *    tuplesort-private structure, as well as some caller-initialized fields.
+ *    Leader's coordinate argument reliably indicates number of workers
+ *    launched (this is unused by workers).
+ * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine
+ *    (passing the coordinate argument) within each worker.  The workMem
+ *    arguments need not be identical.  All other arguments should match
+ *    exactly, though.
+ * 5. tuplesort_attach_shared() should be called by all workers.  Feed tuples
+ *    to each worker, and call tuplesort_performsort() within each when input
+ *    is exhausted.
+ * 6. Call tuplesort_end() in each worker process.  Worker processes can shut
+ *    down once tuplesort_end() returns.
+ * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
+ *    routine, passing a leader-appropriate coordinate argument (this can
+ *    happen as early as during step 3, actually, since we only need to know
+ *    the number of workers successfully launched).  The leader must now wait
+ *    for workers to finish.  Caller must use own mechanism for ensuring that
+ *    next step isn't reached until all workers have called and returned from
+ *    tuplesort_performsort().  (Note that it's okay if workers have already
+ *    also called tuplesort_end() by then.)
+ * 8. Call tuplesort_performsort() in leader.  Consume output using the
+ *    appropriate tuplesort_get* routine.  Leader can skip this step if
+ *    tuplesort turns out to be unnecessary.
+ * 9. Call tuplesort_end() in leader.
+ *
+ * This division of labor assumes nothing about how input tuples are produced,
+ * but does require that caller combine the state of multiple tuplesorts for
+ * any purpose other than producing the final output.  For example, callers
+ * must consider that tuplesort_get_stats() reports on only one worker's role
+ * in a sort (or the leader's role), and not statistics for the sort as a
+ * whole.
+ *
+ * Note that callers may use the leader process to sort runs as if it were an
+ * independent worker process (prior to the process performing a leader sort
+ * to produce the final sorted output).  Doing so only requires a second
+ * "partial" tuplesort within the leader process, initialized like that of a
+ * worker process.  The steps above don't touch on this directly.  The only
+ * difference is that the tuplesort_attach_shared() call is never needed
+ * within the leader process, because the backend as a whole holds the shared
+ * fileset reference.  A worker Tuplesortstate in the leader is expected to do
+ * exactly the same amount of total initial processing work as a worker
+ * process Tuplesortstate, since the leader process has nothing else to do
+ * before workers finish.
+ *
+ * Note that only a very small amount of memory will be allocated prior to
+ * the leader state first consuming input, and that workers will free the
+ * vast majority of their memory upon returning from tuplesort_performsort().
+ * Callers can rely on this to arrange for memory to be used in a way that
+ * respects a workMem-style budget across an entire parallel sort operation.
+ *
+ * Callers are responsible for parallel safety in general.  However, they
+ * can at least rely on there being no parallel safety hazards within
+ * tuplesort, because tuplesort thinks of the sort as several independent
+ * sorts whose results are combined.  Since, in general, the behavior of
+ * sort operators is immutable, the caller need only worry about the parallel
+ * safety of the process through which input tuples are generated (typically,
+ * a parallel heap scan).
  */
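
As a compressed editorial illustration of the numbered protocol above (a
sketch, not part of the patch), using the btree variant of the begin routine;
the wait mechanism, the tuple-feeding loop, and the surrounding variables
(heapRel, indexRel, enforceUnique, workMem, seg, sharedsort, and the two
coordinate structs) are assumed to be set up by the caller.

    /* Worker: attach to shared state, sort its portion of the input, then exit */
    tuplesort_attach_shared(sharedsort, seg);
    btsortstate = tuplesort_begin_index_btree(heapRel, indexRel, enforceUnique,
                                              workMem, worker_coordinate, false);
    /* ... feed tuples, e.g. via tuplesort_putindextuplevalues() ... */
    tuplesort_performsort(btsortstate);
    tuplesort_end(btsortstate);

    /* Leader: wait for all workers (caller's mechanism), then merge and consume */
    btsortstate = tuplesort_begin_index_btree(heapRel, indexRel, enforceUnique,
                                              workMem, leader_coordinate, false);
    tuplesort_performsort(btsortstate);
    /* ... read sorted output, e.g. via tuplesort_getindextuple() ... */
    tuplesort_end(btsortstate);
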
 
 extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
                     int nkeys, AttrNumber *attNums,
                     Oid *sortOperators, Oid *sortCollations,
                     bool *nullsFirstFlags,
-                    int workMem, bool randomAccess);
+                    int workMem, SortCoordinate coordinate,
+                    bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
-                       Relation indexRel,
-                       int workMem, bool randomAccess);
+                       Relation indexRel, int workMem,
+                       SortCoordinate coordinate, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
                            Relation indexRel,
                            bool enforceUnique,
-                           int workMem, bool randomAccess);
+                           int workMem, SortCoordinate coordinate,
+                           bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
                           Relation indexRel,
                           uint32 high_mask,
                           uint32 low_mask,
                           uint32 max_buckets,
-                          int workMem, bool randomAccess);
+                          int workMem, SortCoordinate coordinate,
+                          bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
                      Oid sortOperator, Oid sortCollation,
                      bool nullsFirstFlag,
-                     int workMem, bool randomAccess);
+                     int workMem, SortCoordinate coordinate,
+                     bool randomAccess);
 
 extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
 
 
 extern int tuplesort_merge_order(int64 allowedMem);
 
+extern Size tuplesort_estimate_shared(int nworkers);
+extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
+                           dsm_segment *seg);
+extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
+
 /*
  * These routines may only be called if randomAccess was specified 'true'.
  * Likewise, backwards scan in gettuple/getdatum is only allowed if
- * randomAccess was specified.
+ * randomAccess was specified.  Note that parallel sorts do not support
+ * randomAccess.
  */
 
 extern void tuplesort_rescan(Tuplesortstate *state);
 
 BTBuildState
 BTCycleId
 BTIndexStat
+BTLeader
 BTMetaPageData
 BTOneVacInfo
 BTPS_State
 BTScanPos
 BTScanPosData
 BTScanPosItem
+BTShared
 BTSortArrayContext
 BTSpool
 BTStack
 SharedTuplestore
 SharedTuplestoreAccessor
 SharedTypmodTableEntry
+Sharedsort
 ShellTypeInfo
 ShippableCacheEntry
 ShippableCacheKey
 SortBy
 SortByDir
 SortByNulls
+SortCoordinate
+SortCoordinateData
 SortGroupClause
 SortItem
 SortPath
 TablespaceList
 TablespaceListCell
 TapeBlockTrailer
+TapeShare
 TarMethodData
 TarMethodFile
 TargetEntry