Enhance nbtree ScalarArrayOp execution.

author Peter Geoghegan <[email protected]>

Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)

committer Peter Geoghegan <[email protected]>

Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
author Peter Geoghegan <[email protected]>
Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
committer Peter Geoghegan <[email protected]>
Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml

index b68daa55aeb726239809f2a39b93d8da7f0ad3f0..76ac0fcddd78d6da0ae656e68e807947e2196865 100644 (file)
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -809,7 +809,8 @@ amrestrpos (IndexScanDesc scan);
    <para>
  <programlisting>
  Size
-amestimateparallelscan (void);
+amestimateparallelscan (int nkeys,
+                        int norderbys);
  </programlisting>
     Estimate and return the number of bytes of dynamic shared memory which
     the access method will be needed to perform a parallel scan.  (This number
@@ -817,6 +818,13 @@ amestimateparallelscan (void);
     AM-independent data in <structname>ParallelIndexScanDescData</structname>.)
    </para>
  
+  <para>
+   The <literal>nkeys</literal> and <literal>norderbys</literal>
+   parameters indicate the number of quals and ordering operators that will be
+   used in the scan; the same values will be passed to <function>amrescan</function>.
+   Note that the actual values of the scan keys aren't provided yet.
+  </para>
+
    <para>
     It is not necessary to implement this function for access methods which
     do not support parallel scans or for which the number of additional bytes
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml

index e1e96ba7c45fef482e8e4522c83e95201703cc3f..053da8d6e40a0e65b104e762669efaf5477b629c 100644 (file)
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -4064,6 +4064,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage
     </para>
    </note>
  
+  <note>
+   <para>
+    Queries that use certain <acronym>SQL</acronym> constructs to search for
+    rows matching any value out of a list or array of multiple scalar values
+    (see <xref linkend="functions-comparisons"/>) perform multiple
+    <quote>primitive</quote> index scans (up to one primitive scan per scalar
+    value) during query execution.  Each internal primitive index scan
+    increments <structname>pg_stat_all_indexes</structname>.<structfield>idx_scan</structfield>,
+    so it's possible for the count of index scans to significantly exceed the
+    total number of index scan executor node executions.
+   </para>
+  </note>
+
   </sect2>
  
   <sect2 id="monitoring-pg-statio-all-tables-view">
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c

index 78ac3b1abb3eca279a09db4a1d16ffe2cde949f2..7510159fc8d46921e96baa5a55301db0844c450f 100644 (file)
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -449,13 +449,10 @@ index_restrpos(IndexScanDesc scan)
  
  /*
   * index_parallelscan_estimate - estimate shared memory for parallel scan
- *
- * Currently, we don't pass any information to the AM-specific estimator,
- * so it can probably only return a constant.  In the future, we might need
- * to pass more information.
   */
  Size
-index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
+index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys,
+                           Snapshot snapshot)
  {
     Size        nbytes;
  
@@ -474,7 +471,8 @@ index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
      */
     if (indexRelation->rd_indam->amestimateparallelscan != NULL)
         nbytes = add_size(nbytes,
-                         indexRelation->rd_indam->amestimateparallelscan());
+                         indexRelation->rd_indam->amestimateparallelscan(nkeys,
+                                                                         norderbys));
  
     return nbytes;
  }
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index 41df1027d2d979ff11e7a987914970b57fbcfd3f..686a3206f726bdf07852ee96b23350c1261ea337 100644 (file)
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -40,6 +40,9 @@
  /*
   * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
   *
+ * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
+ * scan to advance it via another call to _bt_first.
+ *
   * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
   * a new page; others must wait.
   *
@@ -47,11 +50,11 @@
   * to a new page; some process can start doing that.
   *
   * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
- * We reach this state once for every distinct combination of array keys.
   */
  typedef enum
  {
     BTPARALLEL_NOT_INITIALIZED,
+   BTPARALLEL_NEED_PRIMSCAN,
     BTPARALLEL_ADVANCING,
     BTPARALLEL_IDLE,
     BTPARALLEL_DONE,
@@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData
     BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                      * available for scan. see above for
                                      * possible states of parallel scan. */
-   int         btps_arrayKeyCount; /* count indicating number of array scan
-                                    * keys processed by parallel scan */
-   slock_t     btps_mutex;     /* protects above variables */
+   slock_t     btps_mutex;     /* protects above variables, btps_arrElems */
     ConditionVariable btps_cv;  /* used to synchronize parallel scan */
+
+   /*
+    * btps_arrElems is used when scans need to schedule another primitive
+    * index scan.  Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
+    */
+   int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
  }          BTParallelScanDescData;
  
  typedef struct BTParallelScanDescData *BTParallelScanDesc;
@@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
     /* btree indexes are never lossy */
     scan->xs_recheck = false;
  
-   /*
-    * If we have any array keys, initialize them during first call for a
-    * scan.  We can't do this in btrescan because we don't know the scan
-    * direction at that time.
-    */
-   if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
-   {
-       /* punt if we have any unsatisfiable array keys */
-       if (so->numArrayKeys < 0)
-           return false;
-
-       _bt_start_array_keys(scan, dir);
-   }
-
-   /* This loop handles advancing to the next array elements, if any */
+   /* Each loop iteration performs another primitive index scan */
     do
     {
         /*
@@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
         /* If we have a tuple, return it ... */
         if (res)
             break;
-       /* ... otherwise see if we have more array keys to deal with */
-   } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+       /* ... otherwise see if we need another primitive index scan */
+   } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
  
     return res;
  }
@@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     int64       ntids = 0;
     ItemPointer heapTid;
  
-   /*
-    * If we have any array keys, initialize them.
-    */
-   if (so->numArrayKeys)
-   {
-       /* punt if we have any unsatisfiable array keys */
-       if (so->numArrayKeys < 0)
-           return ntids;
-
-       _bt_start_array_keys(scan, ForwardScanDirection);
-   }
-
-   /* This loop handles advancing to the next array elements, if any */
+   /* Each loop iteration performs another primitive index scan */
     do
     {
         /* Fetch the first page & tuple */
@@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
                 ntids++;
             }
         }
-       /* Now see if we have more array keys to deal with */
-   } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+       /* Now see if we need another primitive index scan */
+   } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
  
     return ntids;
  }
@@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
     else
         so->keyData = NULL;
  
-   so->arrayKeyData = NULL;    /* assume no array keys for now */
-   so->arraysStarted = false;
-   so->numArrayKeys = 0;
+   so->needPrimScan = false;
+   so->scanBehind = false;
     so->arrayKeys = NULL;
+   so->orderProcs = NULL;
     so->arrayContext = NULL;
  
     so->killedItems = NULL;     /* until needed */
@@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     }
  
     so->markItemIndex = -1;
-   so->arrayKeyCount = 0;
+   so->needPrimScan = false;
+   so->scanBehind = false;
     BTScanPosUnpinIfPinned(so->markPos);
     BTScanPosInvalidate(so->markPos);
  
@@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
                 scankey,
                 scan->numberOfKeys * sizeof(ScanKeyData));
     so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
-
-   /* If any keys are SK_SEARCHARRAY type, set up array-key info */
-   _bt_preprocess_array_keys(scan);
+   so->numArrayKeys = 0;       /* ditto */
  }
  
  /*
@@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan)
     /* Release storage */
     if (so->keyData != NULL)
         pfree(so->keyData);
-   /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+   /* so->arrayKeys and so->orderProcs are in arrayContext */
     if (so->arrayContext != NULL)
         MemoryContextDelete(so->arrayContext);
     if (so->killedItems != NULL)
@@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan)
         BTScanPosInvalidate(so->markPos);
         so->markItemIndex = -1;
     }
-
-   /* Also record the current positions of any array keys */
-   if (so->numArrayKeys)
-       _bt_mark_array_keys(scan);
  }
  
  /*
@@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
  
-   /* Restore the marked positions of any array keys */
-   if (so->numArrayKeys)
-       _bt_restore_array_keys(scan);
-
     if (so->markItemIndex >= 0)
     {
         /*
@@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan)
             if (so->currTuples)
                 memcpy(so->currTuples, so->markTuples,
                        so->markPos.nextTupleOffset);
+           /* Reset the scan's array keys (see _bt_steppage for why) */
+           if (so->numArrayKeys)
+           {
+               _bt_start_array_keys(scan, so->currPos.dir);
+               so->needPrimScan = false;
+           }
         }
         else
             BTScanPosInvalidate(so->currPos);
@@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan)
   * btestimateparallelscan -- estimate storage for BTParallelScanDescData
   */
  Size
-btestimateparallelscan(void)
+btestimateparallelscan(int nkeys, int norderbys)
  {
-   return sizeof(BTParallelScanDescData);
+   /* Pessimistically assume all input scankeys will be output with arrays */
+   return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
  }
  
  /*
@@ -572,7 +551,6 @@ btinitparallelscan(void *target)
     SpinLockInit(&bt_target->btps_mutex);
     bt_target->btps_scanPage = InvalidBlockNumber;
     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-   bt_target->btps_arrayKeyCount = 0;
     ConditionVariableInit(&bt_target->btps_cv);
  }
  
@@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan)
     SpinLockAcquire(&btscan->btps_mutex);
     btscan->btps_scanPage = InvalidBlockNumber;
     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-   btscan->btps_arrayKeyCount = 0;
     SpinLockRelease(&btscan->btps_mutex);
  }
  
@@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan)
   *     or _bt_parallel_done().
   *
   * The return value is true if we successfully seized the scan and false
- * if we did not.  The latter case occurs if no pages remain for the current
- * set of scankeys.
+ * if we did not.  The latter case occurs if no pages remain.
   *
   * If the return value is true, *pageno returns the next or current page
   * of the scan (depending on the scan direction).  An invalid block number
- * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * means the scan hasn't yet started, or that caller needs to start the next
+ * primitive index scan (in the latter case we'll set so->needPrimScan).
   * The first time a participating process reaches the last page, it will return
   * true and set *pageno to P_NONE; after that, further attempts to seize the
   * scan will return false.
   *
   * Callers should ignore the value of pageno if the return value is false.
+ *
+ * Callers that are in a position to start a new primitive index scan must
+ * pass first=true (all other callers pass first=false).  We just return false
+ * for first=false callers that require another primitive index scan.
   */
  bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
-   BTPS_State  pageStatus;
     bool        exit_loop = false;
     bool        status = true;
     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
  
     *pageno = P_NONE;
  
+   if (first)
+   {
+       /*
+        * Initialize array-related state when called from _bt_first, assuming
+        * that this will be either the first primitive index scan for the
+        * scan, or a primitive scan that was explicitly scheduled earlier.
+        *
+        * Note: so->needPrimScan is only set when a scheduled primitive index
+        * scan is set to be performed in caller's worker process.  It should
+        * not be set here by us for the first primitive scan, nor should we
+        * ever set it for a parallel scan that has no array keys.
+        */
+       so->needPrimScan = false;
+       so->scanBehind = false;
+   }
+   else
+   {
+       /*
+        * Don't attempt to seize the scan when this backend requires another
+        * primitive index scan unless we're in a position to start it now.
+        */
+       if (so->needPrimScan)
+           return false;
+   }
+
     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                   parallel_scan->ps_offset);
  
     while (1)
     {
         SpinLockAcquire(&btscan->btps_mutex);
-       pageStatus = btscan->btps_pageStatus;
  
-       if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+       if (btscan->btps_pageStatus == BTPARALLEL_DONE)
         {
-           /* Parallel scan has already advanced to a new set of scankeys. */
+           /* We're done with this parallel index scan */
             status = false;
         }
-       else if (pageStatus == BTPARALLEL_DONE)
+       else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
         {
+           Assert(so->numArrayKeys);
+
             /*
-            * We're done with this set of scankeys.  This may be the end, or
-            * there could be more sets to try.
+            * If we can start another primitive scan right away, do so.
+            * Otherwise just wait.
              */
-           status = false;
+           if (first)
+           {
+               btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+               for (int i = 0; i < so->numArrayKeys; i++)
+               {
+                   BTArrayKeyInfo *array = &so->arrayKeys[i];
+                   ScanKey     skey = &so->keyData[array->scan_key];
+
+                   array->cur_elem = btscan->btps_arrElems[i];
+                   skey->sk_argument = array->elem_values[array->cur_elem];
+               }
+               so->needPrimScan = true;
+               so->scanBehind = false;
+               *pageno = InvalidBlockNumber;
+               exit_loop = true;
+           }
         }
-       else if (pageStatus != BTPARALLEL_ADVANCING)
+       else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
         {
             /*
              * We have successfully seized control of the scan for the purpose
@@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
   * _bt_parallel_release() -- Complete the process of advancing the scan to a
   *     new page.  We now have the new value btps_scanPage; some other backend
   *     can now begin advancing the scan.
+ *
+ * Callers whose scan uses array keys must save their scan_page argument so
+ * that it can be passed to _bt_parallel_primscan_schedule, should caller
+ * determine that another primitive index scan is required.  If that happens,
+ * scan_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on scan_page).
   */
  void
  _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
@@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
  void
  _bt_parallel_done(IndexScanDesc scan)
  {
-   BTScanOpaque so = (BTScanOpaque) scan->opaque;
     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     BTParallelScanDesc btscan;
     bool        status_changed = false;
@@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan)
                                                   parallel_scan->ps_offset);
  
     /*
-    * Mark the parallel scan as done for this combination of scan keys,
-    * unless some other process already did so.  See also
-    * _bt_advance_array_keys.
+    * Mark the parallel scan as done, unless some other process did so
+    * already.
      */
     SpinLockAcquire(&btscan->btps_mutex);
-   if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
-       btscan->btps_pageStatus != BTPARALLEL_DONE)
+   if (btscan->btps_pageStatus != BTPARALLEL_DONE)
     {
         btscan->btps_pageStatus = BTPARALLEL_DONE;
         status_changed = true;
@@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan)
  }
  
  /*
- * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
- *         keys.
+ * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
   *
- * Updates the count of array keys processed for both local and parallel
- * scans.
+ * Caller passes the block number most recently passed to _bt_parallel_release
+ * by its backend.  Caller successfully schedules the next primitive index scan
+ * if the shared parallel state hasn't been seized since caller's backend last
+ * advanced the scan.
   */
  void
-_bt_parallel_advance_array_keys(IndexScanDesc scan)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     BTParallelScanDesc btscan;
  
+   Assert(so->numArrayKeys);
+
     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                   parallel_scan->ps_offset);
  
-   so->arrayKeyCount++;
     SpinLockAcquire(&btscan->btps_mutex);
-   if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+   if (btscan->btps_scanPage == prev_scan_page &&
+       btscan->btps_pageStatus == BTPARALLEL_IDLE)
     {
         btscan->btps_scanPage = InvalidBlockNumber;
-       btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-       btscan->btps_arrayKeyCount++;
+       btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+
+       /* Serialize scan's current array keys */
+       for (int i = 0; i < so->numArrayKeys; i++)
+       {
+           BTArrayKeyInfo *array = &so->arrayKeys[i];
+
+           btscan->btps_arrElems[i] = array->cur_elem;
+       }
     }
     SpinLockRelease(&btscan->btps_mutex);
  }
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c

index e3fff90d8e421a119c2b3e7290bdc89a381c29d0..d241e8ea1dcce5b6c5ee1a5a932ebc4b19c6a68c 100644 (file)
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -907,7 +907,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
      */
     if (!so->qual_ok)
     {
-       /* Notify any other workers that we're done with this scan key. */
         _bt_parallel_done(scan);
         return false;
     }
@@ -917,10 +916,22 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
      * scan has not started, proceed to find out first leaf page in the usual
      * way while keeping other participating processes waiting.  If the scan
      * has already begun, use the page number from the shared structure.
+    *
+    * When a parallel scan has another primitive index scan scheduled, a
+    * parallel worker will seize the scan for that purpose now.  This is
+    * similar to the case where the top-level scan hasn't started.
      */
     if (scan->parallel_scan != NULL)
     {
-       status = _bt_parallel_seize(scan, &blkno);
+       status = _bt_parallel_seize(scan, &blkno, true);
+
+       /*
+        * Initialize arrays (when _bt_parallel_seize didn't already set up
+        * the next primitive index scan)
+        */
+       if (so->numArrayKeys && !so->needPrimScan)
+           _bt_start_array_keys(scan, dir);
+
         if (!status)
             return false;
         else if (blkno == P_NONE)
@@ -935,6 +946,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
             goto readcomplete;
         }
     }
+   else if (so->numArrayKeys && !so->needPrimScan)
+   {
+       /*
+        * First _bt_first call (for current btrescan) without parallelism.
+        *
+        * Initialize arrays, and the corresponding scan keys that were just
+        * output by _bt_preprocess_keys.
+        */
+       _bt_start_array_keys(scan, dir);
+   }
  
     /*----------
      * Examine the scan keys to discover where we need to start the scan.
@@ -980,6 +1001,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
      *
      * The selected scan keys (at most one per index column) are remembered by
      * storing their addresses into the local startKeys[] array.
+    *
+    * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
+    * the next primitive index scan (for scans with array keys) based in part
+    * on an understanding of how it'll enable us to reposition the scan.
+    * They're directly aware of how we'll sometimes cons up an explicit
+    * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
+    * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
+    * scans to skip large groups of NULLs through repeated deductions about
+    * key strictness (for a required inequality key) and whether NULLs in the
+    * key's index column are stored last or first (relative to non-NULLs).
+    * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
+    * need to be kept in sync.
      *----------
      */
     strat_total = BTEqualStrategyNumber;
@@ -1502,7 +1535,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
   * We scan the current page starting at offnum and moving in the indicated
   * direction.  All items matching the scan keys are loaded into currPos.items.
   * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
- * that there can be no more matching tuples in the current scan direction.
+ * that there can be no more matching tuples in the current scan direction
+ * (could just be for the current primitive index scan when scan has arrays).
   *
   * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
   * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
@@ -1527,11 +1561,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
     BTPageOpaque opaque;
     OffsetNumber minoff;
     OffsetNumber maxoff;
-   int         itemIndex;
-   bool        continuescan;
-   int         indnatts;
-   bool        continuescanPrechecked;
-   bool        haveFirstMatch = false;
+   BTReadPageState pstate;
+   bool        arrayKeys;
+   int         itemIndex,
+               indnatts;
  
     /*
      * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1546,16 +1579,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
     if (scan->parallel_scan)
     {
         if (ScanDirectionIsForward(dir))
-           _bt_parallel_release(scan, opaque->btpo_next);
+           pstate.prev_scan_page = opaque->btpo_next;
         else
-           _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+           pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
+
+       _bt_parallel_release(scan, pstate.prev_scan_page);
     }
  
-   continuescan = true;        /* default assumption */
     indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+   arrayKeys = so->numArrayKeys != 0;
     minoff = P_FIRSTDATAKEY(opaque);
     maxoff = PageGetMaxOffsetNumber(page);
  
+   /* initialize page-level state that we'll pass to _bt_checkkeys */
+   pstate.dir = dir;
+   pstate.minoff = minoff;
+   pstate.maxoff = maxoff;
+   pstate.finaltup = NULL;
+   pstate.page = page;
+   pstate.offnum = InvalidOffsetNumber;
+   pstate.skip = InvalidOffsetNumber;
+   pstate.continuescan = true; /* default assumption */
+   pstate.prechecked = false;
+   pstate.firstmatch = false;
+   pstate.rechecks = 0;
+   pstate.targetdistance = 0;
+
     /*
      * We note the buffer's block number so that we can release the pin later.
      * This allows us to re-read the buffer if it is needed again for hinting.
@@ -1598,10 +1647,34 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
      * corresponding value from the last item on the page.  So checking with
      * the last item on the page would give a more precise answer.
      *
-    * We skip this for the first page in the scan to evade the possible
-    * slowdown of the point queries.
+    * We skip this for the first page read by each (primitive) scan, to avoid
+    * slowing down point queries.  They typically don't stand to gain much
+    * when the optimization can be applied, and are more likely to notice the
+    * overhead of the precheck.
+    *
+    * The optimization is unsafe and must be avoided whenever _bt_checkkeys
+    * just set a low-order required array's key to the best available match
+    * for a truncated -inf attribute value from the prior page's high key
+    * (array element 0 is always the best available match in this scenario).
+    * It's quite likely that matches for array element 0 begin on this page,
+    * but the start of matches won't necessarily align with page boundaries.
+    * When the start of matches is somewhere in the middle of this page, it
+    * would be wrong to treat page's final non-pivot tuple as representative.
+    * Doing so might lead us to treat some of the page's earlier tuples as
+    * being part of a group of tuples thought to satisfy the required keys.
+    *
+    * Note: Conversely, in the case where the scan's arrays just advanced
+    * using the prior page's HIKEY _without_ advancement setting scanBehind,
+    * the start of matches must be aligned with page boundaries, which makes
+    * it safe to attempt the optimization here now.  It's also safe when the
+    * prior page's HIKEY simply didn't need to advance any required array. In
+    * both cases we can safely assume that the _first_ tuple from this page
+    * must be >= the current set of array keys/equality constraints. And so
+    * if the final tuple is == those same keys (and also satisfies any
+    * required < or <= strategy scan keys) during the precheck, we can safely
+    * assume that this must also be true of all earlier tuples from the page.
      */
-   if (!firstPage && minoff < maxoff)
+   if (!firstPage && !so->scanBehind && minoff < maxoff)
     {
         ItemId      iid;
         IndexTuple  itup;
@@ -1609,22 +1682,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
         iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
         itup = (IndexTuple) PageGetItem(page, iid);
  
-       /*
-        * Do the precheck.  Note that we pass the pointer to the
-        * 'continuescanPrechecked' to the 'continuescan' argument. That will
-        * set flag to true if all required keys are satisfied and false
-        * otherwise.
-        */
-       (void) _bt_checkkeys(scan, itup, indnatts, dir,
-                            &continuescanPrechecked, false, false);
-   }
-   else
-   {
-       continuescanPrechecked = false;
+       /* Call with arrayKeys=false to avoid undesirable side-effects */
+       _bt_checkkeys(scan, &pstate, false, itup, indnatts);
+       pstate.prechecked = pstate.continuescan;
+       pstate.continuescan = true; /* reset */
     }
  
     if (ScanDirectionIsForward(dir))
     {
+       /* SK_SEARCHARRAY forward scans must provide high key up front */
+       if (arrayKeys && !P_RIGHTMOST(opaque))
+       {
+           ItemId      iid = PageGetItemId(page, P_HIKEY);
+
+           pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+       }
+
         /* load items[] in ascending order */
         itemIndex = 0;
  
@@ -1649,23 +1722,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
             itup = (IndexTuple) PageGetItem(page, iid);
             Assert(!BTreeTupleIsPivot(itup));
  
-           passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-                                        &continuescan,
-                                        continuescanPrechecked,
-                                        haveFirstMatch);
+           pstate.offnum = offnum;
+           passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+                                        itup, indnatts);
  
             /*
-            * If the result of prechecking required keys was true, then in
-            * assert-enabled builds we also recheck that the _bt_checkkeys()
-            * result is the same.
+            * Check if we need to skip ahead to a later tuple (only possible
+            * when the scan uses array keys)
              */
-           Assert((!continuescanPrechecked && haveFirstMatch) ||
-                  passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
-                                                &continuescan, false, false));
+           if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+           {
+               Assert(!passes_quals && pstate.continuescan);
+               Assert(offnum < pstate.skip);
+
+               offnum = pstate.skip;
+               pstate.skip = InvalidOffsetNumber;
+               continue;
+           }
+
             if (passes_quals)
             {
                 /* tuple passes all scan key conditions */
-               haveFirstMatch = true;
+               pstate.firstmatch = true;
                 if (!BTreeTupleIsPosting(itup))
                 {
                     /* Remember it */
@@ -1696,7 +1774,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                 }
             }
             /* When !continuescan, there can't be any more matches, so stop */
-           if (!continuescan)
+           if (!pstate.continuescan)
                 break;
  
             offnum = OffsetNumberNext(offnum);
@@ -1713,17 +1791,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
          * only appear on non-pivot tuples on the right sibling page are
          * common.
          */
-       if (continuescan && !P_RIGHTMOST(opaque))
+       if (pstate.continuescan && !P_RIGHTMOST(opaque))
         {
             ItemId      iid = PageGetItemId(page, P_HIKEY);
             IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
             int         truncatt;
  
             truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-           _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false);
+           pstate.prechecked = false;  /* precheck didn't cover HIKEY */
+           _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
         }
  
-       if (!continuescan)
+       if (!pstate.continuescan)
             so->currPos.moreRight = false;
  
         Assert(itemIndex <= MaxTIDsPerBTreePage);
@@ -1733,6 +1812,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
     }
     else
     {
+       /* SK_SEARCHARRAY backward scans must provide final tuple up front */
+       if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+       {
+           ItemId      iid = PageGetItemId(page, minoff);
+
+           pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+       }
+
         /* load items[] in descending order */
         itemIndex = MaxTIDsPerBTreePage;
  
@@ -1772,23 +1859,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
             itup = (IndexTuple) PageGetItem(page, iid);
             Assert(!BTreeTupleIsPivot(itup));
  
-           passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-                                        &continuescan,
-                                        continuescanPrechecked,
-                                        haveFirstMatch);
+           pstate.offnum = offnum;
+           passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+                                        itup, indnatts);
  
             /*
-            * If the result of prechecking required keys was true, then in
-            * assert-enabled builds we also recheck that the _bt_checkkeys()
-            * result is the same.
+            * Check if we need to skip ahead to a later tuple (only possible
+            * when the scan uses array keys)
              */
-           Assert((!continuescanPrechecked && !haveFirstMatch) ||
-                  passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
-                                                &continuescan, false, false));
+           if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+           {
+               Assert(!passes_quals && pstate.continuescan);
+               Assert(offnum > pstate.skip);
+
+               offnum = pstate.skip;
+               pstate.skip = InvalidOffsetNumber;
+               continue;
+           }
+
             if (passes_quals && tuple_alive)
             {
                 /* tuple passes all scan key conditions */
-               haveFirstMatch = true;
+               pstate.firstmatch = true;
                 if (!BTreeTupleIsPosting(itup))
                 {
                     /* Remember it */
@@ -1824,7 +1916,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                     }
                 }
             }
-           if (!continuescan)
+           if (!pstate.continuescan)
             {
                 /* there can't be any more matches, so stop */
                 so->currPos.moreLeft = false;
@@ -1970,6 +2062,31 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
                    so->currPos.nextTupleOffset);
         so->markPos.itemIndex = so->markItemIndex;
         so->markItemIndex = -1;
+
+       /*
+        * If we're just about to start the next primitive index scan
+        * (possible with a scan that has array keys, and needs to skip to
+        * continue in the current scan direction), moreLeft/moreRight only
+        * indicate the end of the current primitive index scan.  They must
+        * never be taken to indicate that the top-level index scan has ended
+        * (that would be wrong).
+        *
+        * We could handle this case by treating the current array keys as
+        * markPos state.  But depending on the current array state like this
+        * would add complexity.  Instead, we just unset markPos's copy of
+        * moreRight or moreLeft (whichever might be affected), while making
+        * btrestrpos reset the scan's arrays to their initial scan positions.
+        * In effect, btrestrpos leaves advancing the arrays up to the first
+        * _bt_readpage call (that takes place after it has restored markPos).
+        */
+       Assert(so->markPos.dir == dir);
+       if (so->needPrimScan)
+       {
+           if (ScanDirectionIsForward(dir))
+               so->markPos.moreRight = true;
+           else
+               so->markPos.moreLeft = true;
+       }
     }
  
     if (ScanDirectionIsForward(dir))
@@ -1981,7 +2098,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
              * Seize the scan to get the next block number; if the scan has
              * ended already, bail out.
              */
-           status = _bt_parallel_seize(scan, &blkno);
+           status = _bt_parallel_seize(scan, &blkno, false);
             if (!status)
             {
                 /* release the previous buffer, if pinned */
@@ -2013,7 +2130,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
              * Seize the scan to get the current block number; if the scan has
              * ended already, bail out.
              */
-           status = _bt_parallel_seize(scan, &blkno);
+           status = _bt_parallel_seize(scan, &blkno, false);
             BTScanPosUnpinIfPinned(so->currPos);
             if (!status)
             {
@@ -2097,7 +2214,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
             if (scan->parallel_scan != NULL)
             {
                 _bt_relbuf(rel, so->currPos.buf);
-               status = _bt_parallel_seize(scan, &blkno);
+               status = _bt_parallel_seize(scan, &blkno, false);
                 if (!status)
                 {
                     BTScanPosInvalidate(so->currPos);
@@ -2193,7 +2310,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
             if (scan->parallel_scan != NULL)
             {
                 _bt_relbuf(rel, so->currPos.buf);
-               status = _bt_parallel_seize(scan, &blkno);
+               status = _bt_parallel_seize(scan, &blkno, false);
                 if (!status)
                 {
                     BTScanPosInvalidate(so->currPos);
@@ -2218,6 +2335,8 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
  
+   Assert(!so->needPrimScan);
+
     _bt_initialize_more_data(so, dir);
  
     if (!_bt_readnextpage(scan, blkno, dir))
@@ -2524,14 +2643,22 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
  }
  
  /*
- * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
- * for scan direction
+ * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
+ * from currPos
   */
  static inline void
  _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
  {
-   /* initialize moreLeft/moreRight appropriately for scan direction */
-   if (ScanDirectionIsForward(dir))
+   so->currPos.dir = dir;
+   if (so->needPrimScan)
+   {
+       Assert(so->numArrayKeys);
+
+       so->currPos.moreLeft = true;
+       so->currPos.moreRight = true;
+       so->needPrimScan = false;
+   }
+   else if (ScanDirectionIsForward(dir))
     {
         so->currPos.moreLeft = false;
         so->currPos.moreRight = true;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c

index d50317096da347933562ec6d10109dcb3fcbd083..e963de78a7bea106752e82d2bdf135fbae57af35 100644 (file)
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -29,29 +29,77 @@
  #include "utils/memutils.h"
  #include "utils/rel.h"
  
+#define LOOK_AHEAD_REQUIRED_RECHECKS   3
+#define LOOK_AHEAD_DEFAULT_DISTANCE    5
  
  typedef struct BTSortArrayContext
  {
-   FmgrInfo    flinfo;
+   FmgrInfo   *sortproc;
     Oid         collation;
     bool        reverse;
  } BTSortArrayContext;
  
+typedef struct BTScanKeyPreproc
+{
+   ScanKey     skey;
+   int         ikey;
+   int         arrayidx;
+} BTScanKeyPreproc;
+
+static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
+                               FmgrInfo *orderproc, FmgrInfo **sortprocp);
  static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
-                                     StrategyNumber strat,
+                                     Oid elemtype, StrategyNumber strat,
                                       Datum *elems, int nelems);
-static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
-                                   bool reverse,
-                                   Datum *elems, int nelems);
+static int _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc,
+                                   bool reverse, Datum *elems, int nelems);
+static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey,
+                            FmgrInfo *sortproc, bool reverse,
+                            Oid origelemtype, Oid nextelemtype,
+                            Datum *elems_orig, int *nelems_orig,
+                            Datum *elems_next, int nelems_next);
+static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
+                                          ScanKey arraysk, ScanKey skey,
+                                          FmgrInfo *orderproc, BTArrayKeyInfo *array,
+                                          bool *qual_ok);
+static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan);
+static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
  static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
+static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
+                                          Datum tupdatum, bool tupnull,
+                                          Datum arrdatum, ScanKey cur);
+static int _bt_binsrch_array_skey(FmgrInfo *orderproc,
+                                  bool cur_elem_trig, ScanDirection dir,
+                                  Datum tupdatum, bool tupnull,
+                                  BTArrayKeyInfo *array, ScanKey cur,
+                                  int32 *set_elem_result);
+static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
+static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
+                                        IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
+                                        bool readpagetup, int sktrig, bool *scanBehind);
+static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
+                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+                                  int sktrig, bool sktrig_required);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
+#endif
  static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                                      ScanKey leftarg, ScanKey rightarg,
+                                    BTArrayKeyInfo *array, FmgrInfo *orderproc,
                                      bool *result);
  static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
  static void _bt_mark_scankey_required(ScanKey skey);
+static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
+                             IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+                             bool advancenonrequired, bool prechecked, bool firstmatch,
+                             bool *continuescan, int *ikey);
  static bool _bt_check_rowcompare(ScanKey skey,
                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
                                  ScanDirection dir, bool *continuescan);
+static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+                                    int tupnatts, TupleDesc tupdesc);
  static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
                            IndexTuple firstright, BTScanInsert itup_key);
  
@@ -188,29 +236,55 @@ _bt_freestack(BTStack stack)
   *
   * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
   * set up BTArrayKeyInfo info for each one that is an equality-type key.
- * Prepare modified scan keys in so->arrayKeyData, which will hold the current
- * array elements during each primitive indexscan operation.  For inequality
- * array keys, it's sufficient to find the extreme element value and replace
- * the whole array with that scalar value.
- *
- * Note: the reason we need so->arrayKeyData, rather than just scribbling
- * on scan->keyData, is that callers are permitted to call btrescan without
- * supplying a new set of scankey data.
+ * Returns modified scan keys as input for further, standard preprocessing.
+ *
+ * Currently we perform two kinds of preprocessing to deal with redundancies.
+ * For inequality array keys, it's sufficient to find the extreme element
+ * value and replace the whole array with that scalar value.  This eliminates
+ * all but one array element as redundant.  Similarly, we are capable of
+ * "merging together" multiple equality array keys (from two or more input
+ * scan keys) into a single output scan key containing only the intersecting
+ * array elements.  This can eliminate many redundant array elements, as well
+ * as eliminating whole array scan keys as redundant.  It can also allow us to
+ * detect contradictory quals.
+ *
+ * It is convenient for _bt_preprocess_keys caller to have to deal with no
+ * more than one equality strategy array scan key per index attribute.  We'll
+ * always be able to set things up that way when complete opfamilies are used.
+ * Eliminated array scan keys can be recognized as those that have had their
+ * sk_strategy field set to InvalidStrategy here by us.  Caller should avoid
+ * including these in the scan's so->keyData[] output array.
+ *
+ * We set the scan key references from the scan's BTArrayKeyInfo info array to
+ * offsets into the temp modified input array returned to caller.  Scans that
+ * have array keys should call _bt_preprocess_array_keys_final when standard
+ * preprocessing steps are complete.  This will convert the scan key offset
+ * references into references to the scan's so->keyData[] output scan keys.
+ *
+ * Note: the reason we need to return a temp scan key array, rather than just
+ * scribbling on scan->keyData, is that callers are permitted to call btrescan
+ * without supplying a new set of scankey data.
   */
-void
+static ScanKey
  _bt_preprocess_array_keys(IndexScanDesc scan)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   Relation    rel = scan->indexRelation;
     int         numberOfKeys = scan->numberOfKeys;
-   int16      *indoption = scan->indexRelation->rd_indoption;
+   int16      *indoption = rel->rd_indoption;
     int         numArrayKeys;
+   int         origarrayatt = InvalidAttrNumber,
+               origarraykey = -1;
+   Oid         origelemtype = InvalidOid;
     ScanKey     cur;
-   int         i;
     MemoryContext oldContext;
+   ScanKey     arrayKeyData;   /* modified copy of scan->keyData */
+
+   Assert(numberOfKeys);
  
     /* Quick check to see if there are any array keys */
     numArrayKeys = 0;
-   for (i = 0; i < numberOfKeys; i++)
+   for (int i = 0; i < numberOfKeys; i++)
     {
         cur = &scan->keyData[i];
         if (cur->sk_flags & SK_SEARCHARRAY)
@@ -220,20 +294,15 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
             /* If any arrays are null as a whole, we can quit right now. */
             if (cur->sk_flags & SK_ISNULL)
             {
-               so->numArrayKeys = -1;
-               so->arrayKeyData = NULL;
-               return;
+               so->qual_ok = false;
+               return NULL;
             }
         }
     }
  
     /* Quit if nothing to do. */
     if (numArrayKeys == 0)
-   {
-       so->numArrayKeys = 0;
-       so->arrayKeyData = NULL;
-       return;
-   }
+       return NULL;
  
     /*
      * Make a scan-lifespan context to hold array-associated data, or reset it
@@ -249,18 +318,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
     oldContext = MemoryContextSwitchTo(so->arrayContext);
  
     /* Create modifiable copy of scan->keyData in the workspace context */
-   so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
-   memcpy(so->arrayKeyData,
-          scan->keyData,
-          scan->numberOfKeys * sizeof(ScanKeyData));
+   arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
+   memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData));
  
     /* Allocate space for per-array data in the workspace context */
-   so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
+   so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
+
+   /* Allocate space for ORDER procs used to help _bt_checkkeys */
+   so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
  
     /* Now process each array key */
     numArrayKeys = 0;
-   for (i = 0; i < numberOfKeys; i++)
+   for (int i = 0; i < numberOfKeys; i++)
     {
+       FmgrInfo    sortproc;
+       FmgrInfo   *sortprocp = &sortproc;
+       Oid         elemtype;
+       bool        reverse;
         ArrayType  *arrayval;
         int16       elmlen;
         bool        elmbyval;
@@ -271,7 +345,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
         int         num_nonnulls;
         int         j;
  
-       cur = &so->arrayKeyData[i];
+       cur = &arrayKeyData[i];
         if (!(cur->sk_flags & SK_SEARCHARRAY))
             continue;
  
@@ -305,10 +379,21 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
         /* If there's no non-nulls, the scan qual is unsatisfiable */
         if (num_nonnulls == 0)
         {
-           numArrayKeys = -1;
+           so->qual_ok = false;
             break;
         }
  
+       /*
+        * Determine the nominal datatype of the array elements.  We have to
+        * support the convention that sk_subtype == InvalidOid means the
+        * opclass input type; this is a hack to simplify life for
+        * ScanKeyInit().
+        */
+       elemtype = cur->sk_subtype;
+       if (elemtype == InvalidOid)
+           elemtype = rel->rd_opcintype[cur->sk_attno - 1];
+       Assert(elemtype == ARR_ELEMTYPE(arrayval));
+
         /*
          * If the comparison operator is not equality, then the array qual
          * degenerates to a simple comparison against the smallest or largest
@@ -319,7 +404,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
             case BTLessStrategyNumber:
             case BTLessEqualStrategyNumber:
                 cur->sk_argument =
-                   _bt_find_extreme_element(scan, cur,
+                   _bt_find_extreme_element(scan, cur, elemtype,
                                              BTGreaterStrategyNumber,
                                              elem_values, num_nonnulls);
                 continue;
@@ -329,7 +414,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
             case BTGreaterEqualStrategyNumber:
             case BTGreaterStrategyNumber:
                 cur->sk_argument =
-                   _bt_find_extreme_element(scan, cur,
+                   _bt_find_extreme_element(scan, cur, elemtype,
                                              BTLessStrategyNumber,
                                              elem_values, num_nonnulls);
                 continue;
@@ -339,17 +424,93 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                 break;
         }
  
+       /*
+        * We'll need a 3-way ORDER proc to perform binary searches for the
+        * next matching array element.  Set that up now.
+        *
+        * Array scan keys with cross-type equality operators will require a
+        * separate same-type ORDER proc for sorting their array.  Otherwise,
+        * sortproc just points to the same proc used during binary searches.
+        */
+       _bt_setup_array_cmp(scan, cur, elemtype,
+                           &so->orderProcs[i], &sortprocp);
+
         /*
          * Sort the non-null elements and eliminate any duplicates.  We must
          * sort in the same ordering used by the index column, so that the
-        * successive primitive indexscans produce data in index order.
+        * arrays can be advanced in lockstep with the scan's progress through
+        * the index's key space.
          */
-       num_elems = _bt_sort_array_elements(scan, cur,
-                                           (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
+       reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0;
+       num_elems = _bt_sort_array_elements(cur, sortprocp, reverse,
                                             elem_values, num_nonnulls);
  
+       if (origarrayatt == cur->sk_attno)
+       {
+           BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey];
+
+           /*
+            * This array scan key is redundant with a previous equality
+            * operator array scan key.  Merge the two arrays together to
+            * eliminate contradictory non-intersecting elements (or try to).
+            *
+            * We merge this next array back into attribute's original array.
+            */
+           Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno);
+           Assert(arrayKeyData[orig->scan_key].sk_collation ==
+                  cur->sk_collation);
+           if (_bt_merge_arrays(scan, cur, sortprocp, reverse,
+                                origelemtype, elemtype,
+                                orig->elem_values, &orig->num_elems,
+                                elem_values, num_elems))
+           {
+               /* Successfully eliminated this array */
+               pfree(elem_values);
+
+               /*
+                * If no intersecting elements remain in the original array,
+                * the scan qual is unsatisfiable
+                */
+               if (orig->num_elems == 0)
+               {
+                   so->qual_ok = false;
+                   break;
+               }
+
+               /*
+                * Indicate to _bt_preprocess_keys caller that it must ignore
+                * this scan key
+                */
+               cur->sk_strategy = InvalidStrategy;
+               continue;
+           }
+
+           /*
+            * Unable to merge this array with previous array due to a lack of
+            * suitable cross-type opfamily support.  Will need to keep both
+            * scan keys/arrays.
+            */
+       }
+       else
+       {
+           /*
+            * This array is the first for current index attribute.
+            *
+            * If it turns out to not be the last array (that is, if the next
+            * array is redundantly applied to this same index attribute),
+            * we'll then treat this array as the attribute's "original" array
+            * when merging.
+            */
+           origarrayatt = cur->sk_attno;
+           origarraykey = numArrayKeys;
+           origelemtype = elemtype;
+       }
+
         /*
          * And set up the BTArrayKeyInfo data.
+        *
+        * Note: _bt_preprocess_array_keys_final will fix-up each array's
+        * scan_key field later on, after so->keyData[] has been finalized.
          */
         so->arrayKeys[numArrayKeys].scan_key = i;
         so->arrayKeys[numArrayKeys].num_elems = num_elems;
@@ -360,6 +521,256 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
     so->numArrayKeys = numArrayKeys;
  
     MemoryContextSwitchTo(oldContext);
+
+   return arrayKeyData;
+}
+
+/*
+ * _bt_preprocess_array_keys_final() -- fix up array scan key references
+ *
+ * When _bt_preprocess_array_keys performed initial array preprocessing, it
+ * set each array's array->scan_key to its scan key's arrayKeyData[] offset
+ * (which also works as a reference into the original scan->keyData[] array).
+ * This function handles translation of the scan key references from the
+ * BTArrayKeyInfo info array, from input scan key references (to the keys in
+ * scan->keyData[]), into output references (to the keys in so->keyData[]).
+ * Caller's keyDataMap[] array tells us how to perform this remapping.
+ *
+ * Also finalizes so->orderProcs[] for the scan.  Arrays already have an ORDER
+ * proc, which might need to be repositioned to its so->keyData[]-wise offset
+ * (very much like the remapping that we apply to array->scan_key references).
+ * Non-array equality strategy scan keys (that survived preprocessing) don't
+ * yet have an so->orderProcs[] entry, so we set one for them here.
+ *
+ * Also converts single-element array scan keys into equivalent non-array
+ * equality scan keys, which decrements so->numArrayKeys.  It's possible that
+ * this will leave this new btrescan without any arrays at all.  This isn't
+ * necessary for correctness; it's just an optimization.  Non-array equality
+ * scan keys are slightly faster than equivalent array scan keys at runtime.
+ */
+static void
+_bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   Relation    rel = scan->indexRelation;
+   int         arrayidx = 0;
+   int         last_equal_output_ikey PG_USED_FOR_ASSERTS_ONLY = -1;
+
+   Assert(so->qual_ok);
+   Assert(so->numArrayKeys);
+
+   for (int output_ikey = 0; output_ikey < so->numberOfKeys; output_ikey++)
+   {
+       ScanKey     outkey = so->keyData + output_ikey;
+       int         input_ikey;
+       bool        found PG_USED_FOR_ASSERTS_ONLY = false;
+
+       Assert(outkey->sk_strategy != InvalidStrategy);
+
+       if (outkey->sk_strategy != BTEqualStrategyNumber)
+           continue;
+
+       input_ikey = keyDataMap[output_ikey];
+
+       Assert(last_equal_output_ikey < output_ikey);
+       Assert(last_equal_output_ikey < input_ikey);
+       last_equal_output_ikey = output_ikey;
+
+       /*
+        * We're lazy about looking up ORDER procs for non-array keys, since
+        * not all input keys become output keys.  Take care of it now.
+        */
+       if (!(outkey->sk_flags & SK_SEARCHARRAY))
+       {
+           Oid         elemtype;
+
+           /* No need for an ORDER proc given an IS NULL scan key */
+           if (outkey->sk_flags & SK_SEARCHNULL)
+               continue;
+
+           /*
+            * A non-required scan key doesn't need an ORDER proc, either
+            * (unless it's associated with an array, which this one isn't)
+            */
+           if (!(outkey->sk_flags & SK_BT_REQFWD))
+               continue;
+
+           elemtype = outkey->sk_subtype;
+           if (elemtype == InvalidOid)
+               elemtype = rel->rd_opcintype[outkey->sk_attno - 1];
+
+           _bt_setup_array_cmp(scan, outkey, elemtype,
+                               &so->orderProcs[output_ikey], NULL);
+           continue;
+       }
+
+       /*
+        * Reorder existing array scan key so->orderProcs[] entries.
+        *
+        * Doing this in-place is safe because preprocessing is required to
+        * output all equality strategy scan keys in original input order
+        * (among each group of entries against the same index attribute).
+        * This is also the order that the arrays themselves appear in.
+        */
+       so->orderProcs[output_ikey] = so->orderProcs[input_ikey];
+
+       /* Fix-up array->scan_key references for arrays */
+       for (; arrayidx < so->numArrayKeys; arrayidx++)
+       {
+           BTArrayKeyInfo *array = &so->arrayKeys[arrayidx];
+
+           Assert(array->num_elems > 0);
+
+           if (array->scan_key == input_ikey)
+           {
+               /* found it */
+               array->scan_key = output_ikey;
+               found = true;
+
+               /*
+                * Transform array scan keys that have exactly 1 element
+                * remaining (following all prior preprocessing) into
+                * equivalent non-array scan keys.
+                */
+               if (array->num_elems == 1)
+               {
+                   outkey->sk_flags &= ~SK_SEARCHARRAY;
+                   outkey->sk_argument = array->elem_values[0];
+                   so->numArrayKeys--;
+
+                   /* If we're out of array keys, we can quit right away */
+                   if (so->numArrayKeys == 0)
+                       return;
+
+                   /* Shift other arrays forward */
+                   memmove(array, array + 1,
+                           sizeof(BTArrayKeyInfo) *
+                           (so->numArrayKeys - arrayidx));
+
+                   /*
+                    * Don't increment arrayidx (there was an entry that was
+                    * just shifted forward to the offset at arrayidx, which
+                    * will still need to be matched)
+                    */
+               }
+               else
+               {
+                   /* Match found, so done with this array */
+                   arrayidx++;
+               }
+
+               break;
+           }
+       }
+
+       Assert(found);
+   }
+
+   /*
+    * Parallel index scans require space in shared memory to store the
+    * current array elements (for arrays kept by preprocessing) to schedule
+    * the next primitive index scan.  The underlying structure is protected
+    * using a spinlock, so defensively limit its size.  In practice this can
+    * only affect parallel scans that use an incomplete opfamily.
+    */
+   if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS)
+       ereport(ERROR,
+               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                errmsg_internal("number of array scan keys left by preprocessing (%d) exceeds the maximum allowed by parallel btree index scans (%d)",
+                                so->numArrayKeys, INDEX_MAX_KEYS)));
+}
+
+/*
+ * _bt_setup_array_cmp() -- Set up array comparison functions
+ *
+ * Sets ORDER proc in caller's orderproc argument, which is used during binary
+ * searches of arrays during the index scan.  Also sets a same-type ORDER proc
+ * in caller's *sortprocp argument, which is used when sorting the array.
+ *
+ * Preprocessing calls here with all equality strategy scan keys (when scan
+ * uses equality array keys), including those not associated with any array.
+ * See _bt_advance_array_keys for an explanation of why it'll need to treat
+ * simple scalar equality scan keys as degenerate single element arrays.
+ *
+ * Caller should pass an orderproc pointing to space that'll store the ORDER
+ * proc for the scan, and a *sortprocp pointing to its own separate space.
+ * When calling here for a non-array scan key, sortprocp arg should be NULL.
+ *
+ * In the common case where we don't need to deal with cross-type operators,
+ * only one ORDER proc is actually required by caller.  We'll set *sortprocp
+ * to point to the same memory that caller's orderproc continues to point to.
+ * Otherwise, *sortprocp will continue to point to caller's own space.  Either
+ * way, *sortprocp will point to a same-type ORDER proc (since that's the only
+ * safe way to sort/deduplicate the array associated with caller's scan key).
+ */
+static void
+_bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
+                   FmgrInfo *orderproc, FmgrInfo **sortprocp)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   Relation    rel = scan->indexRelation;
+   RegProcedure cmp_proc;
+   Oid         opcintype = rel->rd_opcintype[skey->sk_attno - 1];
+
+   Assert(skey->sk_strategy == BTEqualStrategyNumber);
+   Assert(OidIsValid(elemtype));
+
+   /*
+    * If scankey operator is not a cross-type comparison, we can use the
+    * cached comparison function; otherwise gotta look it up in the catalogs
+    */
+   if (elemtype == opcintype)
+   {
+       /* Set same-type ORDER procs for caller */
+       *orderproc = *index_getprocinfo(rel, skey->sk_attno, BTORDER_PROC);
+       if (sortprocp)
+           *sortprocp = orderproc;
+
+       return;
+   }
+
+   /*
+    * Look up the appropriate cross-type comparison function in the opfamily.
+    *
+    * Use the opclass input type as the left hand arg type, and the array
+    * element type as the right hand arg type (since binary searches use an
+    * index tuple's attribute value to search for a matching array element).
+    *
+    * Note: it's possible that this would fail, if the opfamily is
+    * incomplete, but only in cases where it's quite likely that _bt_first
+    * would fail in just the same way (had we not failed before it could).
+    */
+   cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+                                opcintype, elemtype, BTORDER_PROC);
+   if (!RegProcedureIsValid(cmp_proc))
+       elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+            BTORDER_PROC, opcintype, elemtype, skey->sk_attno,
+            RelationGetRelationName(rel));
+
+   /* Set cross-type ORDER proc for caller */
+   fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext);
+
+   /* Done if caller doesn't actually have an array they'll need to sort */
+   if (!sortprocp)
+       return;
+
+   /*
+    * Look up the appropriate same-type comparison function in the opfamily.
+    *
+    * Note: it's possible that this would fail, if the opfamily is
+    * incomplete, but it seems quite unlikely that an opfamily would omit
+    * non-cross-type comparison procs for any datatype that it supports at
+    * all.
+    */
+   cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+                                elemtype, elemtype, BTORDER_PROC);
+   if (!RegProcedureIsValid(cmp_proc))
+       elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+            BTORDER_PROC, elemtype, elemtype,
+            skey->sk_attno, RelationGetRelationName(rel));
+
+   /* Set same-type ORDER proc for caller */
+   fmgr_info_cxt(cmp_proc, *sortprocp, so->arrayContext);
  }
  
  /*
@@ -370,27 +781,17 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
   * least element, or BTGreaterStrategyNumber to get the greatest.
   */
  static Datum
-_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
+_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype,
                          StrategyNumber strat,
                          Datum *elems, int nelems)
  {
     Relation    rel = scan->indexRelation;
-   Oid         elemtype,
-               cmp_op;
+   Oid         cmp_op;
     RegProcedure cmp_proc;
     FmgrInfo    flinfo;
     Datum       result;
     int         i;
  
-   /*
-    * Determine the nominal datatype of the array elements.  We have to
-    * support the convention that sk_subtype == InvalidOid means the opclass
-    * input type; this is a hack to simplify life for ScanKeyInit().
-    */
-   elemtype = skey->sk_subtype;
-   if (elemtype == InvalidOid)
-       elemtype = rel->rd_opcintype[skey->sk_attno - 1];
-
     /*
      * Look up the appropriate comparison operator in the opfamily.
      *
@@ -399,6 +800,8 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
      * non-cross-type comparison operators for any datatype that it supports
      * at all.
      */
+   Assert(skey->sk_strategy != BTEqualStrategyNumber);
+   Assert(OidIsValid(elemtype));
     cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
                                  elemtype,
                                  elemtype,
@@ -433,50 +836,21 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
   * The array elements are sorted in-place, and the new number of elements
   * after duplicate removal is returned.
   *
- * scan and skey identify the index column, whose opfamily determines the
- * comparison semantics.  If reverse is true, we sort in descending order.
+ * skey identifies the index column whose opfamily determines the comparison
+ * semantics, and sortproc is a corresponding ORDER proc.  If reverse is true,
+ * we sort in descending order.
   */
  static int
-_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
-                       bool reverse,
+_bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, bool reverse,
                         Datum *elems, int nelems)
  {
-   Relation    rel = scan->indexRelation;
-   Oid         elemtype;
-   RegProcedure cmp_proc;
     BTSortArrayContext cxt;
  
     if (nelems <= 1)
         return nelems;          /* no work to do */
  
-   /*
-    * Determine the nominal datatype of the array elements.  We have to
-    * support the convention that sk_subtype == InvalidOid means the opclass
-    * input type; this is a hack to simplify life for ScanKeyInit().
-    */
-   elemtype = skey->sk_subtype;
-   if (elemtype == InvalidOid)
-       elemtype = rel->rd_opcintype[skey->sk_attno - 1];
-
-   /*
-    * Look up the appropriate comparison function in the opfamily.
-    *
-    * Note: it's possible that this would fail, if the opfamily is
-    * incomplete, but it seems quite unlikely that an opfamily would omit
-    * non-cross-type support functions for any datatype that it supports at
-    * all.
-    */
-   cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
-                                elemtype,
-                                elemtype,
-                                BTORDER_PROC);
-   if (!RegProcedureIsValid(cmp_proc))
-       elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
-            BTORDER_PROC, elemtype, elemtype,
-            rel->rd_opfamily[skey->sk_attno - 1]);
-
     /* Sort the array elements */
-   fmgr_info(cmp_proc, &cxt.flinfo);
+   cxt.sortproc = sortproc;
     cxt.collation = skey->sk_collation;
     cxt.reverse = reverse;
     qsort_arg(elems, nelems, sizeof(Datum),
@@ -488,189 +862,1610 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
  }
  
  /*
- * qsort_arg comparator for sorting array elements
- */
-static int
-_bt_compare_array_elements(const void *a, const void *b, void *arg)
-{
-   Datum       da = *((const Datum *) a);
-   Datum       db = *((const Datum *) b);
-   BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
-   int32       compare;
-
-   compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
-                                             cxt->collation,
-                                             da, db));
-   if (cxt->reverse)
-       INVERT_COMPARE_RESULT(compare);
-   return compare;
-}
-
-/*
- * _bt_start_array_keys() -- Initialize array keys at start of a scan
+ * _bt_merge_arrays() -- merge next array's elements into an original array
   *
- * Set up the cur_elem counters and fill in the first sk_argument value for
- * each array scankey.  We can't do this until we know the scan direction.
+ * Called when preprocessing encounters a pair of array equality scan keys,
+ * both against the same index attribute (during initial array preprocessing).
+ * Merging reorganizes caller's original array (the left hand arg) in-place,
+ * without ever copying elements from one array into the other. (Mixing the
+ * elements together like this would be wrong, since they don't necessarily
+ * use the same underlying element type, despite all the other similarities.)
+ *
+ * Both arrays must have already been sorted and deduplicated by calling
+ * _bt_sort_array_elements.  sortproc is the same-type ORDER proc that was
+ * just used to sort and deduplicate caller's "next" array.  We'll usually be
+ * able to reuse that ORDER proc to merge the arrays together now.  If not,
+ * then we'll perform a separate ORDER proc lookup.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
+ * may not be able to determine which elements are contradictory.  If we have
+ * the required ORDER proc then we return true (and validly set *nelems_orig),
+ * guaranteeing that at least the next array can be considered redundant.  We
+ * return false if the required comparisons cannot be made (caller must
+ * keep both arrays when this happens).
   */
-void
-_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
+static bool
+_bt_merge_arrays(IndexScanDesc scan, ScanKey skey, FmgrInfo *sortproc,
+                bool reverse, Oid origelemtype, Oid nextelemtype,
+                Datum *elems_orig, int *nelems_orig,
+                Datum *elems_next, int nelems_next)
  {
+   Relation    rel = scan->indexRelation;
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
-   int         i;
+   BTSortArrayContext cxt;
+   int         nelems_orig_start = *nelems_orig,
+               nelems_orig_merged = 0;
+   FmgrInfo   *mergeproc = sortproc;
+   FmgrInfo    crosstypeproc;
  
-   for (i = 0; i < so->numArrayKeys; i++)
+   Assert(skey->sk_strategy == BTEqualStrategyNumber);
+   Assert(OidIsValid(origelemtype) && OidIsValid(nextelemtype));
+
+   if (origelemtype != nextelemtype)
     {
-       BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
-       ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
+       RegProcedure cmp_proc;
  
-       Assert(curArrayKey->num_elems > 0);
-       if (ScanDirectionIsBackward(dir))
-           curArrayKey->cur_elem = curArrayKey->num_elems - 1;
-       else
-           curArrayKey->cur_elem = 0;
-       skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
+       /*
+        * Cross-array-element-type merging is required, so can't just reuse
+        * sortproc when merging
+        */
+       cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+                                    origelemtype, nextelemtype, BTORDER_PROC);
+       if (!RegProcedureIsValid(cmp_proc))
+       {
+           /* Can't make the required comparisons */
+           return false;
+       }
+
+       /* We have all we need to determine redundancy/contradictoriness */
+       mergeproc = &crosstypeproc;
+       fmgr_info_cxt(cmp_proc, mergeproc, so->arrayContext);
+   }
+
+   cxt.sortproc = mergeproc;
+   cxt.collation = skey->sk_collation;
+   cxt.reverse = reverse;
+
+   for (int i = 0, j = 0; i < nelems_orig_start && j < nelems_next;)
+   {
+       Datum      *oelem = elems_orig + i,
+                  *nelem = elems_next + j;
+       int         res = _bt_compare_array_elements(oelem, nelem, &cxt);
+
+       if (res == 0)
+       {
+           elems_orig[nelems_orig_merged++] = *oelem;
+           i++;
+           j++;
+       }
+       else if (res < 0)
+           i++;
+       else                    /* res > 0 */
+           j++;
     }
  
-   so->arraysStarted = true;
+   *nelems_orig = nelems_orig_merged;
+
+   return true;
  }
  
  /*
- * _bt_advance_array_keys() -- Advance to next set of array elements
+ * Compare an array scan key to a scalar scan key, eliminating contradictory
+ * array elements such that the scalar scan key becomes redundant.
   *
- * Returns true if there is another set of values to consider, false if not.
- * On true result, the scankeys are initialized with the next set of values.
+ * Array elements can be eliminated as contradictory when excluded by some
+ * other operator on the same attribute.  For example, with an index scan qual
+ * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1"
+ * are eliminated, and the < scan key is eliminated as redundant.  Cases where
+ * every array element is eliminated by a redundant scalar scan key have an
+ * unsatisfiable qual, which we handle by setting *qual_ok=false for caller.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
+ * may not be able to determine which elements are contradictory.  If we have
+ * the required ORDER proc then we return true (and validly set *qual_ok),
+ * guaranteeing that at least the scalar scan key can be considered redundant.
+ * We return false if the comparison could not be made (caller must keep both
+ * scan keys when this happens).
   */
-bool
-_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
+static bool
+_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey,
+                              FmgrInfo *orderproc, BTArrayKeyInfo *array,
+                              bool *qual_ok)
  {
-   BTScanOpaque so = (BTScanOpaque) scan->opaque;
-   bool        found = false;
-   int         i;
+   Relation    rel = scan->indexRelation;
+   Oid         opcintype = rel->rd_opcintype[arraysk->sk_attno - 1];
+   int         cmpresult = 0,
+               cmpexact = 0,
+               matchelem,
+               new_nelems = 0;
+   FmgrInfo    crosstypeproc;
+   FmgrInfo   *orderprocp = orderproc;
+
+   Assert(arraysk->sk_attno == skey->sk_attno);
+   Assert(array->num_elems > 0);
+   Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
+   Assert((arraysk->sk_flags & SK_SEARCHARRAY) &&
+          arraysk->sk_strategy == BTEqualStrategyNumber);
+   Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
+   Assert(!(skey->sk_flags & SK_SEARCHARRAY) ||
+          skey->sk_strategy != BTEqualStrategyNumber);
  
     /*
-    * We must advance the last array key most quickly, since it will
-    * correspond to the lowest-order index column among the available
-    * qualifications. This is necessary to ensure correct ordering of output
-    * when there are multiple array keys.
+    * _bt_binsrch_array_skey searches an array for the entry best matching a
+    * datum of opclass input type for the index's attribute (on-disk type).
+    * We can reuse the array's ORDER proc whenever the non-array scan key's
+    * type is a match for the corresponding attribute's input opclass type.
+    * Otherwise, we have to do another ORDER proc lookup so that our call to
+    * _bt_binsrch_array_skey applies the correct comparator.
+    *
+    * Note: we have to support the convention that sk_subtype == InvalidOid
+    * means the opclass input type; this is a hack to simplify life for
+    * ScanKeyInit().
      */
-   for (i = so->numArrayKeys - 1; i >= 0; i--)
+   if (skey->sk_subtype != opcintype && skey->sk_subtype != InvalidOid)
     {
-       BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
-       ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
-       int         cur_elem = curArrayKey->cur_elem;
-       int         num_elems = curArrayKey->num_elems;
+       RegProcedure cmp_proc;
+       Oid         arraysk_elemtype;
  
-       if (ScanDirectionIsBackward(dir))
+       /*
+        * Need an ORDER proc lookup to detect redundancy/contradictoriness
+        * with this pair of scankeys.
+        *
+        * Scalar scan key's argument will be passed to _bt_compare_array_skey
+        * as its tupdatum/lefthand argument (rhs arg is for array elements).
+        */
+       arraysk_elemtype = arraysk->sk_subtype;
+       if (arraysk_elemtype == InvalidOid)
+           arraysk_elemtype = rel->rd_opcintype[arraysk->sk_attno - 1];
+       cmp_proc = get_opfamily_proc(rel->rd_opfamily[arraysk->sk_attno - 1],
+                                    skey->sk_subtype, arraysk_elemtype,
+                                    BTORDER_PROC);
+       if (!RegProcedureIsValid(cmp_proc))
         {
-           if (--cur_elem < 0)
+           /* Can't make the comparison */
+           *qual_ok = false;   /* suppress compiler warnings */
+           return false;
+       }
+
+       /* We have all we need to determine redundancy/contradictoriness */
+       orderprocp = &crosstypeproc;
+       fmgr_info(cmp_proc, orderprocp);
+   }
+
+   matchelem = _bt_binsrch_array_skey(orderprocp, false,
+                                      NoMovementScanDirection,
+                                      skey->sk_argument, false, array,
+                                      arraysk, &cmpresult);
+
+   switch (skey->sk_strategy)
+   {
+       case BTLessStrategyNumber:
+           cmpexact = 1;       /* exclude exact match, if any */
+           /* FALL THRU */
+       case BTLessEqualStrategyNumber:
+           if (cmpresult >= cmpexact)
+               matchelem++;
+           /* Resize, keeping elements from the start of the array */
+           new_nelems = matchelem;
+           break;
+       case BTEqualStrategyNumber:
+           if (cmpresult != 0)
             {
-               cur_elem = num_elems - 1;
-               found = false;  /* need to advance next array key */
+               /* qual is unsatisfiable */
+               new_nelems = 0;
             }
             else
-               found = true;
+           {
+               /* Shift matching element to the start of the array, resize */
+               array->elem_values[0] = array->elem_values[matchelem];
+               new_nelems = 1;
+           }
+           break;
+       case BTGreaterEqualStrategyNumber:
+           cmpexact = 1;       /* include exact match, if any */
+           /* FALL THRU */
+       case BTGreaterStrategyNumber:
+           if (cmpresult >= cmpexact)
+               matchelem++;
+           /* Shift matching elements to the start of the array, resize */
+           new_nelems = array->num_elems - matchelem;
+           memmove(array->elem_values, array->elem_values + matchelem,
+                   sizeof(Datum) * new_nelems);
+           break;
+       default:
+           elog(ERROR, "unrecognized StrategyNumber: %d",
+                (int) skey->sk_strategy);
+           break;
+   }
+
+   Assert(new_nelems >= 0);
+   Assert(new_nelems <= array->num_elems);
+
+   array->num_elems = new_nelems;
+   *qual_ok = new_nelems > 0;
+
+   return true;
+}
+
+/*
+ * qsort_arg comparator for sorting array elements
+ */
+static int
+_bt_compare_array_elements(const void *a, const void *b, void *arg)
+{
+   Datum       da = *((const Datum *) a);
+   Datum       db = *((const Datum *) b);
+   BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
+   int32       compare;
+
+   compare = DatumGetInt32(FunctionCall2Coll(cxt->sortproc,
+                                             cxt->collation,
+                                             da, db));
+   if (cxt->reverse)
+       INVERT_COMPARE_RESULT(compare);
+   return compare;
+}
+
+/*
+ * _bt_compare_array_skey() -- apply array comparison function
+ *
+ * Compares caller's tuple attribute value to a scan key/array element.
+ * Helper function used during binary searches of SK_SEARCHARRAY arrays.
+ *
+ *     This routine returns:
+ *         <0 if tupdatum < arrdatum;
+ *          0 if tupdatum == arrdatum;
+ *         >0 if tupdatum > arrdatum.
+ *
+ * This is essentially the same interface as _bt_compare: both functions
+ * compare the value that they're searching for to a binary search pivot.
+ * However, unlike _bt_compare, this function's "tuple argument" comes first,
+ * while its "array/scankey argument" comes second.
+ */
+static inline int32
+_bt_compare_array_skey(FmgrInfo *orderproc,
+                      Datum tupdatum, bool tupnull,
+                      Datum arrdatum, ScanKey cur)
+{
+   int32       result = 0;
+
+   Assert(cur->sk_strategy == BTEqualStrategyNumber);
+
+   if (tupnull)                /* NULL tupdatum */
+   {
+       if (cur->sk_flags & SK_ISNULL)
+           result = 0;         /* NULL "=" NULL */
+       else if (cur->sk_flags & SK_BT_NULLS_FIRST)
+           result = -1;        /* NULL "<" NOT_NULL */
+       else
+           result = 1;         /* NULL ">" NOT_NULL */
+   }
+   else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */
+   {
+       if (cur->sk_flags & SK_BT_NULLS_FIRST)
+           result = 1;         /* NOT_NULL ">" NULL */
+       else
+           result = -1;        /* NOT_NULL "<" NULL */
+   }
+   else
+   {
+       /*
+        * Like _bt_compare, we need to be careful of cross-type comparisons,
+        * so the left value has to be the value that came from an index tuple
+        */
+       result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation,
+                                                tupdatum, arrdatum));
+
+       /*
+        * We flip the sign by following the obvious rule: flip whenever the
+        * column is a DESC column.
+        *
+        * _bt_compare does it the wrong way around (flip when *ASC*) in order
+        * to compensate for passing its orderproc arguments backwards.  We
+        * don't need to play these games because we find it natural to pass
+        * tupdatum as the left value (and arrdatum as the right value).
+        */
+       if (cur->sk_flags & SK_BT_DESC)
+           INVERT_COMPARE_RESULT(result);
+   }
+
+   return result;
+}
+
+/*
+ * _bt_binsrch_array_skey() -- Binary search for next matching array key
+ *
+ * Returns an index to the first array element >= caller's tupdatum argument.
+ * This convention is more natural for forwards scan callers, but that can't
+ * really matter to backwards scan callers.  Both callers require handling for
+ * the case where the match we return is < tupdatum, and symmetric handling
+ * for the case where our best match is > tupdatum.
+ *
+ * Also sets *set_elem_result to the result _bt_compare_array_skey returned
+ * when we used it to compare the matching array element to tupdatum/tupnull.
+ *
+ * cur_elem_trig indicates if array advancement was triggered by this array's
+ * scan key, and that the array is for a required scan key.  We can apply this
+ * information to find the next matching array element in the current scan
+ * direction using far fewer comparisons (fewer on average, compared to naive
+ * binary search).  This scheme takes advantage of an important property of
+ * required arrays: required arrays always advance in lockstep with the index
+ * scan's progress through the index's key space.
+ */
+static int
+_bt_binsrch_array_skey(FmgrInfo *orderproc,
+                      bool cur_elem_trig, ScanDirection dir,
+                      Datum tupdatum, bool tupnull,
+                      BTArrayKeyInfo *array, ScanKey cur,
+                      int32 *set_elem_result)
+{
+   int         low_elem = 0,
+               mid_elem = -1,
+               high_elem = array->num_elems - 1,
+               result = 0;
+   Datum       arrdatum;
+
+   Assert(cur->sk_flags & SK_SEARCHARRAY);
+   Assert(cur->sk_strategy == BTEqualStrategyNumber);
+
+   if (cur_elem_trig)
+   {
+       Assert(!ScanDirectionIsNoMovement(dir));
+       Assert(cur->sk_flags & SK_BT_REQFWD);
+
+       /*
+        * When the scan key that triggered array advancement is a required
+        * array scan key, it is now certain that the current array element
+        * (plus all prior elements relative to the current scan direction)
+        * cannot possibly be at or ahead of the corresponding tuple value.
+        * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
+        * makes sure this is true as a condition of advancing the arrays.)
+        *
+        * This makes it safe to exclude array elements up to and including
+        * the former-current array element from our search.
+        *
+        * Separately, when array advancement was triggered by a required scan
+        * key, the array element immediately after the former-current element
+        * is often either an exact tupdatum match, or a "close by" near-match
+        * (a near-match tupdatum is one whose key space falls _between_ the
+        * former-current and new-current array elements).  We'll detect both
+        * cases via an optimistic comparison of the new search lower bound
+        * (or new search upper bound in the case of backwards scans).
+        */
+       if (ScanDirectionIsForward(dir))
+       {
+           low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
+
+           /* Compare prospective new cur_elem (also the new lower bound) */
+           if (high_elem >= low_elem)
+           {
+               arrdatum = array->elem_values[low_elem];
+               result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+                                               arrdatum, cur);
+
+               if (result <= 0)
+               {
+                   /* Optimistic comparison optimization worked out */
+                   *set_elem_result = result;
+                   return low_elem;
+               }
+               mid_elem = low_elem;
+               low_elem++;     /* this cur_elem exhausted, too */
+           }
+
+           if (high_elem < low_elem)
+           {
+               /* Caller needs to perform "beyond end" array advancement */
+               *set_elem_result = 1;
+               return high_elem;
+           }
         }
         else
         {
-           if (++cur_elem >= num_elems)
+           high_elem = array->cur_elem - 1;    /* old cur_elem exhausted */
+
+           /* Compare prospective new cur_elem (also the new upper bound) */
+           if (high_elem >= low_elem)
+           {
+               arrdatum = array->elem_values[high_elem];
+               result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+                                               arrdatum, cur);
+
+               if (result >= 0)
+               {
+                   /* Optimistic comparison optimization worked out */
+                   *set_elem_result = result;
+                   return high_elem;
+               }
+               mid_elem = high_elem;
+               high_elem--;    /* this cur_elem exhausted, too */
+           }
+
+           if (high_elem < low_elem)
             {
-               cur_elem = 0;
-               found = false;  /* need to advance next array key */
+               /* Caller needs to perform "beyond end" array advancement */
+               *set_elem_result = -1;
+               return low_elem;
             }
-           else
-               found = true;
         }
+   }
  
-       curArrayKey->cur_elem = cur_elem;
-       skey->sk_argument = curArrayKey->elem_values[cur_elem];
-       if (found)
+   while (high_elem > low_elem)
+   {
+       mid_elem = low_elem + ((high_elem - low_elem) / 2);
+       arrdatum = array->elem_values[mid_elem];
+
+       result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+                                       arrdatum, cur);
+
+       if (result == 0)
+       {
+           /*
+            * It's safe to quit as soon as we see an equal array element.
+            * This often saves an extra comparison or two...
+            */
+           low_elem = mid_elem;
             break;
-   }
+       }
  
-   /* advance parallel scan */
-   if (scan->parallel_scan != NULL)
-       _bt_parallel_advance_array_keys(scan);
+       if (result > 0)
+           low_elem = mid_elem + 1;
+       else
+           high_elem = mid_elem;
+   }
  
     /*
-    * When no new array keys were found, the scan is "past the end" of the
-    * array keys.  _bt_start_array_keys can still "restart" the array keys if
-    * a rescan is required.
+    * ...but our caller also cares about how its searched-for tuple datum
+    * compares to the low_elem datum.  Must always set *set_elem_result with
+    * the result of that comparison specifically.
      */
-   if (!found)
-       so->arraysStarted = false;
+   if (low_elem != mid_elem)
+       result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+                                       array->elem_values[low_elem], cur);
+
+   *set_elem_result = result;
  
-   return found;
+   return low_elem;
  }
  
  /*
- * _bt_mark_array_keys() -- Handle array keys during btmarkpos
+ * _bt_start_array_keys() -- Initialize array keys at start of a scan
   *
- * Save the current state of the array keys as the "mark" position.
+ * Set up the cur_elem counters and fill in the first sk_argument value for
+ * each array scankey.
   */
  void
-_bt_mark_array_keys(IndexScanDesc scan)
+_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     int         i;
  
+   Assert(so->numArrayKeys);
+   Assert(so->qual_ok);
+
     for (i = 0; i < so->numArrayKeys; i++)
     {
         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+       ScanKey     skey = &so->keyData[curArrayKey->scan_key];
+
+       Assert(curArrayKey->num_elems > 0);
+       Assert(skey->sk_flags & SK_SEARCHARRAY);
  
-       curArrayKey->mark_elem = curArrayKey->cur_elem;
+       if (ScanDirectionIsBackward(dir))
+           curArrayKey->cur_elem = curArrayKey->num_elems - 1;
+       else
+           curArrayKey->cur_elem = 0;
+       skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
     }
+   so->scanBehind = false;
  }
  
  /*
- * _bt_restore_array_keys() -- Handle array keys during btrestrpos
+ * _bt_advance_array_keys_increment() -- Advance to next set of array elements
+ *
+ * Advances the array keys by a single increment in the current scan
+ * direction.  When there are multiple array keys this can roll over from the
+ * lowest order array to higher order arrays.
   *
- * Restore the array keys to where they were when the mark was set.
+ * Returns true if there is another set of values to consider, false if not.
+ * On true result, the scankeys are initialized with the next set of values.
+ * On false result, the scankeys stay the same, and the array keys are not
+ * advanced (every array remains at its final element for scan direction).
   */
-void
-_bt_restore_array_keys(IndexScanDesc scan)
+static bool
+_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
  {
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
-   bool        changed = false;
-   int         i;
  
-   /* Restore each array key to its position when the mark was set */
-   for (i = 0; i < so->numArrayKeys; i++)
-   {
-       BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
-       ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
-       int         mark_elem = curArrayKey->mark_elem;
+   /*
+    * We must advance the last array key most quickly, since it will
+    * correspond to the lowest-order index column among the available
+    * qualifications
+    */
+   for (int i = so->numArrayKeys - 1; i >= 0; i--)
+   {
+       BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+       ScanKey     skey = &so->keyData[curArrayKey->scan_key];
+       int         cur_elem = curArrayKey->cur_elem;
+       int         num_elems = curArrayKey->num_elems;
+       bool        rolled = false;
+
+       if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems)
+       {
+           cur_elem = 0;
+           rolled = true;
+       }
+       else if (ScanDirectionIsBackward(dir) && --cur_elem < 0)
+       {
+           cur_elem = num_elems - 1;
+           rolled = true;
+       }
+
+       curArrayKey->cur_elem = cur_elem;
+       skey->sk_argument = curArrayKey->elem_values[cur_elem];
+       if (!rolled)
+           return true;
+
+       /* Need to advance next array key, if any */
+   }
+
+   /*
+    * The array keys are now exhausted.  (There isn't actually a distinct
+    * state that represents array exhaustion, since index scans don't always
+    * end after btgettuple returns "false".)
+    *
+    * Restore the array keys to the state they were in immediately before we
+    * were called.  This ensures that the arrays only ever ratchet in the
+    * current scan direction.  Without this, scans would overlook matching
+    * tuples if and when the scan's direction was subsequently reversed.
+    */
+   _bt_start_array_keys(scan, -dir);
+
+   return false;
+}
+
+/*
+ * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays
+ *
+ * Called when _bt_advance_array_keys decides to start a new primitive index
+ * scan on the basis of the current scan position being before the position
+ * that _bt_first is capable of repositioning the scan to by applying an
+ * inequality operator required in the opposite-to-scan direction only.
+ *
+ * Although equality strategy scan keys (for both arrays and non-arrays alike)
+ * are either marked required in both directions or in neither direction,
+ * there is a sense in which non-required arrays behave like required arrays.
+ * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
+ * the scan key on "c" is non-required, but nevertheless enables positioning
+ * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
+ * first descent of the tree by _bt_first.  Later on, there could also be a
+ * second descent, that places the scan right before tuples >= "(200, 3, 5)".
+ * _bt_first must never be allowed to build an insertion scan key whose "c"
+ * entry is set to a value other than 5, the "c" array's first element/value.
+ * (Actually, it's the first in the current scan direction.  This example uses
+ * a forward scan.)
+ *
+ * Calling here resets the array scan key elements for the scan's non-required
+ * arrays.  This is strictly necessary for correctness in a subset of cases
+ * involving "required in opposite direction"-triggered primitive index scans.
+ * Not all callers are at risk of _bt_first using a non-required array like
+ * this, but advancement always resets the arrays when another primitive scan
+ * is scheduled, just to keep things simple.  Array advancement even makes
+ * sure to reset non-required arrays during scans that have no inequalities.
+ * (Advancement still won't call here when there are no inequalities, though
+ * that's just because it's all handled indirectly instead.)
+ *
+ * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
+ * everybody got this right.
+ */
+static void
+_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   int         arrayidx = 0;   /* next so->arrayKeys[] entry to use */
+
+   for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+   {
+       ScanKey     cur = so->keyData + ikey;
+       BTArrayKeyInfo *array = NULL;
+       int         first_elem_dir; /* index of first element for dir */
+
+       if (!(cur->sk_flags & SK_SEARCHARRAY) ||
+           cur->sk_strategy != BTEqualStrategyNumber)
+           continue;           /* not an equality array scan key */
+
+       array = &so->arrayKeys[arrayidx++];
+       Assert(array->scan_key == ikey);
+
+       if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
+           continue;           /* required array -- not rewound here */
+
+       if (ScanDirectionIsForward(dir))
+           first_elem_dir = 0;
+       else
+           first_elem_dir = array->num_elems - 1;
+
+       if (array->cur_elem != first_elem_dir)
+       {
+           array->cur_elem = first_elem_dir;   /* rewind, then sync scan key */
+           cur->sk_argument = array->elem_values[first_elem_dir];
+       }
+   }
+}
+
+/*
+ * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
+ *
+ * We always compare the tuple using the current array keys (which we assume
+ * are already set in so->keyData[]).  readpagetup indicates if tuple is the
+ * scan's current _bt_readpage-wise tuple.
+ *
+ * readpagetup callers must only call here when _bt_check_compare already set
+ * continuescan=false.  We help these callers deal with _bt_check_compare's
+ * inability to distinguish between the < and > cases (it uses equality
+ * operator scan keys, whereas we use 3-way ORDER procs).  These callers pass
+ * a _bt_check_compare-set sktrig value that indicates which scan key
+ * triggered the call (!readpagetup callers just pass us sktrig=0 instead).
+ * This information allows us to avoid wastefully checking earlier scan keys
+ * that were already deemed to have been satisfied inside _bt_check_compare.
+ *
+ * Returns false when caller's tuple is >= the current required equality scan
+ * keys (or <=, in the case of backwards scans).  This happens to readpagetup
+ * callers when the scan has reached the point of needing its array keys
+ * advanced; caller will need to advance required and non-required arrays at
+ * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over.
+ * (When we return false to readpagetup callers, tuple can only be == current
+ * required equality scan keys when caller's sktrig indicates that the arrays
+ * need to be advanced due to an unsatisfied required inequality key trigger.)
+ *
+ * Returns true when caller passes a tuple that is < the current set of
+ * equality keys for the most significant non-equal required scan key/column
+ * (or > the keys, during backwards scans).  This happens to readpagetup
+ * callers when tuple is still before the start of matches for the scan's
+ * required equality strategy scan keys.  (sktrig can't have indicated that an
+ * inequality strategy scan key wasn't satisfied in _bt_check_compare when we
+ * return true.  In fact, we automatically return false when passed such an
+ * inequality sktrig by readpagetup callers -- _bt_check_compare's initial
+ * continuescan=false doesn't really need to be confirmed here by us.)
+ *
+ * !readpagetup callers optionally pass us *scanBehind, which tracks whether
+ * any missing truncated attributes might have affected array advancement
+ * (compared to what would happen if it was shown the first non-pivot tuple on
+ * the page to the right of caller's finaltup/high key tuple instead).  It's
+ * only possible that we'll set *scanBehind to true when caller passes us a
+ * pivot tuple (with truncated -inf attributes) that we return false for.
+ */
+static bool
+_bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
+                            IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
+                            bool readpagetup, int sktrig, bool *scanBehind)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+   Assert(so->numArrayKeys);
+   Assert(so->numberOfKeys);
+   Assert(sktrig == 0 || readpagetup);
+   Assert(!readpagetup || scanBehind == NULL);
+
+   if (scanBehind)
+       *scanBehind = false;    /* only set below, on a truncated attribute */
+
+   for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++)
+   {
+       ScanKey     cur = so->keyData + ikey;
+       Datum       tupdatum;
+       bool        tupnull;
+       int32       result;
+
+       /* readpagetup calls require one ORDER proc comparison (at most) */
+       Assert(!readpagetup || ikey == sktrig);
+
+       /*
+        * Once we reach a non-required scan key, we're completely done.
+        *
+        * Note: we deliberately don't consider the scan direction here.
+        * _bt_advance_array_keys caller requires that we track *scanBehind
+        * without concern for scan direction.
+        */
+       if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0)
+       {
+           Assert(!readpagetup);
+           Assert(ikey > sktrig || ikey == 0);
+           return false;
+       }
+
+       if (cur->sk_attno > tupnatts)
+       {
+           Assert(!readpagetup);
+
+           /*
+            * When we reach a high key's truncated attribute, assume that the
+            * tuple attribute's value is >= the scan's equality constraint
+            * scan keys (but set *scanBehind to let interested callers know
+            * that a truncated attribute might have affected our answer).
+            */
+           if (scanBehind)
+               *scanBehind = true;
+
+           return false;
+       }
+
+       /*
+        * Deal with inequality strategy scan keys that _bt_check_compare set
+        * continuescan=false for
+        */
+       if (cur->sk_strategy != BTEqualStrategyNumber)
+       {
+           /*
+            * When _bt_check_compare indicated that a required inequality
+            * scan key wasn't satisfied, there's no need to verify anything;
+            * caller always calls _bt_advance_array_keys with this sktrig.
+            */
+           if (readpagetup)
+               return false;
+
+           /*
+            * Otherwise we can't give up, since we must check all required
+            * scan keys (required in either direction) in order to correctly
+            * track *scanBehind for caller
+            */
+           continue;
+       }
+
+       tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
+
+       result = _bt_compare_array_skey(&so->orderProcs[ikey],
+                                       tupdatum, tupnull,
+                                       cur->sk_argument, cur);
+
+       /*
+        * Does this comparison indicate that caller must _not_ advance the
+        * scan's arrays just yet?
+        */
+       if ((ScanDirectionIsForward(dir) && result < 0) ||
+           (ScanDirectionIsBackward(dir) && result > 0))
+           return true;        /* tuple is still before the array keys */
+
+       /*
+        * Does this comparison indicate that caller should now advance the
+        * scan's arrays?  (Must be if we get here during a readpagetup call.)
+        */
+       if (readpagetup || result != 0)
+       {
+           Assert(result != 0);
+           return false;
+       }
+
+       /*
+        * Inconclusive -- need to check later scan keys, too.
+        *
+        * This must be a finaltup precheck, or a call made from an assertion.
+        */
+       Assert(result == 0);
+   }
+
+   Assert(!readpagetup);
+
+   return false;
+}
+
+/*
+ * _bt_start_prim_scan() -- start scheduled primitive index scan?
+ *
+ * Returns true if _bt_checkkeys scheduled another primitive index scan, just
+ * as the last one ended.  Otherwise returns false, indicating that the array
+ * keys are now fully exhausted.
+ *
+ * Only call here during scans with one or more equality type array scan keys,
+ * after _bt_first or _bt_next returns false.
+ */
+bool
+_bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+   Assert(so->numArrayKeys);
+
+   /* scanBehind flag doesn't persist across primitive index scans - reset */
+   so->scanBehind = false;
+
+   /*
+    * Array keys are advanced within _bt_checkkeys when the scan reaches the
+    * leaf level (more precisely, they're advanced when the scan reaches the
+    * end of each distinct set of array elements).  This process avoids
+    * repeat access to leaf pages (across multiple primitive index scans) by
+    * advancing the scan's array keys when it allows the primitive index scan
+    * to find nearby matching tuples (or when it eliminates ranges of array
+    * key space that can't possibly be satisfied by any index tuple).
+    *
+    * _bt_checkkeys sets so->needPrimScan to schedule another primitive index
+    * scan.  The flag tells us what to do.
+    *
+    * We cannot rely on _bt_first always reaching _bt_checkkeys.  There are
+    * various cases where that won't happen.  For example, if the index is
+    * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
+    * We also don't expect a call to _bt_checkkeys during searches for a
+    * non-existent value that happens to be lower/higher than any existing
+    * value in the index.
+    *
+    * We don't require special handling for these cases -- we don't need to
+    * be explicitly instructed to _not_ perform another primitive index scan.
+    * It's up to code under the control of _bt_first to always set the flag
+    * when another primitive index scan will be required.
+    *
+    * This works correctly, even with the tricky cases listed above, which
+    * all involve access to leaf pages "near the boundaries of the key space"
+    * (whether it's from a leftmost/rightmost page, or an imaginary empty
+    * leaf root page).  If _bt_checkkeys cannot be reached by a primitive
+    * index scan for one set of array keys, then it also won't be reached for
+    * any later set ("later" in terms of the direction that we scan the index
+    * and advance the arrays).  The array keys won't have advanced in these
+    * cases, but that's the correct behavior (even _bt_advance_array_keys
+    * won't always advance the arrays at the point they become "exhausted").
+    */
+   if (so->needPrimScan)
+   {
+       Assert(_bt_verify_arrays_bt_first(scan, dir));
+
+       /*
+        * Flag was set -- must call _bt_first again, which will reset the
+        * scan's needPrimScan flag
+        */
+       return true;
+   }
+
+   /* The top-level index scan ran out of tuples in this scan direction */
+   if (scan->parallel_scan != NULL)
+       _bt_parallel_done(scan);
+
+   return false;               /* no further primitive index scan scheduled */
+}
+
+/*
+ * _bt_advance_array_keys() -- Advance array elements using a tuple
+ *
+ * The scan always gets a new qual as a consequence of calling here (except
+ * when we determine that the top-level scan has run out of matching tuples).
+ * All later _bt_check_compare calls also use the same new qual that was first
+ * used here (at least until the next call here advances the keys once again).
+ * It's convenient to structure _bt_check_compare rechecks of caller's tuple
+ * (using the new qual) as one of the steps of advancing the scan's array keys,
+ * so this function works as a wrapper around _bt_check_compare.
+ *
+ * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
+ * caller, and return a boolean indicating if caller's tuple satisfies the
+ * scan's new qual.  But unlike _bt_check_compare, we set so->needPrimScan
+ * when we set continuescan=false, indicating if a new primitive index scan
+ * has been scheduled (otherwise, the top-level scan has run out of tuples in
+ * the current scan direction).
+ *
+ * Caller must use _bt_tuple_before_array_skeys to determine if the current
+ * place in the scan is >= the current array keys _before_ calling here.
+ * We're responsible for ensuring that caller's tuple is <= the newly advanced
+ * required array keys once we return.  We try to find an exact match, but
+ * failing that we'll advance the array keys to whatever set of array elements
+ * comes next in the key space for the current scan direction.  Required array
+ * keys "ratchet forwards" (or backwards).  They can only advance as the scan
+ * itself advances through the index/key space.
+ *
+ * (The rules are the same for backwards scans, except that the operators are
+ * flipped: just replace the precondition's >= operator with a <=, and the
+ * postcondition's <= operator with a >=.  In other words, just swap the
+ * precondition with the postcondition.)
+ *
+ * We also deal with "advancing" non-required arrays here.  Callers whose
+ * sktrig scan key is non-required specify sktrig_required=false.  These calls
+ * are the only exception to the general rule about always advancing the
+ * required array keys (the scan may not even have a required array).  These
+ * callers should just pass a NULL pstate (since there is never any question
+ * of stopping the scan).  No call to _bt_tuple_before_array_skeys is required
+ * ahead of these calls (it's already clear that any required scan keys must
+ * be satisfied by caller's tuple).
+ *
+ * Note that we deal with non-array required equality strategy scan keys as
+ * degenerate single element arrays here.  Obviously, they can never really
+ * advance in the way that real arrays can, but they must still affect how we
+ * advance real array scan keys (exactly like true array equality scan keys).
+ * We have to keep around a 3-way ORDER proc for these (using the "=" operator
+ * won't do), since in general whether the tuple is < or > _any_ unsatisfied
+ * required equality key influences how the scan's real arrays must advance.
+ *
+ * Note also that we may sometimes need to advance the array keys when the
+ * existing required array keys (and other required equality keys) are already
+ * an exact match for every corresponding value from caller's tuple.  We must
+ * do this for inequalities that _bt_check_compare set continuescan=false for.
+ * They'll advance the array keys here, just like any other scan key that
+ * _bt_check_compare stops on.  (This can even happen _after_ we advance the
+ * array keys, in which case we'll advance the array keys a second time.  That
+ * way _bt_checkkeys caller always has its required arrays advance to the
+ * maximum possible extent that its tuple will allow.)
+ */
+static bool
+_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
+                      IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+                      int sktrig, bool sktrig_required)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   Relation    rel = scan->indexRelation;
+   ScanDirection dir = pstate ? pstate->dir : ForwardScanDirection;
+   int         arrayidx = 0;
+   bool        beyond_end_advance = false,
+               has_required_opposite_direction_only = false,
+               oppodir_inequality_sktrig = false,
+               all_required_satisfied = true,
+               all_satisfied = true;
+
+   if (sktrig_required)
+   {
+       /*
+        * Precondition array state assertion
+        */
+       Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
+                                            tupnatts, false, 0, NULL));
+
+       so->scanBehind = false; /* reset */
+
+       /*
+        * Required scan key wasn't satisfied, so required arrays will have to
+        * advance.  Invalidate page-level state that tracks whether the
+        * scan's required-in-opposite-direction-only keys are known to be
+        * satisfied by page's remaining tuples.
+        */
+       pstate->firstmatch = false;
+
+       /* Shouldn't have to invalidate 'prechecked', though */
+       Assert(!pstate->prechecked);
+
+       /*
+        * Once we return we'll have a new set of required array keys, so
+        * reset state used by "look ahead" optimization
+        */
+       pstate->rechecks = 0;
+       pstate->targetdistance = 0;
+   }
+
+   Assert(_bt_verify_keys_with_arraykeys(scan));
+
+   for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+   {
+       ScanKey     cur = so->keyData + ikey;
+       BTArrayKeyInfo *array = NULL;
+       Datum       tupdatum;
+       bool        required = false,
+                   required_opposite_direction_only = false,
+                   tupnull;
+       int32       result;
+       int         set_elem = 0;
+
+       if (cur->sk_strategy == BTEqualStrategyNumber)
+       {
+           /* Manage array state */
+           if (cur->sk_flags & SK_SEARCHARRAY)
+           {
+               array = &so->arrayKeys[arrayidx++];
+               Assert(array->scan_key == ikey);
+           }
+       }
+       else
+       {
+           /*
+            * Are any inequalities required in the opposite direction only
+            * present here?
+            */
+           if (((ScanDirectionIsForward(dir) &&
+                 (cur->sk_flags & (SK_BT_REQBKWD))) ||
+                (ScanDirectionIsBackward(dir) &&
+                 (cur->sk_flags & (SK_BT_REQFWD)))))
+               has_required_opposite_direction_only =
+                   required_opposite_direction_only = true;
+       }
+
+       /* Optimization: skip over known-satisfied scan keys */
+       if (ikey < sktrig)
+           continue;
+
+       if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
+       {
+           Assert(sktrig_required);
+
+           required = true;
+
+           if (cur->sk_attno > tupnatts)
+           {
+               /* Set this just like _bt_tuple_before_array_skeys */
+               Assert(sktrig < ikey);
+               so->scanBehind = true;
+           }
+       }
+
+       /*
+        * Handle a required non-array scan key that the initial call to
+        * _bt_check_compare indicated triggered array advancement, if any.
+        *
+        * The non-array scan key's strategy will be <, <=, or = during a
+        * forwards scan (or any one of =, >=, or > during a backwards scan).
+        * It follows that the corresponding tuple attribute's value must now
+        * be either > or >= the scan key value (for backwards scans it must
+        * be either < or <= that value).
+        *
+        * If this is a required equality strategy scan key, this is just an
+        * optimization; _bt_tuple_before_array_skeys already confirmed that
+        * this scan key places us ahead of caller's tuple.  There's no need
+        * to repeat that work now.  (The same underlying principle also gets
+        * applied by the cur_elem_trig optimization used to speed up searches
+        * for the next array element.)
+        *
+        * If this is a required inequality strategy scan key, we _must_ rely
+        * on _bt_check_compare like this; we aren't capable of directly
+        * evaluating required inequality strategy scan keys here, on our own.
+        */
+       if (ikey == sktrig && !array)
+       {
+           Assert(sktrig_required && required && all_required_satisfied);
+
+           /* Use "beyond end" advancement.  See below for an explanation. */
+           beyond_end_advance = true;
+           all_satisfied = all_required_satisfied = false;
+
+           /*
+            * Set a flag that remembers that this was an inequality required
+            * in the opposite scan direction only, that nevertheless
+            * triggered the call here.
+            *
+            * This only happens when an inequality operator (which must be
+            * strict) encounters a group of NULLs that indicate the end of
+            * non-NULL values for tuples in the current scan direction.
+            */
+           if (unlikely(required_opposite_direction_only))
+               oppodir_inequality_sktrig = true;
+
+           continue;
+       }
+
+       /*
+        * Nothing more for us to do with an inequality strategy scan key that
+        * wasn't the one that _bt_check_compare stopped on, though.
+        *
+        * Note: if our later call to _bt_check_compare (to recheck caller's
+        * tuple) sets continuescan=false due to finding this same inequality
+        * unsatisfied (possible when it's required in the scan direction),
+        * we'll deal with it via a recursive "second pass" call.
+        */
+       else if (cur->sk_strategy != BTEqualStrategyNumber)
+           continue;
+
+       /*
+        * Nothing for us to do with an equality strategy scan key that isn't
+        * marked required, either -- unless it's a non-required array
+        */
+       else if (!required && !array)
+           continue;
+
+       /*
+        * Here we perform steps for all array scan keys after a required
+        * array scan key whose binary search triggered "beyond end of array
+        * element" array advancement due to encountering a tuple attribute
+        * value > the closest matching array key (or < for backwards scans).
+        */
+       if (beyond_end_advance)
+       {
+           int         final_elem_dir;
+
+           if (ScanDirectionIsBackward(dir) || !array)
+               final_elem_dir = 0;
+           else
+               final_elem_dir = array->num_elems - 1;
+
+           if (array && array->cur_elem != final_elem_dir)
+           {
+               array->cur_elem = final_elem_dir;
+               cur->sk_argument = array->elem_values[final_elem_dir];
+           }
+
+           continue;
+       }
+
+       /*
+        * Here we perform steps for all array scan keys after a required
+        * array scan key whose tuple attribute was < the closest matching
+        * array key when we dealt with it (or > for backwards scans).
+        *
+        * This earlier required array key already puts us ahead of caller's
+        * tuple in the key space (for the current scan direction).  We must
+        * make sure that subsequent lower-order array keys do not put us too
+        * far ahead (ahead of tuples that have yet to be seen by our caller).
+        * For example, when a tuple "(a, b) = (42, 5)" advances the array
+        * keys on "a" from 40 to 45, we must also set "b" to whatever the
+        * first array element for "b" is.  It would be wrong to allow "b" to
+        * be set based on the tuple value.
+        *
+        * Perform the same steps with truncated high key attributes.  You can
+        * think of this as a "binary search" for the element closest to the
+        * value -inf.  Again, the arrays must never get ahead of the scan.
+        */
+       if (!all_required_satisfied || cur->sk_attno > tupnatts)
+       {
+           int         first_elem_dir;
+
+           if (ScanDirectionIsForward(dir) || !array)
+               first_elem_dir = 0;
+           else
+               first_elem_dir = array->num_elems - 1;
+
+           if (array && array->cur_elem != first_elem_dir)
+           {
+               array->cur_elem = first_elem_dir;
+               cur->sk_argument = array->elem_values[first_elem_dir];
+           }
+
+           continue;
+       }
+
+       /*
+        * Search in scankey's array for the corresponding tuple attribute
+        * value from caller's tuple
+        */
+       tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
+
+       if (array)
+       {
+           bool        cur_elem_trig = (sktrig_required && ikey == sktrig);
+
+           /*
+            * Binary search for closest match that's available from the array
+            */
+           set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey],
+                                             cur_elem_trig, dir,
+                                             tupdatum, tupnull, array, cur,
+                                             &result);
+
+           Assert(set_elem >= 0 && set_elem < array->num_elems);
+       }
+       else
+       {
+           Assert(sktrig_required && required);
+
+           /*
+            * This is a required non-array equality strategy scan key, which
+            * we'll treat as a degenerate single element array.
+            *
+            * This scan key's imaginary "array" can't really advance, but it
+            * can still roll over like any other array.  (Actually, this is
+            * no different to real single value arrays, which never advance
+            * without rolling over -- they can never truly advance, either.)
+            */
+           result = _bt_compare_array_skey(&so->orderProcs[ikey],
+                                           tupdatum, tupnull,
+                                           cur->sk_argument, cur);
+       }
+
+       /*
+        * Consider "beyond end of array element" array advancement.
+        *
+        * When the tuple attribute value is > the closest matching array key
+        * (or < in the backwards scan case), we need to ratchet this array
+        * forward (backward) by one increment, so that caller's tuple ends up
+        * being < final array value instead (or > final array value instead).
+        * This process has to work for all of the arrays, not just this one:
+        * it must "carry" to higher-order arrays when the set_elem that we
+        * just found happens to be the final one for the scan's direction.
+        * Incrementing (decrementing) set_elem itself isn't good enough.
+        *
+        * Our approach is to provisionally use set_elem as if it was an exact
+        * match now, then set each later/less significant array to whatever
+        * its final element is.  Once outside the loop we'll then "increment
+        * this array's set_elem" by calling _bt_advance_array_keys_increment.
+        * That way the process rolls over to higher order arrays as needed.
+        *
+        * Under this scheme any required arrays only ever ratchet forwards
+        * (or backwards), and always do so to the maximum possible extent
+        * that we can know will be safe without seeing the scan's next tuple.
+        * We don't need any special handling for required scan keys that lack
+        * a real array to advance, nor for redundant scan keys that couldn't
+        * be eliminated by _bt_preprocess_keys.  It won't matter if some of
+        * our "true" array scan keys (or even all of them) are non-required.
+        */
+       if (required &&
+           ((ScanDirectionIsForward(dir) && result > 0) ||
+            (ScanDirectionIsBackward(dir) && result < 0)))
+           beyond_end_advance = true;
+
+       Assert(all_required_satisfied && all_satisfied);
+       if (result != 0)
+       {
+           /*
+            * Track whether caller's tuple satisfies our new post-advancement
+            * qual, for required scan keys, as well as for the entire set of
+            * interesting scan keys (all required scan keys plus non-required
+            * array scan keys are considered interesting.)
+            */
+           all_satisfied = false;
+           if (required)
+               all_required_satisfied = false;
+           else
+           {
+               /*
+                * There's no need to advance the arrays using the best
+                * available match for a non-required array.  Give up now.
+                * (Though note that sktrig_required calls still have to do
+                * all the usual post-advancement steps, including the recheck
+                * call to _bt_check_compare.)
+                */
+               break;
+           }
+       }
+
+       /* Advance array keys, even when set_elem isn't an exact match */
+       if (array && array->cur_elem != set_elem)
+       {
+           array->cur_elem = set_elem;
+           cur->sk_argument = array->elem_values[set_elem];
+       }
+   }
+
+   /*
+    * Advance the array keys incrementally whenever "beyond end of array
+    * element" array advancement happens, so that advancement will carry to
+    * higher-order arrays (might exhaust all the scan's arrays instead, which
+    * ends the top-level scan).
+    */
+   if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir))
+       goto end_toplevel_scan;
+
+   Assert(_bt_verify_keys_with_arraykeys(scan));
+
+   /*
+    * Does tuple now satisfy our new qual?  Recheck with _bt_check_compare.
+    *
+    * Calls triggered by an unsatisfied required scan key, whose tuple now
+    * satisfies all required scan keys, but not all nonrequired array keys,
+    * will still require a recheck call to _bt_check_compare.  They'll still
+    * need its "second pass" handling of required inequality scan keys.
+    * (Might have missed a still-unsatisfied required inequality scan key
+    * that caller didn't detect as the sktrig scan key during its initial
+    * _bt_check_compare call that used the old/original qual.)
+    *
+    * Calls triggered by an unsatisfied nonrequired array scan key never need
+    * "second pass" handling of required inequalities (nor any other handling
+    * of any required scan key).  All that matters is whether caller's tuple
+    * satisfies the new qual, so it's safe to just skip the _bt_check_compare
+    * recheck when we've already determined that it can only return 'false'.
+    */
+   if ((sktrig_required && all_required_satisfied) ||
+       (!sktrig_required && all_satisfied))
+   {
+       int         nsktrig = sktrig + 1;
+       bool        continuescan;
+
+       Assert(all_required_satisfied);
+
+       /* Recheck _bt_check_compare on behalf of caller */
+       if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
+                             false, false, false,
+                             &continuescan, &nsktrig) &&
+           !so->scanBehind)
+       {
+           /* This tuple satisfies the new qual */
+           Assert(all_satisfied && continuescan);
+
+           if (pstate)
+               pstate->continuescan = true;
+
+           return true;
+       }
+
+       /*
+        * Consider "second pass" handling of required inequalities.
+        *
+        * It's possible that our _bt_check_compare call indicated that the
+        * scan should end due to some unsatisfied inequality that wasn't
+        * initially recognized as such by us.  Handle this by calling
+        * ourselves recursively, this time indicating that the trigger is the
+        * inequality that we missed first time around (and using a set of
+        * required array/equality keys that are now exact matches for tuple).
+        *
+        * We make a strong, general guarantee that every _bt_checkkeys call
+        * here will advance the array keys to the maximum possible extent
+        * that we can know to be safe based on caller's tuple alone.  If we
+        * didn't perform this step, then that guarantee wouldn't quite hold.
+        */
+       if (unlikely(!continuescan))
+       {
+           bool        satisfied PG_USED_FOR_ASSERTS_ONLY;
+
+           Assert(sktrig_required);
+           Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber);
+
+           /*
+            * The tuple must use "beyond end" advancement during the
+            * recursive call, so we cannot possibly end up back here when
+            * recursing.  We'll consume a small, fixed amount of stack space.
+            */
+           Assert(!beyond_end_advance);
+
+           /* Advance the array keys a second time using same tuple */
+           satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts,
+                                              tupdesc, nsktrig, true);
+
+           /* This tuple doesn't satisfy the inequality */
+           Assert(!satisfied);
+           return false;
+       }
+
+       /*
+        * Some non-required scan key (from new qual) still not satisfied.
+        *
+        * All scan keys required in the current scan direction must still be
+        * satisfied, though, so we can trust all_required_satisfied below.
+        */
+   }
+
+   /*
+    * When we were called just to deal with "advancing" non-required arrays,
+    * this is as far as we can go (cannot stop the scan for these callers)
+    */
+   if (!sktrig_required)
+   {
+       /* Caller's tuple doesn't match any qual */
+       return false;
+   }
+
+   /*
+    * Postcondition array state assertion (for still-unsatisfied tuples).
+    *
+    * By here we have established that the scan's required arrays (scan must
+    * have at least one required array) advanced, without becoming exhausted.
+    *
+    * Caller's tuple is now < the newly advanced array keys (or > when this
+    * is a backwards scan), except in the case where we only got this far due
+    * to an unsatisfied non-required scan key.  Verify that with an assert.
+    *
+    * Note: we don't just quit at this point when all required scan keys were
+    * found to be satisfied because we need to consider edge-cases involving
+    * scan keys required in the opposite direction only; those aren't tracked
+    * by all_required_satisfied. (Actually, oppodir_inequality_sktrig trigger
+    * scan keys are tracked by all_required_satisfied, since it's convenient
+    * for _bt_check_compare to behave as if they are required in the current
+    * scan direction to deal with NULLs.  We'll account for that separately.)
+    */
+   Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
+                                       false, 0, NULL) ==
+          !all_required_satisfied);
+
+   /*
+    * We generally permit primitive index scans to continue onto the next
+    * sibling page when the page's finaltup satisfies all required scan keys
+    * at the point where we're between pages.
+    *
+    * If caller's tuple is also the page's finaltup, and we see that required
+    * scan keys still aren't satisfied, start a new primitive index scan.
+    */
+   if (!all_required_satisfied && pstate->finaltup == tuple)
+       goto new_prim_scan;
+
+   /*
+    * Proactively check finaltup (don't wait until finaltup is reached by the
+    * scan) when it might well turn out to not be satisfied later on.
+    *
+    * Note: if so->scanBehind hasn't already been set for finaltup by us,
+    * it'll be set during this call to _bt_tuple_before_array_skeys.  Either
+    * way, it'll be set correctly (for the whole page) after this point.
+    */
+   if (!all_required_satisfied && pstate->finaltup &&
+       _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
+                                    BTreeTupleGetNAtts(pstate->finaltup, rel),
+                                    false, 0, &so->scanBehind))
+       goto new_prim_scan;
+
+   /*
+    * When we encounter a truncated finaltup high key attribute, we're
+    * optimistic about the chances of its corresponding required scan key
+    * being satisfied when we go on to check it against tuples from this
+    * page's right sibling leaf page.  We consider truncated attributes to
+    * satisfy required scan keys, which allows the primitive index scan
+    * to continue to the next leaf page.  We must set so->scanBehind to true
+    * to remember that the last page's finaltup had "satisfied" required scan
+    * keys for one or more truncated attribute values (scan keys required in
+    * _either_ scan direction).
+    *
+    * There is a chance that _bt_checkkeys (which checks so->scanBehind) will
+    * find that even the sibling leaf page's finaltup is < the new array
+    * keys.  When that happens, our optimistic policy will have incurred a
+    * single extra leaf page access that could have been avoided.
+    *
+    * A pessimistic policy would give backward scans a gratuitous advantage
+    * over forward scans.  We'd punish forward scans for applying more
+    * accurate information from the high key, rather than just using the
+    * final non-pivot tuple as finaltup, in the style of backward scans.
+    * Being pessimistic would also give some scans with non-required arrays a
+    * perverse advantage over similar scans that use required arrays instead.
+    *
+    * You can think of this as a speculative bet on what the scan is likely
+    * to find on the next page.  It's not much of a gamble, though, since the
+    * untruncated prefix of attributes must strictly satisfy the new qual
+    * (though it's okay if any non-required scan keys fail to be satisfied).
+    */
+   if (so->scanBehind && has_required_opposite_direction_only)
+   {
+       /*
+        * However, we avoid this behavior whenever the scan involves a scan
+        * key required in the opposite direction to the scan only, along with
+        * a finaltup with at least one truncated attribute that's associated
+        * with a scan key marked required (required in either direction).
+        *
+        * _bt_check_compare simply won't stop the scan for a scan key that's
+        * marked required in the opposite scan direction only.  That leaves
+        * us without any reliable way of reconsidering any opposite-direction
+        * inequalities if it turns out that starting a new primitive index
+        * scan will allow _bt_first to skip ahead by a great many leaf pages
+        * (see next section for details of how that works).
+        */
+       goto new_prim_scan;
+   }
+
+   /*
+    * Handle inequalities marked required in the opposite scan direction.
+    * They can also signal that we should start a new primitive index scan.
+    *
+    * It's possible that the scan is now positioned where "matching" tuples
+    * begin, and that caller's tuple satisfies all scan keys required in the
+    * current scan direction.  But if caller's tuple still doesn't satisfy
+    * other scan keys that are required in the opposite scan direction only
+    * (e.g., a required >= strategy scan key when scan direction is forward),
+    * it's still possible that there are many leaf pages before the page that
+    * _bt_first could skip straight to.  Groveling through all those pages
+    * will always give correct answers, but it can be very inefficient.  We
+    * must avoid needlessly scanning extra pages.
+    *
+    * Separately, it's possible that _bt_check_compare set continuescan=false
+    * for a scan key that's required in the opposite direction only.  This is
+    * a special case, that happens only when _bt_check_compare sees that the
+    * inequality encountered a NULL value.  This signals the end of non-NULL
+    * values in the current scan direction, which is reason enough to end the
+    * (primitive) scan.  If this happens at the start of a large group of
+    * NULL values, then we shouldn't expect to be called again until after
+    * the scan has already read indefinitely-many leaf pages full of tuples
+    * with NULL suffix values.  We need a separate test for this case so that
+    * we don't miss our only opportunity to skip over such a group of pages.
+    * (_bt_first is expected to skip over the group of NULLs by applying a
+    * similar "deduce NOT NULL" rule, where it finishes its insertion scan
+    * key by consing up an explicit SK_SEARCHNOTNULL key.)
+    *
+    * Apply a test against finaltup to detect and recover from these problems:
+    * if even finaltup doesn't satisfy such an inequality, we just skip by
+    * starting a new primitive index scan.  When we skip, we know for sure
+    * that all of the tuples on the current page following caller's tuple are
+    * also before the _bt_first-wise start of tuples for our new qual.  That
+    * at least suggests many more skippable pages beyond the current page.
+    */
+   if (has_required_opposite_direction_only && pstate->finaltup &&
+       (all_required_satisfied || oppodir_inequality_sktrig))
+   {
+       int         nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
+       ScanDirection flipped;
+       bool        continuescanflip;
+       int         opsktrig;
+
+       /*
+        * We're checking finaltup (which is usually not caller's tuple), so
+        * cannot reuse work from caller's earlier _bt_check_compare call.
+        *
+        * Flip the scan direction when calling _bt_check_compare this time,
+        * so that it will set continuescanflip=false when it encounters an
+        * inequality required in the opposite scan direction.
+        */
+       Assert(!so->scanBehind);
+       opsktrig = 0;
+       flipped = -dir;
+       _bt_check_compare(scan, flipped,
+                         pstate->finaltup, nfinaltupatts, tupdesc,
+                         false, false, false,
+                         &continuescanflip, &opsktrig);
+
+       /*
+        * If we ended up here due to the all_required_satisfied criteria,
+        * test opsktrig in a way that ensures that finaltup contains the same
+        * prefix of key columns as caller's tuple (a prefix that satisfies
+        * earlier required-in-current-direction scan keys).
+        *
+        * If we ended up here due to the oppodir_inequality_sktrig criteria,
+        * test opsktrig in a way that ensures that the same scan key that our
+        * caller found to be unsatisfied (by the scan's tuple) was also the
+        * one unsatisfied just now (by finaltup).  That way we'll only start
+        * a new primitive scan when we're sure that both tuples _don't_ share
+        * the same prefix of satisfied equality-constrained attribute values,
+        * and that finaltup has a non-NULL attribute value indicated by the
+        * unsatisfied scan key at offset opsktrig/sktrig.  (This depends on
+        * _bt_check_compare not caring about the direction that inequalities
+        * are required in whenever NULL attribute values are unsatisfied.  It
+        * only cares about the scan direction, and its relationship to
+        * whether NULLs are stored first or last relative to non-NULLs.)
+        */
+       Assert(all_required_satisfied != oppodir_inequality_sktrig);
+       if (unlikely(!continuescanflip &&
+                    ((all_required_satisfied && opsktrig > sktrig) ||
+                     (oppodir_inequality_sktrig && opsktrig >= sktrig))))
+       {
+           Assert(so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber);
+
+           /*
+            * Make sure that any non-required arrays are set to the first
+            * array element for the current scan direction
+            */
+           _bt_rewind_nonrequired_arrays(scan, dir);
+
+           goto new_prim_scan;
+       }
+   }
+
+   /*
+    * Stick with the ongoing primitive index scan for now.
+    *
+    * It's possible that later tuples will also turn out to have values that
+    * are still < the now-current array keys (or > the current array keys).
+    * Our caller will handle this by performing what amounts to a linear
+    * search of the page, implemented by calling _bt_check_compare and then
+    * _bt_tuple_before_array_skeys for each tuple.
+    *
+    * This approach has various advantages over a binary search of the page.
+    * Repeated binary searches of the page (one binary search for every array
+    * advancement) won't outperform a continuous linear search.  While there
+    * are workloads that a naive linear search won't handle well, our caller
+    * has a "look ahead" fallback mechanism to deal with that problem.
+    */
+   pstate->continuescan = true;    /* Override _bt_check_compare */
+   so->needPrimScan = false;   /* _bt_readpage has more tuples to check */
+
+   if (so->scanBehind)
+   {
+       /* Optimization: skip by setting "look ahead" mechanism's offnum */
+       Assert(ScanDirectionIsForward(dir));
+       pstate->skip = pstate->maxoff + 1;
+   }
+
+   /* Caller's tuple doesn't match the new qual */
+   return false;
+
+new_prim_scan:
+
+   /*
+    * End this primitive index scan, but schedule another.
+    *
+    * Note: If the scan direction happens to change, this scheduled primitive
+    * index scan won't go ahead after all.
+    */
+   pstate->continuescan = false;   /* Tell _bt_readpage we're done... */
+   so->needPrimScan = true;    /* ...but call _bt_first again */
+
+   if (scan->parallel_scan)
+       _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
  
-       if (curArrayKey->cur_elem != mark_elem)
-       {
-           curArrayKey->cur_elem = mark_elem;
-           skey->sk_argument = curArrayKey->elem_values[mark_elem];
-           changed = true;
-       }
-   }
+   /* Caller's tuple doesn't match the new qual */
+   return false;
+
+end_toplevel_scan:
  
     /*
-    * If we changed any keys, we must redo _bt_preprocess_keys.  That might
-    * sound like overkill, but in cases with multiple keys per index column
-    * it seems necessary to do the full set of pushups.
+    * End the current primitive index scan, but don't schedule another.
+    *
+    * This ends the entire top-level scan in the current scan direction.
      *
-    * Also do this whenever the scan's set of array keys "wrapped around" at
-    * the end of the last primitive index scan.  There won't have been a call
-    * to _bt_preprocess_keys from some other place following wrap around, so
-    * we do it for ourselves.
+    * Note: The scan's arrays (including any non-required arrays) are now in
+    * their final positions for the current scan direction.  If the scan
+    * direction happens to change, then the arrays will already be in their
+    * first positions for what will then be the current scan direction.
      */
-   if (changed || !so->arraysStarted)
-   {
-       _bt_preprocess_keys(scan);
-       /* The mark should have been set on a consistent set of keys... */
-       Assert(so->qual_ok);
-   }
-}
+   pstate->continuescan = false;   /* Tell _bt_readpage we're done... */
+   so->needPrimScan = false;   /* ...don't call _bt_first again, though */
  
+   /* Caller's tuple doesn't match any qual */
+   return false;
+}
  
  /*
   * _bt_preprocess_keys() -- Preprocess scan keys
   *
- * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
+ * The given search-type keys (taken from scan->keyData[])
   * are copied to so->keyData[] with possible transformation.
   * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
   * the number of output keys (possibly less, never greater).
@@ -690,8 +2485,9 @@ _bt_restore_array_keys(IndexScanDesc scan)
   * The output keys must be sorted by index attribute.  Presently we expect
   * (but verify) that the input keys are already so sorted --- this is done
   * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
- * within each attribute may be done as a byproduct of the processing here,
- * but no other code depends on that.
+ * within each attribute may be done as a byproduct of the processing here.
+ * That process must leave array scan keys (within an attribute) in the same
+ * order as corresponding entries from the scan's BTArrayKeyInfo array info.
   *
   * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
   * if they must be satisfied in order to continue the scan forward or backward
@@ -748,8 +2544,8 @@ _bt_restore_array_keys(IndexScanDesc scan)
   *
   * Note: the reason we have to copy the preprocessed scan keys into private
   * storage is that we are modifying the array based on comparisons of the
- * key argument values, which could change on a rescan or after moving to
- * new elements of array keys.  Therefore we can't overwrite the source data.
+ * key argument values, which could change on a rescan.  Therefore we can't
+ * overwrite the source data.
   */
  void
  _bt_preprocess_keys(IndexScanDesc scan)
@@ -762,11 +2558,31 @@ _bt_preprocess_keys(IndexScanDesc scan)
     ScanKey     inkeys;
     ScanKey     outkeys;
     ScanKey     cur;
-   ScanKey     xform[BTMaxStrategyNumber];
+   BTScanKeyPreproc xform[BTMaxStrategyNumber];
     bool        test_result;
     int         i,
                 j;
     AttrNumber  attno;
+   ScanKey     arrayKeyData;
+   int        *keyDataMap = NULL;
+   int         arrayidx = 0;
+
+   /*
+    * We're called at the start of each primitive index scan during scans
+    * that use equality array keys.  We can just reuse the scan keys that
+    * were output at the start of the scan's first primitive index scan.
+    */
+   if (so->numberOfKeys > 0)
+   {
+       /*
+        * An earlier call to _bt_advance_array_keys has set everything up
+        * already.  Just assert that the scan's existing output scan keys are
+        * consistent with its current array elements.
+        */
+       Assert(so->numArrayKeys);
+       Assert(_bt_verify_keys_with_arraykeys(scan));
+       return;
+   }
  
     /* initialize result variables */
     so->qual_ok = true;
@@ -775,11 +2591,27 @@ _bt_preprocess_keys(IndexScanDesc scan)
     if (numberOfKeys < 1)
         return;                 /* done if qual-less scan */
  
+   /* If any keys are SK_SEARCHARRAY type, set up array-key info */
+   arrayKeyData = _bt_preprocess_array_keys(scan);
+   if (!so->qual_ok)
+   {
+       /* unmatchable array, so give up */
+       return;
+   }
+
     /*
-    * Read so->arrayKeyData if array keys are present, else scan->keyData
+    * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
+    * as our input if _bt_preprocess_array_keys just allocated it, else just
+    * use scan->keyData[]
      */
-   if (so->arrayKeyData != NULL)
-       inkeys = so->arrayKeyData;
+   if (arrayKeyData)
+   {
+       inkeys = arrayKeyData;
+
+       /* Also maintain keyDataMap for remapping so->orderProcs[] later */
+       keyDataMap = MemoryContextAlloc(so->arrayContext,
+                                       numberOfKeys * sizeof(int));
+   }
     else
         inkeys = scan->keyData;
  
@@ -800,6 +2632,19 @@ _bt_preprocess_keys(IndexScanDesc scan)
         /* We can mark the qual as required if it's for first index col */
         if (cur->sk_attno == 1)
             _bt_mark_scankey_required(outkeys);
+       if (arrayKeyData)
+       {
+           /*
+            * Don't call _bt_preprocess_array_keys_final in this fast path
+            * (we'll miss out on the single value array transformation, but
+            * that's not nearly as important when there's only one scan key)
+            */
+           Assert(cur->sk_flags & SK_SEARCHARRAY);
+           Assert(cur->sk_strategy != BTEqualStrategyNumber ||
+                  (so->arrayKeys[0].scan_key == 0 &&
+                   OidIsValid(so->orderProcs[0].fn_oid)));
+       }
+
         return;
     }
  
@@ -859,13 +2704,29 @@ _bt_preprocess_keys(IndexScanDesc scan)
              * check, and we've rejected any combination of it with a regular
              * equality condition; but not with other types of conditions.
              */
-           if (xform[BTEqualStrategyNumber - 1])
+           if (xform[BTEqualStrategyNumber - 1].skey)
             {
-               ScanKey     eq = xform[BTEqualStrategyNumber - 1];
+               ScanKey     eq = xform[BTEqualStrategyNumber - 1].skey;
+               BTArrayKeyInfo *array = NULL;
+               FmgrInfo   *orderproc = NULL;
+
+               if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
+               {
+                   int         eq_in_ikey,
+                               eq_arrayidx;
+
+                   eq_in_ikey = xform[BTEqualStrategyNumber - 1].ikey;
+                   eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
+                   array = &so->arrayKeys[eq_arrayidx - 1];
+                   orderproc = so->orderProcs + eq_in_ikey;
+
+                   Assert(array->scan_key == eq_in_ikey);
+                   Assert(OidIsValid(orderproc->fn_oid));
+               }
  
                 for (j = BTMaxStrategyNumber; --j >= 0;)
                 {
-                   ScanKey     chk = xform[j];
+                   ScanKey     chk = xform[j].skey;
  
                     if (!chk || j == (BTEqualStrategyNumber - 1))
                         continue;
@@ -878,6 +2739,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
                     }
  
                     if (_bt_compare_scankey_args(scan, chk, eq, chk,
+                                                array, orderproc,
                                                  &test_result))
                     {
                         if (!test_result)
@@ -887,7 +2749,9 @@ _bt_preprocess_keys(IndexScanDesc scan)
                             return;
                         }
                         /* else discard the redundant non-equality key */
-                       xform[j] = NULL;
+                       Assert(!array || array->num_elems > 0);
+                       xform[j].skey = NULL;
+                       xform[j].ikey = -1;
                     }
                     /* else, cannot determine redundancy, keep both keys */
                 }
@@ -896,36 +2760,36 @@ _bt_preprocess_keys(IndexScanDesc scan)
             }
  
             /* try to keep only one of <, <= */
-           if (xform[BTLessStrategyNumber - 1]
-               && xform[BTLessEqualStrategyNumber - 1])
+           if (xform[BTLessStrategyNumber - 1].skey
+               && xform[BTLessEqualStrategyNumber - 1].skey)
             {
-               ScanKey     lt = xform[BTLessStrategyNumber - 1];
-               ScanKey     le = xform[BTLessEqualStrategyNumber - 1];
+               ScanKey     lt = xform[BTLessStrategyNumber - 1].skey;
+               ScanKey     le = xform[BTLessEqualStrategyNumber - 1].skey;
  
-               if (_bt_compare_scankey_args(scan, le, lt, le,
+               if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
                                              &test_result))
                 {
                     if (test_result)
-                       xform[BTLessEqualStrategyNumber - 1] = NULL;
+                       xform[BTLessEqualStrategyNumber - 1].skey = NULL;
                     else
-                       xform[BTLessStrategyNumber - 1] = NULL;
+                       xform[BTLessStrategyNumber - 1].skey = NULL;
                 }
             }
  
             /* try to keep only one of >, >= */
-           if (xform[BTGreaterStrategyNumber - 1]
-               && xform[BTGreaterEqualStrategyNumber - 1])
+           if (xform[BTGreaterStrategyNumber - 1].skey
+               && xform[BTGreaterEqualStrategyNumber - 1].skey)
             {
-               ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
-               ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];
+               ScanKey     gt = xform[BTGreaterStrategyNumber - 1].skey;
+               ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1].skey;
  
-               if (_bt_compare_scankey_args(scan, ge, gt, ge,
+               if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
                                              &test_result))
                 {
                     if (test_result)
-                       xform[BTGreaterEqualStrategyNumber - 1] = NULL;
+                       xform[BTGreaterEqualStrategyNumber - 1].skey = NULL;
                     else
-                       xform[BTGreaterStrategyNumber - 1] = NULL;
+                       xform[BTGreaterStrategyNumber - 1].skey = NULL;
                 }
             }
  
@@ -936,11 +2800,13 @@ _bt_preprocess_keys(IndexScanDesc scan)
              */
             for (j = BTMaxStrategyNumber; --j >= 0;)
             {
-               if (xform[j])
+               if (xform[j].skey)
                 {
                     ScanKey     outkey = &outkeys[new_numberOfKeys++];
  
-                   memcpy(outkey, xform[j], sizeof(ScanKeyData));
+                   memcpy(outkey, xform[j].skey, sizeof(ScanKeyData));
+                   if (arrayKeyData)
+                       keyDataMap[new_numberOfKeys - 1] = xform[j].ikey;
                     if (priorNumberOfEqualCols == attno - 1)
                         _bt_mark_scankey_required(outkey);
                 }
@@ -966,6 +2832,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
             ScanKey     outkey = &outkeys[new_numberOfKeys++];
  
             memcpy(outkey, cur, sizeof(ScanKeyData));
+           if (arrayKeyData)
+               keyDataMap[new_numberOfKeys - 1] = i;
             if (numberOfEqualCols == attno - 1)
                 _bt_mark_scankey_required(outkey);
  
@@ -977,20 +2845,112 @@ _bt_preprocess_keys(IndexScanDesc scan)
             continue;
         }
  
-       /* have we seen one of these before? */
-       if (xform[j] == NULL)
+       /*
+        * Does this input scan key require further processing as an array?
+        */
+       if (cur->sk_strategy == InvalidStrategy)
+       {
+           /* _bt_preprocess_array_keys marked this array key redundant */
+           Assert(arrayKeyData);
+           Assert(cur->sk_flags & SK_SEARCHARRAY);
+           continue;
+       }
+
+       if (cur->sk_strategy == BTEqualStrategyNumber &&
+           (cur->sk_flags & SK_SEARCHARRAY))
         {
-           /* nope, so remember this scankey */
-           xform[j] = cur;
+           /* _bt_preprocess_array_keys kept this array key */
+           Assert(arrayKeyData);
+           arrayidx++;
+       }
+
+       /*
+        * have we seen a scan key for this same attribute and using this same
+        * operator strategy before now?
+        */
+       if (xform[j].skey == NULL)
+       {
+           /* nope, so this scan key wins by default (at least for now) */
+           xform[j].skey = cur;
+           xform[j].ikey = i;
+           xform[j].arrayidx = arrayidx;
         }
         else
         {
-           /* yup, keep only the more restrictive key */
-           if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
-                                        &test_result))
+           FmgrInfo   *orderproc = NULL;
+           BTArrayKeyInfo *array = NULL;
+
+           /*
+            * Seen one of these before, so keep only the more restrictive key
+            * if possible
+            */
+           if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
+           {
+               /*
+                * Have to set up array keys
+                */
+               if ((cur->sk_flags & SK_SEARCHARRAY))
+               {
+                   array = &so->arrayKeys[arrayidx - 1];
+                   orderproc = so->orderProcs + i;
+
+                   Assert(array->scan_key == i);
+                   Assert(OidIsValid(orderproc->fn_oid));
+               }
+               else if ((xform[j].skey->sk_flags & SK_SEARCHARRAY))
+               {
+                   array = &so->arrayKeys[xform[j].arrayidx - 1];
+                   orderproc = so->orderProcs + xform[j].ikey;
+
+                   Assert(array->scan_key == xform[j].ikey);
+                   Assert(OidIsValid(orderproc->fn_oid));
+               }
+
+               /*
+                * Both scan keys might have arrays, in which case we'll
+                * arbitrarily pass only one of the arrays.  That won't
+                * matter, since _bt_compare_scankey_args is aware that two
+                * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
+                * failed to eliminate redundant arrays through array merging.
+                * _bt_compare_scankey_args just returns false when it sees
+                * this; it won't even try to examine either array.
+                */
+           }
+
+           if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey,
+                                        array, orderproc, &test_result))
             {
+               /* Have all we need to determine redundancy */
                 if (test_result)
-                   xform[j] = cur;
+               {
+                   Assert(!array || array->num_elems > 0);
+
+                   /*
+                    * New key is more restrictive, and so replaces old key...
+                    */
+                   if (j != (BTEqualStrategyNumber - 1) ||
+                       !(xform[j].skey->sk_flags & SK_SEARCHARRAY))
+                   {
+                       Assert(!array || array->scan_key == i);
+                       xform[j].skey = cur;
+                       xform[j].ikey = i;
+                       xform[j].arrayidx = arrayidx;
+                   }
+                   else
+                   {
+                       /*
+                        * ...unless we have to keep the old key because it's
+                        * an array that rendered the new key redundant.  We
+                        * need to make sure that we don't throw away an array
+                        * scan key.  _bt_compare_scankey_args expects us to
+                        * always keep arrays (and discard non-arrays).
+                        */
+                       Assert(j == (BTEqualStrategyNumber - 1));
+                       Assert(xform[j].skey->sk_flags & SK_SEARCHARRAY);
+                       Assert(xform[j].ikey == array->scan_key);
+                       Assert(!(cur->sk_flags & SK_SEARCHARRAY));
+                   }
+               }
                 else if (j == (BTEqualStrategyNumber - 1))
                 {
                     /* key == a && key == b, but a != b */
@@ -1002,21 +2962,129 @@ _bt_preprocess_keys(IndexScanDesc scan)
             else
             {
                 /*
-                * We can't determine which key is more restrictive.  Keep the
-                * previous one in xform[j] and push this one directly to the
-                * output array.
+                * We can't determine which key is more restrictive.  Push
+                * xform[j] directly to the output array, then set xform[j] to
+                * the new scan key.
+                *
+                * Note: We do things this way around so that our arrays are
+                * always in the same order as their corresponding scan keys,
+                * even with incomplete opfamilies.  _bt_advance_array_keys
+                * depends on this.
                  */
                 ScanKey     outkey = &outkeys[new_numberOfKeys++];
  
-               memcpy(outkey, cur, sizeof(ScanKeyData));
+               memcpy(outkey, xform[j].skey, sizeof(ScanKeyData));
+               if (arrayKeyData)
+                   keyDataMap[new_numberOfKeys - 1] = xform[j].ikey;
                 if (numberOfEqualCols == attno - 1)
                     _bt_mark_scankey_required(outkey);
+               xform[j].skey = cur;
+               xform[j].ikey = i;
+               xform[j].arrayidx = arrayidx;
             }
         }
     }
  
     so->numberOfKeys = new_numberOfKeys;
+
+   /*
+    * Now that we've built a temporary mapping from so->keyData[] (output
+    * scan keys) to scan->keyData[] (input scan keys), fix array->scan_key
+    * references.  Also consolidate the so->orderProcs[] array such that it
+    * can be subscripted using so->keyData[]-wise offsets.
+    */
+   if (arrayKeyData)
+       _bt_preprocess_array_keys_final(scan, keyDataMap);
+
+   /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
+}
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * Verify that the scan's qual state matches what we expect at the point that
+ * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
+ *
+ * We enforce a rule against non-required array scan keys: they must start out
+ * with whatever element is the first for the scan's current scan direction.
+ * See _bt_rewind_nonrequired_arrays comments for an explanation.
+ */
+static bool
+_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   int         arrayidx = 0;
+
+   for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+   {
+       ScanKey     cur = so->keyData + ikey;
+       BTArrayKeyInfo *array = NULL;
+       int         first_elem_dir;
+
+       if (!(cur->sk_flags & SK_SEARCHARRAY) ||
+           cur->sk_strategy != BTEqualStrategyNumber)
+           continue;
+
+       array = &so->arrayKeys[arrayidx++];
+
+       if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+           ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+           continue;
+
+       if (ScanDirectionIsForward(dir))
+           first_elem_dir = 0;
+       else
+           first_elem_dir = array->num_elems - 1;
+
+       if (array->cur_elem != first_elem_dir)
+           return false;
+   }
+
+   return _bt_verify_keys_with_arraykeys(scan);
+}
+
+/*
+ * Verify that the scan's "so->keyData[]" scan keys are in agreement with
+ * its array key state
+ */
+static bool
+_bt_verify_keys_with_arraykeys(IndexScanDesc scan)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   int         last_sk_attno = InvalidAttrNumber,
+               arrayidx = 0;
+
+   if (!so->qual_ok)
+       return false;
+
+   for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+   {
+       ScanKey     cur = so->keyData + ikey;
+       BTArrayKeyInfo *array;
+
+       if (cur->sk_strategy != BTEqualStrategyNumber ||
+           !(cur->sk_flags & SK_SEARCHARRAY))
+           continue;
+
+       array = &so->arrayKeys[arrayidx++];
+       if (array->scan_key != ikey)
+           return false;
+
+       if (array->num_elems <= 0)
+           return false;
+
+       if (cur->sk_argument != array->elem_values[array->cur_elem])
+           return false;
+       if (last_sk_attno > cur->sk_attno)
+           return false;
+       last_sk_attno = cur->sk_attno;
+   }
+
+   if (arrayidx != so->numArrayKeys)
+       return false;
+
+   return true;
  }
+#endif
  
  /*
   * Compare two scankey values using a specified operator.
@@ -1033,9 +3101,24 @@ _bt_preprocess_keys(IndexScanDesc scan)
   * we store the operator result in *result and return true.  We return false
   * if the comparison could not be made.
   *
+ * If either leftarg or rightarg is an array, we'll apply array-specific
+ * rules to determine which array elements are redundant on behalf of caller.
+ * It is up to our caller to save whichever of the two scan keys is the array,
+ * and discard the non-array scan key (the non-array scan key is guaranteed to
+ * be redundant with any complete opfamily).  Caller isn't expected to call
+ * here with a pair of array scan keys provided we're dealing with a complete
+ * opfamily (_bt_preprocess_array_keys will merge array keys together to make
+ * sure of that).
+ *
+ * Note: we'll also shrink caller's array as needed to eliminate redundant
+ * array elements.  One reason why caller should prefer to discard non-array
+ * scan keys is so that we'll have the opportunity to shrink the array
+ * multiple times, in multiple calls (for each of several other scan keys on
+ * the same index attribute).
+ *
   * Note: op always points at the same ScanKey as either leftarg or rightarg.
- * Since we don't scribble on the scankeys, this aliasing should cause no
- * trouble.
+ * Since we don't scribble on the scankeys themselves, this aliasing should
+ * cause no trouble.
   *
   * Note: this routine needs to be insensitive to any DESC option applied
   * to the index column.  For example, "x < 4" is a tighter constraint than
@@ -1044,6 +3127,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
  static bool
  _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                          ScanKey leftarg, ScanKey rightarg,
+                        BTArrayKeyInfo *array, FmgrInfo *orderproc,
                          bool *result)
  {
     Relation    rel = scan->indexRelation;
@@ -1112,6 +3196,48 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
         return true;
     }
  
+   /*
+    * If either leftarg or rightarg is an equality-type array scankey, we need
+    * specialized handling (since by now we know that IS NULL wasn't used)
+    */
+   if (array)
+   {
+       bool        leftarray,
+                   rightarray;
+
+       leftarray = ((leftarg->sk_flags & SK_SEARCHARRAY) &&
+                    leftarg->sk_strategy == BTEqualStrategyNumber);
+       rightarray = ((rightarg->sk_flags & SK_SEARCHARRAY) &&
+                     rightarg->sk_strategy == BTEqualStrategyNumber);
+
+       /*
+        * _bt_preprocess_array_keys is responsible for merging together array
+        * scan keys, and will do so whenever the opfamily has the required
+        * cross-type support.  If it failed to do that, we handle it just
+        * like the case where we can't make the comparison ourselves.
+        */
+       if (leftarray && rightarray)
+       {
+           /* Can't make the comparison */
+           *result = false;    /* suppress compiler warnings */
+           return false;
+       }
+
+       /*
+        * Otherwise we need to determine if either one of leftarg or rightarg
+        * uses an array, then pass this through to a dedicated helper
+        * function.
+        */
+       if (leftarray)
+           return _bt_compare_array_scankey_args(scan, leftarg, rightarg,
+                                                 orderproc, array, result);
+       else if (rightarray)
+           return _bt_compare_array_scankey_args(scan, rightarg, leftarg,
+                                                 orderproc, array, result);
+
+       /* FALL THRU */
+   }
+
     /*
      * The opfamily we need to worry about is identified by the index column.
      */
@@ -1351,60 +3477,234 @@ _bt_mark_scankey_required(ScanKey skey)
   *
   * Return true if so, false if not.  If the tuple fails to pass the qual,
   * we also determine whether there's any need to continue the scan beyond
- * this tuple, and set *continuescan accordingly.  See comments for
+ * this tuple, and set pstate.continuescan accordingly.  See comments for
   * _bt_preprocess_keys(), above, about how this is done.
   *
   * Forward scan callers can pass a high key tuple in the hopes of having
   * us set *continuescan to false, and avoiding an unnecessary visit to
   * the page to the right.
   *
+ * Advances the scan's array keys when necessary for arrayKeys=true callers.
+ * Caller can avoid all array related side-effects when calling just to do a
+ * page continuescan precheck -- pass arrayKeys=false for that.  Scans without
+ * any array keys must always pass arrayKeys=false.
+ *
+ * Also stops and starts primitive index scans for arrayKeys=true callers.
+ * Scans with array keys are required to set up page state that helps us with
+ * this.  The page's finaltup tuple (the page high key for a forward scan, or
+ * the page's first non-pivot tuple for a backward scan) must be set in
+ * pstate.finaltup ahead of the first call here for the page (or possibly the
+ * first call after an initial continuescan-setting page precheck call).  Set
+ * this to NULL for rightmost page (or the leftmost page for backwards scans).
+ *
   * scan: index scan descriptor (containing a search-type scankey)
+ * pstate: page level input and output parameters
+ * arrayKeys: should we advance the scan's array keys if necessary?
   * tuple: index tuple to test
   * tupnatts: number of attributes in tupnatts (high key may be truncated)
- * dir: direction we are scanning in
- * continuescan: output parameter (will be set correctly in all cases)
- * continuescanPrechecked: indicates that *continuescan flag is known to
- *                            be true for the last item on the page
- * haveFirstMatch: indicates that we already have at least one match
- *                               in the current page
   */
  bool
-_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
-             ScanDirection dir, bool *continuescan,
-             bool continuescanPrechecked, bool haveFirstMatch)
+_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
+             IndexTuple tuple, int tupnatts)
  {
-   TupleDesc   tupdesc;
-   BTScanOpaque so;
-   int         keysz;
-   int         ikey;
-   ScanKey     key;
+   TupleDesc   tupdesc = RelationGetDescr(scan->indexRelation);
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+   ScanDirection dir = pstate->dir;
+   int         ikey = 0;
+   bool        res;
  
     Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
  
-   *continuescan = true;       /* default assumption */
+   res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
+                           arrayKeys, pstate->prechecked, pstate->firstmatch,
+                           &pstate->continuescan, &ikey);
+
+#ifdef USE_ASSERT_CHECKING
+   if (!arrayKeys && so->numArrayKeys)
+   {
+       /*
+        * This is a continuescan precheck call for a scan with array keys.
+        *
+        * Assert that the scan isn't in danger of becoming confused.
+        */
+       Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
+       Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
+                                            tupnatts, false, 0, NULL));
+   }
+   if (pstate->prechecked || pstate->firstmatch)
+   {
+       bool        dcontinuescan;
+       int         dikey = 0;
+
+       /*
+        * Call relied on continuescan/firstmatch prechecks -- assert that we
+        * get the same answer without those optimizations
+        */
+       Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
+                                       false, false, false,
+                                       &dcontinuescan, &dikey));
+       Assert(pstate->continuescan == dcontinuescan);
+   }
+#endif
+
+   /*
+    * Only one _bt_check_compare call is required in the common case where
+    * there are no equality strategy array scan keys.  Otherwise we can only
+    * accept _bt_check_compare's answer unreservedly when it didn't set
+    * pstate.continuescan=false.
+    */
+   if (!arrayKeys || pstate->continuescan)
+       return res;
+
+   /*
+    * _bt_check_compare call set continuescan=false in the presence of
+    * equality type array keys.  This could mean that the tuple is just past
+    * the end of matches for the current array keys.
+    *
+    * It's also possible that the scan is still _before_ the _start_ of
+    * tuples matching the current set of array keys.  Check for that first.
+    */
+   if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
+                                    ikey, NULL))
+   {
+       /*
+        * Tuple is still before the start of matches according to the scan's
+        * required array keys (according to _all_ of its required equality
+        * strategy keys, actually).
+        *
+        * _bt_advance_array_keys occasionally sets so->scanBehind to signal
+        * that the scan's current position/tuples might be significantly
+        * behind (multiple pages behind) its current array keys.  When this
+        * happens, we need to be prepared to recover by starting a new
+        * primitive index scan here, on our own.
+        */
+       Assert(!so->scanBehind ||
+              so->keyData[ikey].sk_strategy == BTEqualStrategyNumber);
+       if (unlikely(so->scanBehind) && pstate->finaltup &&
+           _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
+                                        BTreeTupleGetNAtts(pstate->finaltup,
+                                                           scan->indexRelation),
+                                        false, 0, NULL))
+       {
+           /* Cut our losses -- start a new primitive index scan now */
+           pstate->continuescan = false;
+           so->needPrimScan = true;
+       }
+       else
+       {
+           /* Override _bt_check_compare, continue primitive scan */
+           pstate->continuescan = true;
+
+           /*
+            * We will end up here repeatedly given a group of tuples > the
+            * previous array keys and < the now-current keys (for a backwards
+            * scan it's just the same, though the operators swap positions).
+            *
+            * We must avoid allowing this linear search process to scan very
+            * many tuples from well before the start of tuples matching the
+            * current array keys (or from well before the point where we'll
+            * once again have to advance the scan's array keys).
+            *
+            * We keep the overhead under control by speculatively "looking
+            * ahead" to later still-unscanned items from this same leaf page.
+            * We'll only attempt this once the number of tuples that the
+            * linear search process has examined starts to get out of hand.
+            */
+           pstate->rechecks++;
+           if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
+           {
+               /* See if we should skip ahead within the current leaf page */
+               _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
+
+               /*
+                * Might have set pstate.skip to a later page offset.  When
+                * that happens then _bt_readpage caller will inexpensively
+                * skip ahead to a later tuple from the same page (the one
+                * just after the tuple we successfully "looked ahead" to).
+                */
+           }
+       }
  
-   tupdesc = RelationGetDescr(scan->indexRelation);
-   so = (BTScanOpaque) scan->opaque;
-   keysz = so->numberOfKeys;
+       /* This indextuple doesn't match the current qual, in any case */
+       return false;
+   }
+
+   /*
+    * Caller's tuple is >= the current set of array keys and other equality
+    * constraint scan keys (or <= if this is a backwards scan).  It's now
+    * clear that we _must_ advance any required array keys in lockstep with
+    * the scan.
+    */
+   return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
+                                 ikey, true);
+}
+
+/*
+ * Test whether an indextuple satisfies current scan condition.
+ *
+ * Return true if so, false if not.  If not, also sets *continuescan to false
+ * when it's also not possible for any later tuples to pass the current qual
+ * (with the scan's current set of array keys, in the current scan direction),
+ * in addition to setting *ikey to the so->keyData[] subscript/offset for the
+ * unsatisfied scan key (needed when caller must consider advancing the scan's
+ * array keys).
+ *
+ * This is a subroutine for _bt_checkkeys.  We provisionally assume that
+ * reaching the end of the current set of required keys (in particular the
+ * current required array keys) ends the ongoing (primitive) index scan.
+ * Callers without array keys should just end the scan right away when they
+ * find that continuescan has been set to false here by us.  Things are more
+ * complicated for callers with array keys.
+ *
+ * Callers with array keys must first consider advancing the arrays when
+ * continuescan has been set to false here by us.  They must then consider if
+ * it really does make sense to end the current (primitive) index scan, in
+ * light of everything that is known at that point.  (In general when we set
+ * continuescan=false for these callers it must be treated as provisional.)
+ *
+ * We deal with advancing unsatisfied non-required arrays directly, though.
+ * This is safe, since by definition non-required keys can't end the scan.
+ * This is just how we determine if non-required arrays are just unsatisfied
+ * by the current array key, or if they're truly unsatisfied (that is, if
+ * they're unsatisfied by every possible array key).
+ *
+ * Though we advance non-required array keys on our own, that shouldn't have
+ * any lasting consequences for the scan.  By definition, non-required arrays
+ * have no fixed relationship with the scan's progress.  (There are delicate
+ * considerations for non-required arrays when the arrays need to be advanced
+ * following our setting continuescan to false, but that doesn't concern us.)
+ *
+ * Pass advancenonrequired=false to avoid all array related side effects.
+ * This allows _bt_advance_array_keys caller to avoid infinite recursion.
+ */
+static bool
+_bt_check_compare(IndexScanDesc scan, ScanDirection dir,
+                 IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+                 bool advancenonrequired, bool prechecked, bool firstmatch,
+                 bool *continuescan, int *ikey)
+{
+   BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+   *continuescan = true;       /* default assumption */
  
-   for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
+   for (; *ikey < so->numberOfKeys; (*ikey)++)
     {
+       ScanKey     key = so->keyData + *ikey;
         Datum       datum;
         bool        isNull;
-       Datum       test;
         bool        requiredSameDir = false,
-                   requiredOppositeDir = false;
+                   requiredOppositeDirOnly = false;
  
         /*
-        * Check if the key is required for ordered scan in the same or
-        * opposite direction.  Save as flag variables for future usage.
+        * Check if the key is required in the current scan direction, in the
+        * opposite scan direction _only_, or in neither direction
          */
         if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
             ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
             requiredSameDir = true;
         else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
                  ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
-           requiredOppositeDir = true;
+           requiredOppositeDirOnly = true;
  
         /*
          * If the caller told us the *continuescan flag is known to be true
@@ -1422,8 +3722,9 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
          * Both cases above work except for the row keys, where NULLs could be
          * found in the middle of matching values.
          */
-       if ((requiredSameDir || (requiredOppositeDir && haveFirstMatch)) &&
-           !(key->sk_flags & SK_ROW_HEADER) && continuescanPrechecked)
+       if (prechecked &&
+           (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) &&
+           !(key->sk_flags & SK_ROW_HEADER))
             continue;
  
         if (key->sk_attno > tupnatts)
@@ -1434,7 +3735,6 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
              * right could be any possible value.  Assume that truncated
              * attribute passes the qual.
              */
-           Assert(ScanDirectionIsForward(dir));
             Assert(BTreeTupleIsPivot(tuple));
             continue;
         }
@@ -1495,6 +3795,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
                  * because it's not possible for any future tuples to pass. On
                  * a forward scan, however, we must keep going, because we may
                  * have initially positioned to the start of the index.
+                * (_bt_advance_array_keys also relies on this behavior during
+                * forward scans.)
                  */
                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                     ScanDirectionIsBackward(dir))
@@ -1511,6 +3813,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
                  * because it's not possible for any future tuples to pass. On
                  * a backward scan, however, we must keep going, because we
                  * may have initially positioned to the end of the index.
+                * (_bt_advance_array_keys also relies on this behavior during
+                * backward scans.)
                  */
                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                     ScanDirectionIsForward(dir))
@@ -1524,24 +3828,15 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
         }
  
         /*
-        * Apply the key-checking function.  When the key is required for the
-        * opposite direction scan, it must be already satisfied as soon as
-        * there is already match on the page.  Except for the NULLs checking,
-        * which have already done above.
+        * Apply the key-checking function, though only if we must.
+        *
+        * When a key is required in the opposite-of-scan direction _only_,
+        * then it must already be satisfied if firstmatch=true indicates that
+        * an earlier tuple from this same page satisfied it earlier on.
          */
-       if (!(requiredOppositeDir && haveFirstMatch))
-       {
-           test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
-                                    datum, key->sk_argument);
-       }
-       else
-       {
-           test = true;
-           Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
-                                            datum, key->sk_argument));
-       }
-
-       if (!DatumGetBool(test))
+       if (!(requiredOppositeDirOnly && firstmatch) &&
+           !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation,
+                                           datum, key->sk_argument)))
         {
             /*
              * Tuple fails this qual.  If it's a required qual for the current
@@ -1557,7 +3852,19 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
                 *continuescan = false;
  
             /*
-            * In any case, this indextuple doesn't match the qual.
+            * If this is a non-required equality-type array key, the tuple
+            * needs to be checked against every possible array key.  Handle
+            * this by "advancing" the scan key's array to a matching value
+            * (if we're successful then the tuple might match the qual).
+            */
+           else if (advancenonrequired &&
+                    key->sk_strategy == BTEqualStrategyNumber &&
+                    (key->sk_flags & SK_SEARCHARRAY))
+               return _bt_advance_array_keys(scan, NULL, tuple, tupnatts,
+                                             tupdesc, *ikey, false);
+
+           /*
+            * This indextuple doesn't match the qual.
              */
             return false;
         }
@@ -1574,7 +3881,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
   * it's not possible for any future tuples in the current scan direction
   * to pass the qual.
   *
- * This is a subroutine for _bt_checkkeys, which see for more info.
+ * This is a subroutine for _bt_checkkeys/_bt_check_compare.
   */
  static bool
  _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
@@ -1603,7 +3910,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
              * right could be any possible value.  Assume that truncated
              * attribute passes the qual.
              */
-           Assert(ScanDirectionIsForward(dir));
             Assert(BTreeTupleIsPivot(tuple));
             cmpresult = 0;
             if (subkey->sk_flags & SK_ROW_END)
@@ -1630,6 +3936,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
                  * because it's not possible for any future tuples to pass. On
                  * a forward scan, however, we must keep going, because we may
                  * have initially positioned to the start of the index.
+                * (_bt_advance_array_keys also relies on this behavior during
+                * forward scans.)
                  */
                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                     ScanDirectionIsBackward(dir))
@@ -1646,6 +3954,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
                  * because it's not possible for any future tuples to pass. On
                  * a backward scan, however, we must keep going, because we
                  * may have initially positioned to the end of the index.
+                * (_bt_advance_array_keys also relies on this behavior during
+                * backward scans.)
                  */
                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                     ScanDirectionIsForward(dir))
@@ -1741,6 +4051,90 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
     return result;
  }
  
+/*
+ * Determine if a scan with array keys should skip over uninteresting tuples.
+ *
+ * This is a subroutine for _bt_checkkeys.  Called when _bt_readpage's linear
+ * search process (started after it finishes reading an initial group of
+ * matching tuples, used to locate the start of the next group of tuples
+ * matching the next set of required array keys) has already scanned an
+ * excessive number of tuples whose key space is "between arrays".
+ *
+ * When we perform look ahead successfully, we'll set pstate.skip, which
+ * instructs _bt_readpage to skip ahead to that tuple next (could be past the
+ * end of the scan's leaf page).  Pages where the optimization is effective
+ * will generally still need to skip several times.  Each call here performs
+ * only a single "look ahead" comparison of a later tuple, whose distance from
+ * the current tuple's offset number is determined by applying heuristics.
+ */
+static void
+_bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+                        int tupnatts, TupleDesc tupdesc)
+{
+   ScanDirection dir = pstate->dir;
+   OffsetNumber aheadoffnum;
+   IndexTuple  ahead;
+
+   /* Avoid looking ahead when comparing the page high key */
+   if (pstate->offnum < pstate->minoff)
+       return;
+
+   /*
+    * Don't look ahead when there aren't enough tuples remaining on the page
+    * (in the current scan direction) for it to be worth our while
+    */
+   if (ScanDirectionIsForward(dir) &&
+       pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE)
+       return;
+   else if (ScanDirectionIsBackward(dir) &&
+            pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE)
+       return;
+
+   /*
+    * The look ahead distance starts small, and ramps up as each call here
+    * allows _bt_readpage to skip over more tuples
+    */
+   if (!pstate->targetdistance)
+       pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE;
+   else
+       pstate->targetdistance *= 2;
+
+   /* Don't read past the end (or before the start) of the page, though */
+   if (ScanDirectionIsForward(dir))
+       aheadoffnum = Min((int) pstate->maxoff,
+                         (int) pstate->offnum + pstate->targetdistance);
+   else
+       aheadoffnum = Max((int) pstate->minoff,
+                         (int) pstate->offnum - pstate->targetdistance);
+
+   ahead = (IndexTuple) PageGetItem(pstate->page,
+                                    PageGetItemId(pstate->page, aheadoffnum));
+   if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts,
+                                    false, 0, NULL))
+   {
+       /*
+        * Success -- instruct _bt_readpage to skip ahead to very next tuple
+        * after the one we determined was still before the current array keys
+        */
+       if (ScanDirectionIsForward(dir))
+           pstate->skip = aheadoffnum + 1;
+       else
+           pstate->skip = aheadoffnum - 1;
+   }
+   else
+   {
+       /*
+        * Failure -- "ahead" tuple is too far ahead (we were too aggressive).
+        *
+        * Reset the number of rechecks, and aggressively reduce the target
+        * distance (we're much more aggressive here than we were when the
+        * distance was initially ramped up).
+        */
+       pstate->rechecks = 0;
+       pstate->targetdistance = Max(pstate->targetdistance / 8, 1);
+   }
+}
+
  /*
   * _bt_killitems - set LP_DEAD state for items an indexscan caller has
   * told us were killed
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c

index 9e35aaf56e50b1e22d30b7be0805d6d88867f9ec..fcf6d1d932184a36661590a4961600f5aba084e1 100644 (file)
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -628,6 +628,8 @@ ExecIndexOnlyScanEstimate(IndexOnlyScanState *node,
     EState     *estate = node->ss.ps.state;
  
     node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc,
+                                                     node->ioss_NumScanKeys,
+                                                     node->ioss_NumOrderByKeys,
                                                       estate->es_snapshot);
     shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen);
     shm_toc_estimate_keys(&pcxt->estimator, 1);
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c

index 2a3264599d283d1ae1aded251d1db97deb62e7d5..8000feff4c9f94507b3da4609fde516bc3ac4668 100644 (file)
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -1644,6 +1644,8 @@ ExecIndexScanEstimate(IndexScanState *node,
     EState     *estate = node->ss.ps.state;
  
     node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc,
+                                                    node->iss_NumScanKeys,
+                                                    node->iss_NumOrderByKeys,
                                                      estate->es_snapshot);
     shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen);
     shm_toc_estimate_keys(&pcxt->estimator, 1);
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c

index 32c6a8bbdcbb9e3ff9b3b36866232cc654fe245b..2230b13104743e500aa061b97cef2a567acdfe80 100644 (file)
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel,
                                IndexOptInfo *index, IndexClauseSet *clauses,
                                bool useful_predicate,
                                ScanTypeControl scantype,
-                              bool *skip_nonnative_saop,
-                              bool *skip_lower_saop);
+                              bool *skip_nonnative_saop);
  static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
                                 List *clauses, List *other_clauses);
  static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
@@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids,
   * index AM supports them natively, we should just include them in simple
   * index paths.  If not, we should exclude them while building simple index
   * paths, and then make a separate attempt to include them in bitmap paths.
- * Furthermore, we should consider excluding lower-order ScalarArrayOpExpr
- * quals so as to create ordered paths.
   */
  static void
  get_index_paths(PlannerInfo *root, RelOptInfo *rel,
@@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
  {
     List       *indexpaths;
     bool        skip_nonnative_saop = false;
-   bool        skip_lower_saop = false;
     ListCell   *lc;
  
     /*
      * Build simple index paths using the clauses.  Allow ScalarArrayOpExpr
-    * clauses only if the index AM supports them natively, and skip any such
-    * clauses for index columns after the first (so that we produce ordered
-    * paths if possible).
+    * clauses only if the index AM supports them natively.
      */
     indexpaths = build_index_paths(root, rel,
                                    index, clauses,
                                    index->predOK,
                                    ST_ANYSCAN,
-                                  &skip_nonnative_saop,
-                                  &skip_lower_saop);
-
-   /*
-    * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM
-    * that supports them, then try again including those clauses.  This will
-    * produce paths with more selectivity but no ordering.
-    */
-   if (skip_lower_saop)
-   {
-       indexpaths = list_concat(indexpaths,
-                                build_index_paths(root, rel,
-                                                  index, clauses,
-                                                  index->predOK,
-                                                  ST_ANYSCAN,
-                                                  &skip_nonnative_saop,
-                                                  NULL));
-   }
+                                  &skip_nonnative_saop);
  
     /*
      * Submit all the ones that can form plain IndexScan plans to add_path. (A
@@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
                                        index, clauses,
                                        false,
                                        ST_BITMAPSCAN,
-                                      NULL,
                                        NULL);
         *bitindexpaths = list_concat(*bitindexpaths, indexpaths);
     }
@@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
   * to true if we found any such clauses (caller must initialize the variable
   * to false).  If it's NULL, we do not ignore ScalarArrayOpExpr clauses.
   *
- * If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for
- * non-first index columns, and we set *skip_lower_saop to true if we found
- * any such clauses (caller must initialize the variable to false).  If it's
- * NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will
- * result in considering the scan's output to be unordered.
- *
   * 'rel' is the index's heap relation
   * 'index' is the index for which we want to generate paths
   * 'clauses' is the collection of indexable clauses (IndexClause nodes)
   * 'useful_predicate' indicates whether the index has a useful predicate
   * 'scantype' indicates whether we need plain or bitmap scan support
   * 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't
- * 'skip_lower_saop' indicates whether to accept non-first-column SAOP
   */
  static List *
  build_index_paths(PlannerInfo *root, RelOptInfo *rel,
                   IndexOptInfo *index, IndexClauseSet *clauses,
                   bool useful_predicate,
                   ScanTypeControl scantype,
-                 bool *skip_nonnative_saop,
-                 bool *skip_lower_saop)
+                 bool *skip_nonnative_saop)
  {
     List       *result = NIL;
     IndexPath  *ipath;
@@ -848,12 +816,13 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
     List       *orderbyclausecols;
     List       *index_pathkeys;
     List       *useful_pathkeys;
-   bool        found_lower_saop_clause;
     bool        pathkeys_possibly_useful;
     bool        index_is_ordered;
     bool        index_only_scan;
     int         indexcol;
  
+   Assert(skip_nonnative_saop != NULL || scantype == ST_BITMAPSCAN);
+
     /*
      * Check that index supports the desired scan type(s)
      */
@@ -880,19 +849,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
      * on by btree and possibly other places.)  The list can be empty, if the
      * index AM allows that.
      *
-    * found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr
-    * index clause for a non-first index column.  This prevents us from
-    * assuming that the scan result is ordered.  (Actually, the result is
-    * still ordered if there are equality constraints for all earlier
-    * columns, but it seems too expensive and non-modular for this code to be
-    * aware of that refinement.)
-    *
      * We also build a Relids set showing which outer rels are required by the
      * selected clauses.  Any lateral_relids are included in that, but not
      * otherwise accounted for.
      */
     index_clauses = NIL;
-   found_lower_saop_clause = false;
     outer_relids = bms_copy(rel->lateral_relids);
     for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++)
     {
@@ -903,30 +864,18 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
             IndexClause *iclause = (IndexClause *) lfirst(lc);
             RestrictInfo *rinfo = iclause->rinfo;
  
-           /* We might need to omit ScalarArrayOpExpr clauses */
-           if (IsA(rinfo->clause, ScalarArrayOpExpr))
+           if (skip_nonnative_saop && !index->amsearcharray &&
+               IsA(rinfo->clause, ScalarArrayOpExpr))
             {
-               if (!index->amsearcharray)
-               {
-                   if (skip_nonnative_saop)
-                   {
-                       /* Ignore because not supported by index */
-                       *skip_nonnative_saop = true;
-                       continue;
-                   }
-                   /* Caller had better intend this only for bitmap scan */
-                   Assert(scantype == ST_BITMAPSCAN);
-               }
-               if (indexcol > 0)
-               {
-                   if (skip_lower_saop)
-                   {
-                       /* Caller doesn't want to lose index ordering */
-                       *skip_lower_saop = true;
-                       continue;
-                   }
-                   found_lower_saop_clause = true;
-               }
+               /*
+                * Caller asked us to generate IndexPaths that omit any
+                * ScalarArrayOpExpr clauses when the underlying index AM
+                * lacks native support.
+                *
+                * We must omit this clause (and tell caller about it).
+                */
+               *skip_nonnative_saop = true;
+               continue;
             }
  
             /* OK to include this clause */
@@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
     /*
      * 2. Compute pathkeys describing index's ordering, if any, then see how
      * many of them are actually useful for this query.  This is not relevant
-    * if we are only trying to build bitmap indexscans, nor if we have to
-    * assume the scan is unordered.
+    * if we are only trying to build bitmap indexscans.
      */
     pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN &&
-                               !found_lower_saop_clause &&
                                 has_useful_pathkeys(root, rel));
     index_is_ordered = (index->sortopfamily != NULL);
     if (index_is_ordered && pathkeys_possibly_useful)
@@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
                                        index, &clauseset,
                                        useful_predicate,
                                        ST_BITMAPSCAN,
-                                      NULL,
                                        NULL);
         result = list_concat(result, indexpaths);
     }
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index cea777e9d40f3c2c322fa508116e1afcfb88bff1..35f8f306ee4175c228810a67415b6d13c5cc61e8 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6572,21 +6572,26 @@ genericcostestimate(PlannerInfo *root,
     selectivityQuals = add_predicate_to_index_quals(index, indexQuals);
  
     /*
-    * Check for ScalarArrayOpExpr index quals, and estimate the number of
-    * index scans that will be performed.
+    * If caller didn't give us an estimate for ScalarArrayOpExpr index scans,
+    * just assume that the number of index descents is the number of distinct
+    * combinations of array elements from all of the scan's SAOP clauses.
      */
-   num_sa_scans = 1;
-   foreach(l, indexQuals)
+   num_sa_scans = costs->num_sa_scans;
+   if (num_sa_scans < 1)
     {
-       RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
-
-       if (IsA(rinfo->clause, ScalarArrayOpExpr))
+       num_sa_scans = 1;
+       foreach(l, indexQuals)
         {
-           ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
-           double      alength = estimate_array_length(root, lsecond(saop->args));
+           RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
  
-           if (alength > 1)
-               num_sa_scans *= alength;
+           if (IsA(rinfo->clause, ScalarArrayOpExpr))
+           {
+               ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
+               double      alength = estimate_array_length(root, lsecond(saop->args));
+
+               if (alength > 1)
+                   num_sa_scans *= alength;
+           }
         }
     }
  
@@ -6813,9 +6818,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
      * For a RowCompareExpr, we consider only the first column, just as
      * rowcomparesel() does.
      *
-    * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
-    * index scans not one, but the ScalarArrayOpExpr's operator can be
-    * considered to act the same as it normally does.
+    * If there's a ScalarArrayOpExpr in the quals, we'll actually perform up
+    * to N index descents (not just one), but the ScalarArrayOpExpr's
+    * operator can be considered to act the same as it normally does.
      */
     indexBoundQuals = NIL;
     indexcol = 0;
@@ -6867,7 +6872,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
  
                 clause_op = saop->opno;
                 found_saop = true;
-               /* count number of SA scans induced by indexBoundQuals only */
+               /* estimate SA descents by indexBoundQuals only */
                 if (alength > 1)
                     num_sa_scans *= alength;
             }
@@ -6930,10 +6935,48 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                                                   NULL);
         numIndexTuples = btreeSelectivity * index->rel->tuples;
  
+       /*
+        * btree automatically combines individual ScalarArrayOpExpr primitive
+        * index scans whenever the tuples covered by the next set of array
+        * keys are close to tuples covered by the current set.  That puts a
+        * natural ceiling on the worst case number of descents -- there
+        * cannot possibly be more than one descent per leaf page scanned.
+        *
+        * Clamp the number of descents to at most 1/3 the number of index
+        * pages.  This avoids implausibly high estimates with low selectivity
+        * paths, where scans usually require only one or two descents.  This
+        * is most likely to help when there are several SAOP clauses, where
+        * naively accepting the total number of distinct combinations of
+        * array elements as the number of descents would frequently lead to
+        * wild overestimates.
+        *
+        * We somewhat arbitrarily don't just make the cutoff the total number
+        * of leaf pages (we make it 1/3 the total number of pages instead) to
+        * give the btree code credit for its ability to continue on the leaf
+        * level with low selectivity scans.
+        */
+       num_sa_scans = Min(num_sa_scans, ceil(index->pages * 0.3333333));
+       num_sa_scans = Max(num_sa_scans, 1);
+
         /*
          * As in genericcostestimate(), we have to adjust for any
          * ScalarArrayOpExpr quals included in indexBoundQuals, and then round
          * to integer.
+        *
+        * It is tempting to make genericcostestimate behave as if SAOP
+        * clauses work in almost the same way as scalar operators during
+        * btree scans, making the top-level scan look like a continuous scan
+        * (as opposed to num_sa_scans-many primitive index scans).  After
+        * all, btree scans mostly work like that at runtime.  However, such a
+        * scheme would badly bias genericcostestimate's simplistic approach
+        * to calculating numIndexPages through prorating.
+        *
+        * Stick with the approach taken by non-native SAOP scans for now.
+        * genericcostestimate will use the Mackert-Lohman formula to
+        * compensate for repeat page fetches, even though that definitely
+        * won't happen during btree scans (not for leaf pages, at least).
+        * We're usually very pessimistic about the number of primitive index
+        * scans that will be required, but it's not clear how to do better.
          */
         numIndexTuples = rint(numIndexTuples / num_sa_scans);
     }
@@ -6942,6 +6985,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
      * Now do generic index cost estimation.
      */
     costs.numIndexTuples = numIndexTuples;
+   costs.num_sa_scans = num_sa_scans;
  
     genericcostestimate(root, path, loop_count, &costs);
  
@@ -6952,9 +6996,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
      * comparisons to descend a btree of N leaf tuples.  We charge one
      * cpu_operator_cost per comparison.
      *
-    * If there are ScalarArrayOpExprs, charge this once per SA scan.  The
-    * ones after the first one are not startup cost so far as the overall
-    * plan is concerned, so add them only to "total" cost.
+    * If there are ScalarArrayOpExprs, charge this once per estimated SA
+    * index descent.  The ones after the first one are not startup cost so
+    * far as the overall plan goes, so just add them to "total" cost.
      */
     if (index->tuples > 1)      /* avoid computing log(0) */
     {
@@ -6971,7 +7015,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
      * in cases where only a single leaf page is expected to be visited.  This
      * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
      * touched.  The number of such pages is btree tree height plus one (ie,
-    * we charge for the leaf page too).  As above, charge once per SA scan.
+    * we charge for the leaf page too).  As above, charge once per estimated
+    * SA index descent.
      */
     descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost;
     costs.indexStartupCost += descentCost;
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h

index 2c6c307efcf196bac3bd072497b75202e80cbc67..00300dd720e2d268c835a30df1febdfc960c0c65 100644 (file)
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -194,7 +194,7 @@ typedef void (*amrestrpos_function) (IndexScanDesc scan);
   */
  
  /* estimate size of parallel scan descriptor */
-typedef Size (*amestimateparallelscan_function) (void);
+typedef Size (*amestimateparallelscan_function) (int nkeys, int norderbys);
  
  /* prepare for parallel index scan */
  typedef void (*aminitparallelscan_function) (void *target);
diff --git a/src/include/access/genam.h b/src/include/access/genam.h

index 8026c2b36dd0abbd68096b2474958cc25f8613e8..fdcfbe8db74d385a5e0385ee16f91842c5314abe 100644 (file)
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -165,7 +165,8 @@ extern void index_rescan(IndexScanDesc scan,
  extern void index_endscan(IndexScanDesc scan);
  extern void index_markpos(IndexScanDesc scan);
  extern void index_restrpos(IndexScanDesc scan);
-extern Size index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot);
+extern Size index_parallelscan_estimate(Relation indexRelation,
+                                       int nkeys, int norderbys, Snapshot snapshot);
  extern void index_parallelscan_initialize(Relation heapRelation,
                                           Relation indexRelation, Snapshot snapshot,
                                           ParallelIndexScanDesc target);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h

index 6eb162052e99641964696576b021d0f150fbc228..b9053219a699cfd22416f8c17c254698636338f0 100644 (file)
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -960,11 +960,20 @@ typedef struct BTScanPosData
      * moreLeft and moreRight track whether we think there may be matching
      * index entries to the left and right of the current page, respectively.
      * We can clear the appropriate one of these flags when _bt_checkkeys()
-    * returns continuescan = false.
+    * sets BTReadPageState.continuescan = false.
      */
     bool        moreLeft;
     bool        moreRight;
  
+   /*
+    * Direction of the scan at the time that _bt_readpage was called.
+    *
+    * Used by btrestrpos to "restore" the scan's array keys by resetting each
+    * array to its first element's value (first in this scan direction). This
+    * avoids the need to directly track the array keys in btmarkpos.
+    */
+   ScanDirection dir;
+
     /*
      * If we are doing an index-only scan, nextTupleOffset is the first free
      * location in the associated tuple storage workspace.
@@ -1022,9 +1031,8 @@ typedef BTScanPosData *BTScanPos;
  /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
  typedef struct BTArrayKeyInfo
  {
-   int         scan_key;       /* index of associated key in arrayKeyData */
+   int         scan_key;       /* index of associated key in keyData */
     int         cur_elem;       /* index of current element in elem_values */
-   int         mark_elem;      /* index of marked element in elem_values */
     int         num_elems;      /* number of elems in current array value */
     Datum      *elem_values;    /* array of num_elems Datums */
  } BTArrayKeyInfo;
@@ -1037,14 +1045,11 @@ typedef struct BTScanOpaqueData
     ScanKey     keyData;        /* array of preprocessed scan keys */
  
     /* workspace for SK_SEARCHARRAY support */
-   ScanKey     arrayKeyData;   /* modified copy of scan->keyData */
-   bool        arraysStarted;  /* Started array keys, but have yet to "reach
-                                * past the end" of all arrays? */
-   int         numArrayKeys;   /* number of equality-type array keys (-1 if
-                                * there are any unsatisfiable array keys) */
-   int         arrayKeyCount;  /* count indicating number of array scan keys
-                                * processed */
+   int         numArrayKeys;   /* number of equality-type array keys */
+   bool        needPrimScan;   /* New prim scan to continue in current dir? */
+   bool        scanBehind;     /* Last array advancement matched -inf attr? */
     BTArrayKeyInfo *arrayKeys;  /* info about each equality-type array key */
+   FmgrInfo   *orderProcs;     /* ORDER procs for required equality keys */
     MemoryContext arrayContext; /* scan-lifespan context for array data */
  
     /* info about killed items if any (killedItems is NULL if never used) */
@@ -1075,6 +1080,42 @@ typedef struct BTScanOpaqueData
  
  typedef BTScanOpaqueData *BTScanOpaque;
  
+/*
+ * _bt_readpage state used across _bt_checkkeys calls for a page
+ */
+typedef struct BTReadPageState
+{
+   /* Input parameters, set by _bt_readpage for _bt_checkkeys */
+   ScanDirection dir;          /* current scan direction */
+   OffsetNumber minoff;        /* Lowest non-pivot tuple's offset */
+   OffsetNumber maxoff;        /* Highest non-pivot tuple's offset */
+   IndexTuple  finaltup;       /* Needed by scans with array keys */
+   BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
+   Page        page;           /* Page being read */
+
+   /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
+   OffsetNumber offnum;        /* current tuple's page offset number */
+
+   /* Output parameters, set by _bt_checkkeys for _bt_readpage */
+   OffsetNumber skip;          /* Array keys "look ahead" skip offnum */
+   bool        continuescan;   /* Continue ongoing (primitive) index scan? */
+
+   /*
+    * Input and output parameters, set and unset by both _bt_readpage and
+    * _bt_checkkeys to manage precheck optimizations
+    */
+   bool        prechecked;     /* precheck set continuescan to 'true'? */
+   bool        firstmatch;     /* at least one match so far?  */
+
+   /*
+    * Private _bt_checkkeys state used to manage "look ahead" optimization
+    * (only used during scans with array keys)
+    */
+   int16       rechecks;
+   int16       targetdistance;
+
+} BTReadPageState;
+
  /*
   * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
   * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
@@ -1128,7 +1169,7 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull,
                      bool indexUnchanged,
                      struct IndexInfo *indexInfo);
  extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
-extern Size btestimateparallelscan(void);
+extern Size btestimateparallelscan(int nkeys, int norderbys);
  extern void btinitparallelscan(void *target);
  extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
  extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
@@ -1149,10 +1190,12 @@ extern bool btcanreturn(Relation index, int attno);
  /*
   * prototypes for internal functions in nbtree.c
   */
-extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
+                              bool first);
  extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
  extern void _bt_parallel_done(IndexScanDesc scan);
-extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
+extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
+                                          BlockNumber prev_scan_page);
  
  /*
   * prototypes for functions in nbtdedup.c
@@ -1243,15 +1286,11 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
   */
  extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
  extern void _bt_freestack(BTStack stack);
-extern void _bt_preprocess_array_keys(IndexScanDesc scan);
+extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir);
  extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
-extern void _bt_mark_array_keys(IndexScanDesc scan);
-extern void _bt_restore_array_keys(IndexScanDesc scan);
  extern void _bt_preprocess_keys(IndexScanDesc scan);
-extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-                         int tupnatts, ScanDirection dir, bool *continuescan,
-                         bool requiredMatchedByPrecheck, bool haveFirstMatch);
+extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
+                         IndexTuple tuple, int tupnatts);
  extern void _bt_killitems(IndexScanDesc scan);
  extern BTCycleId _bt_vacuum_cycleid(Relation rel);
  extern BTCycleId _bt_start_vacuum(Relation rel);
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index 2fa4c4fc1b0d09f628af0bb3a9dd487fee0dbf15..f2563ad1cb3a8e23c826a0222d1d944c67ab294f 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -117,6 +117,9 @@ typedef struct VariableStatData
   * Callers should initialize all fields of GenericCosts to zero.  In addition,
   * they can set numIndexTuples to some positive value if they have a better
   * than default way of estimating the number of leaf index tuples visited.
+ * Similarly, they can set num_sa_scans to some value >= 1 for an index AM
+ * that doesn't necessarily perform exactly one primitive index scan per
+ * distinct combination of ScalarArrayOp array elements.
   */
  typedef struct
  {
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out

index 8311a03c3df984d51dfc47d3d14adf8c09aacf42..510646cbce71b01c4f6a84eea01ff45cad292656 100644 (file)
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -189,6 +189,58 @@ select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limi
        48 |      8
  (1 row)
  
+--
+-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants
+--
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Unique
+   ->  Index Only Scan using tenk1_hundred on tenk1
+         Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[]))
+(3 rows)
+
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+ hundred 
+---------
+      47
+      48
+      72
+      82
+(4 rows)
+
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Unique
+   ->  Index Only Scan Backward using tenk1_hundred on tenk1
+         Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[]))
+(3 rows)
+
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+ hundred 
+---------
+      82
+      72
+      48
+      47
+(4 rows)
+
+explain (costs off)
+select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000;
+                                      QUERY PLAN                                       
+---------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_thous_tenthous on tenk1
+   Index Cond: ((thousand = ANY ('{364,366,380}'::integer[])) AND (tenthous = 200000))
+(2 rows)
+
+select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000;
+ thousand 
+----------
+(0 rows)
+
  --
  -- Check correct optimization of LIKE (special index operator support)
  -- for both indexscan and bitmapscan cases
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out

index 70ab47a92f23ac8b1338bd5028a1feaec19e83d5..cf6eac57349a023f8c2e72144898a3382a8e8cae 100644 (file)
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -1698,6 +1698,12 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500;
       0
  (1 row)
  
+SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1);
+ count 
+-------
+     1
+(1 row)
+
  DROP INDEX onek_nulltest;
  CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 desc nulls last,unique1);
  SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL;
@@ -1910,7 +1916,7 @@ SELECT count(*) FROM dupindexcols
  (1 row)
  
  --
--- Check ordering of =ANY indexqual results (bug in 9.2.0)
+-- Check that index scans with =ANY indexquals return rows in index order
  --
  explain (costs off)
  SELECT unique1 FROM tenk1
@@ -1932,16 +1938,16 @@ ORDER BY unique1;
        42
  (3 rows)
  
+-- Non-required array scan key on "tenthous":
  explain (costs off)
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
  ORDER BY thousand;
-                      QUERY PLAN                       
--------------------------------------------------------
+                                   QUERY PLAN                                   
+--------------------------------------------------------------------------------
   Index Only Scan using tenk1_thous_tenthous on tenk1
-   Index Cond: (thousand < 2)
-   Filter: (tenthous = ANY ('{1001,3000}'::integer[]))
-(3 rows)
+   Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
+(2 rows)
  
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
@@ -1952,29 +1958,166 @@ ORDER BY thousand;
          1 |     1001
  (2 rows)
  
-SET enable_indexonlyscan = OFF;
+-- Non-required array scan key on "tenthous", backward scan:
  explain (costs off)
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
-                                      QUERY PLAN                                      
---------------------------------------------------------------------------------------
- Sort
-   Sort Key: thousand
-   ->  Index Scan using tenk1_thous_tenthous on tenk1
-         Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
-(4 rows)
+ORDER BY thousand DESC, tenthous DESC;
+                                   QUERY PLAN                                   
+--------------------------------------------------------------------------------
+ Index Only Scan Backward using tenk1_thous_tenthous on tenk1
+   Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
+(2 rows)
  
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
+ORDER BY thousand DESC, tenthous DESC;
   thousand | tenthous 
  ----------+----------
-        0 |     3000
          1 |     1001
+        0 |     3000
+(2 rows)
+
+--
+-- Check elimination of redundant and contradictory index quals
+--
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+                                             QUERY PLAN                                             
+----------------------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = ANY ('{7,8,9}'::integer[])))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+ unique1 
+---------
+       7
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+                                             QUERY PLAN                                             
+----------------------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{7,14,22}'::integer[])) AND (unique1 = ANY ('{33,44}'::bigint[])))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+ unique1 
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 1))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+ unique1 
+---------
+       1
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+                                  QUERY PLAN                                   
+-------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 12345))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+ unique1 
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+                                 QUERY PLAN                                  
+-----------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 >= 42))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+ unique1 
+---------
+      42
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+                                 QUERY PLAN                                 
+----------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 > 42))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+ unique1 
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 > 9996) AND (unique1 >= 9999))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+ unique1 
+---------
+    9999
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+                    QUERY PLAN                    
+--------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 < 3) AND (unique1 <= 3))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+ unique1 
+---------
+       0
+       1
+       2
+(3 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 < 3) AND (unique1 < '-1'::bigint))
  (2 rows)
  
-RESET enable_indexonlyscan;
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+ unique1 
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
+                                      QUERY PLAN                                      
+--------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+   Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 < '-1'::bigint))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
+ unique1 
+---------
+(0 rows)
+
  --
  -- Check elimination of constant-NULL subexpressions
  --
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out

index 63cddac0d633745a9f6c80a169ddfedf8af665f5..8b640c2fc2fc9186f463bd44aea59f6c528a0588 100644 (file)
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -8880,10 +8880,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]);
     Merge Cond: (j1.id1 = j2.id1)
     Join Filter: (j2.id2 = j1.id2)
     ->  Index Scan using j1_id1_idx on j1
-   ->  Index Only Scan using j2_pkey on j2
+   ->  Index Scan using j2_id1_idx on j2
           Index Cond: (id1 >= ANY ('{1,5}'::integer[]))
-         Filter: ((id1 % 1000) = 1)
-(7 rows)
+(6 rows)
  
  select * from j1
  inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out

index 4ffc5b4c563cd8219e0875d0fae3e404f5eaf52e..87273fa635ec12f31f72a3a83c14999178c0463b 100644 (file)
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -361,6 +361,7 @@ alter table tenk2 reset (parallel_workers);
  -- test parallel index scans.
  set enable_seqscan to off;
  set enable_bitmapscan to off;
+set random_page_cost = 2;
  explain (costs off)
     select  count((unique1)) from tenk1 where hundred > 1;
                               QUERY PLAN                             
@@ -379,6 +380,30 @@ select  count((unique1)) from tenk1 where hundred > 1;
    9800
  (1 row)
  
+-- Parallel ScalarArrayOp index scan
+explain (costs off)
+  select count((unique1)) from tenk1
+  where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Finalize Aggregate
+   InitPlan 1
+     ->  Aggregate
+           ->  Function Scan on generate_series i
+   ->  Gather
+         Workers Planned: 4
+         ->  Partial Aggregate
+               ->  Parallel Index Scan using tenk1_hundred on tenk1
+                     Index Cond: (hundred = ANY ((InitPlan 1).col1))
+(9 rows)
+
+select count((unique1)) from tenk1
+where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+ count 
+-------
+   700
+(1 row)
+
  -- test parallel index-only scans.
  explain (costs off)
     select  count(*) from tenk1 where thousand > 95;
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql

index ef8435423472d6ff4100a0d73ad1e21d75fbd113..0d2a33f37053cef22a42909d28b1d310086a10f1 100644 (file)
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -135,6 +135,21 @@ explain (costs off)
  select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
  select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
  
+--
+-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants
+--
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+
+explain (costs off)
+select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000;
+select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000;
+
  --
  -- Check correct optimization of LIKE (special index operator support)
  -- for both indexscan and bitmapscan cases
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql

index d49ce9f3007a87a1e123d911234dba3260014f28..e296891cab8064771d46959bc3570da35ceae484 100644 (file)
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -668,6 +668,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL;
  SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL;
  SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500;
  SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500;
+SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1);
  
  DROP INDEX onek_nulltest;
  
@@ -753,7 +754,7 @@ SELECT count(*) FROM dupindexcols
    WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX';
  
  --
--- Check ordering of =ANY indexqual results (bug in 9.2.0)
+-- Check that index scans with =ANY indexquals return rows in index order
  --
  
  explain (costs off)
@@ -765,6 +766,7 @@ SELECT unique1 FROM tenk1
  WHERE unique1 IN (1,42,7)
  ORDER BY unique1;
  
+-- Non-required array scan key on "tenthous":
  explain (costs off)
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
@@ -774,18 +776,68 @@ SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
  ORDER BY thousand;
  
-SET enable_indexonlyscan = OFF;
-
+-- Non-required array scan key on "tenthous", backward scan:
  explain (costs off)
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
+ORDER BY thousand DESC, tenthous DESC;
  
  SELECT thousand, tenthous FROM tenk1
  WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
+ORDER BY thousand DESC, tenthous DESC;
+
+--
+-- Check elimination of redundant and contradictory index quals
+--
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
  
-RESET enable_indexonlyscan;
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
  
  --
  -- Check elimination of constant-NULL subexpressions
diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql

index c43a5b211914bc0f2b6b9ba97b30a72e10d82ee5..20376c03fae2ea9f8a7f47f0c24843de771ad396 100644 (file)
--- a/src/test/regress/sql/select_parallel.sql
+++ b/src/test/regress/sql/select_parallel.sql
@@ -137,11 +137,19 @@ alter table tenk2 reset (parallel_workers);
  -- test parallel index scans.
  set enable_seqscan to off;
  set enable_bitmapscan to off;
+set random_page_cost = 2;
  
  explain (costs off)
     select  count((unique1)) from tenk1 where hundred > 1;
  select  count((unique1)) from tenk1 where hundred > 1;
  
+-- Parallel ScalarArrayOp index scan
+explain (costs off)
+  select count((unique1)) from tenk1
+  where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+select count((unique1)) from tenk1
+where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+
  -- test parallel index-only scans.
  explain (costs off)
     select  count(*) from tenk1 where thousand > 95;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index 01845ee71d5dbbf2639ab0eb3f3774192e31e86b..f87e8b80ec071e01bce06bcb4718b5ce10a3c3ec 100644 (file)
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -208,8 +208,10 @@ BTPageStat
  BTPageState
  BTParallelScanDesc
  BTPendingFSM
+BTReadPageState
  BTScanInsert
  BTScanInsertData
+BTScanKeyPreproc
  BTScanOpaque
  BTScanOpaqueData
  BTScanPos
author	Peter Geoghegan <[email protected]>
	Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
committer	Peter Geoghegan <[email protected]>
	Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
doc/src/sgml/indexam.sgml		patch \| blob \| blame \| history
doc/src/sgml/monitoring.sgml		patch \| blob \| blame \| history
src/backend/access/index/indexam.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtree.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsearch.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtutils.c		patch \| blob \| blame \| history
src/backend/executor/nodeIndexonlyscan.c		patch \| blob \| blame \| history
src/backend/executor/nodeIndexscan.c		patch \| blob \| blame \| history
src/backend/optimizer/path/indxpath.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/access/amapi.h		patch \| blob \| blame \| history
src/include/access/genam.h		patch \| blob \| blame \| history
src/include/access/nbtree.h		patch \| blob \| blame \| history
src/include/utils/selfuncs.h		patch \| blob \| blame \| history
src/test/regress/expected/btree_index.out		patch \| blob \| blame \| history
src/test/regress/expected/create_index.out		patch \| blob \| blame \| history
src/test/regress/expected/join.out		patch \| blob \| blame \| history
src/test/regress/expected/select_parallel.out		patch \| blob \| blame \| history
src/test/regress/sql/btree_index.sql		patch \| blob \| blame \| history
src/test/regress/sql/create_index.sql		patch \| blob \| blame \| history
src/test/regress/sql/select_parallel.sql		patch \| blob \| blame \| history
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history