Improve planner's estimates of tuple hash table sizes.

author Tom Lane <[email protected]>

Sun, 2 Nov 2025 21:57:26 +0000 (16:57 -0500)

committer Tom Lane <[email protected]>

Sun, 2 Nov 2025 21:57:26 +0000 (16:57 -0500)
author Tom Lane <[email protected]>
Sun, 2 Nov 2025 21:57:26 +0000 (16:57 -0500)
committer Tom Lane <[email protected]>
Sun, 2 Nov 2025 21:57:26 +0000 (16:57 -0500)
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c

index b4bdaa3c3056f408d933aff478ec7f203bc05d5b..e1a3a813dd9b86c554e08e2900d3b9ef23071242 100644 (file)
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -14,6 +14,7 @@
   */
  #include "postgres.h"
  
+#include "access/htup_details.h"
  #include "access/parallel.h"
  #include "common/hashfn.h"
  #include "executor/executor.h"
@@ -302,6 +303,64 @@ ResetTupleHashTable(TupleHashTable hashtable)
     MemoryContextReset(hashtable->tuplescxt);
  }
  
+/*
+ * Estimate the amount of space needed for a TupleHashTable with nentries
+ * entries, if the tuples have average data width tupleWidth and the caller
+ * requires additionalsize extra space per entry.
+ *
+ * Return SIZE_MAX if it'd overflow size_t.
+ *
+ * nentries is "double" because this is meant for use by the planner,
+ * which typically works with double rowcount estimates.  So we'd need to
+ * clamp to integer somewhere and that might as well be here.  We do expect
+ * the value not to be NaN or negative, else the result will be garbage.
+ */
+Size
+EstimateTupleHashTableSpace(double nentries,
+                           Size tupleWidth,
+                           Size additionalsize)
+{
+   Size        sh_space;
+   double      tuples_space;
+
+   /* First estimate the space needed for the simplehash table */
+   sh_space = tuplehash_estimate_space(nentries);
+
+   /* Give up if that's already too big */
+   if (sh_space >= SIZE_MAX)
+       return sh_space;
+
+   /*
+    * Compute space needed for hashed tuples with additional data.  nentries
+    * must be somewhat sane, so it should be safe to compute this product.
+    *
+    * We assume that the hashed tuples will be kept in a BumpContext so that
+    * there is no additional per-tuple overhead.
+    *
+    * (Note that this is only accurate if MEMORY_CONTEXT_CHECKING is off,
+    * else bump.c will add a MemoryChunk header to each tuple.  However, it
+    * seems undesirable for debug builds to make different planning choices
+    * than production builds, so we assume the production behavior always.)
+    */
+   tuples_space = nentries * (MAXALIGN(SizeofMinimalTupleHeader) +
+                              MAXALIGN(tupleWidth) +
+                              MAXALIGN(additionalsize));
+
+   /*
+    * Check for size_t overflow.  This coding is trickier than it may appear,
+    * because on 64-bit machines SIZE_MAX cannot be represented exactly as a
+    * double.  We must cast it explicitly to suppress compiler warnings about
+    * an inexact conversion, and we must trust that any double value that
+    * compares strictly less than "(double) SIZE_MAX" will cast to a
+    * representable size_t value.
+    */
+   if (sh_space + tuples_space >= (double) SIZE_MAX)
+       return SIZE_MAX;
+
+   /* We don't bother estimating size of the miscellaneous overhead data */
+   return (Size) (sh_space + tuples_space);
+}
+
  /*
   * Find or create a hashtable entry for the tuple group containing the
   * given tuple.  The tuple must be the same type as the hashtable entries.
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c

index 7b223a7ca3ae9792b4ae635eafccfdd6a60d08b0..5aabed18a09e80c672f44f78c659ed248150688b 100644 (file)
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -111,6 +111,15 @@ build_hash_table(SetOpState *setopstate)
                                                 false);
  }
  
+/* Planner support routine to estimate space needed for hash table */
+Size
+EstimateSetOpHashTableSpace(double nentries, Size tupleWidth)
+{
+   return EstimateTupleHashTableSpace(nentries,
+                                      tupleWidth,
+                                      sizeof(SetOpStatePerGroupData));
+}
+
  /*
   * We've completed processing a tuple group.  Decide how many copies (if any)
   * of its representative row to emit, and store the count into numOutput.
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c

index 9f6e45bcb0bac9b4fba99afd680455b259f6a917..1cd0988bb496572beb1015fd474dfdc43a7fc9f1 100644 (file)
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -525,7 +525,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
                                               node->tab_hash_funcs,
                                               node->tab_collations,
                                               nbuckets,
-                                             0,
+                                             0,    /* no additional data */
                                               node->planstate->state->es_query_cxt,
                                               node->tuplesContext,
                                               innerecontext->ecxt_per_tuple_memory,
@@ -554,7 +554,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
                                                   node->tab_hash_funcs,
                                                   node->tab_collations,
                                                   nbuckets,
-                                                 0,
+                                                 0,    /* no additional data */
                                                   node->planstate->state->es_query_cxt,
                                                   node->tuplesContext,
                                                   innerecontext->ecxt_per_tuple_memory,
@@ -636,6 +636,55 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
     MemoryContextSwitchTo(oldcontext);
  }
  
+/* Planner support routine to estimate space needed for hash table(s) */
+Size
+EstimateSubplanHashTableSpace(double nentries,
+                             Size tupleWidth,
+                             bool unknownEqFalse)
+{
+   Size        tab1space,
+               tab2space;
+
+   /* Estimate size of main hashtable */
+   tab1space = EstimateTupleHashTableSpace(nentries,
+                                           tupleWidth,
+                                           0 /* no additional data */ );
+
+   /* Give up if that's already too big */
+   if (tab1space >= SIZE_MAX)
+       return tab1space;
+
+   /* Done if we don't need a hashnulls table */
+   if (unknownEqFalse)
+       return tab1space;
+
+   /*
+    * Adjust the rowcount estimate in the same way that buildSubPlanHash
+    * will, except that we don't bother with the special case for a single
+    * hash column.  (We skip that detail because it'd be notationally painful
+    * for our caller to provide the column count, and this table has
+    * relatively little impact on the total estimate anyway.)
+    */
+   nentries /= 16;
+   if (nentries < 1)
+       nentries = 1;
+
+   /*
+    * It might be sane to also reduce the tupleWidth, but on the other hand
+    * we are not accounting for the space taken by the tuples' null bitmaps.
+    * Leave it alone for now.
+    */
+   tab2space = EstimateTupleHashTableSpace(nentries,
+                                           tupleWidth,
+                                           0 /* no additional data */ );
+
+   /* Guard against overflow */
+   if (tab2space >= SIZE_MAX - tab1space)
+       return SIZE_MAX;
+
+   return tab1space + tab2space;
+}
+
  /*
   * execTuplesUnequal
   *     Return true if two tuples are definitely unequal in the indicated
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c

index 14192a1323655e9162747e9eaa84bdcfb8dc7e8e..ff63d20f8d536c31e0f0cae5634deace32c5b956 100644 (file)
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -20,6 +20,7 @@
  #include "catalog/pg_operator.h"
  #include "catalog/pg_type.h"
  #include "executor/executor.h"
+#include "executor/nodeSubplan.h"
  #include "miscadmin.h"
  #include "nodes/makefuncs.h"
  #include "nodes/nodeFuncs.h"
@@ -79,8 +80,8 @@ static Node *convert_testexpr(PlannerInfo *root,
                               List *subst_nodes);
  static Node *convert_testexpr_mutator(Node *node,
                                       convert_testexpr_context *context);
-static bool subplan_is_hashable(Plan *plan);
-static bool subpath_is_hashable(Path *path);
+static bool subplan_is_hashable(Plan *plan, bool unknownEqFalse);
+static bool subpath_is_hashable(Path *path, bool unknownEqFalse);
  static bool testexpr_is_hashable(Node *testexpr, List *param_ids);
  static bool test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids);
  static bool hash_ok_operator(OpExpr *expr);
@@ -283,7 +284,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
             best_path = final_rel->cheapest_total_path;
  
             /* Now we can check if it'll fit in hash_mem */
-           if (subpath_is_hashable(best_path))
+           if (subpath_is_hashable(best_path, true))
             {
                 SubPlan    *hashplan;
                 AlternativeSubPlan *asplan;
@@ -524,7 +525,7 @@ build_subplan(PlannerInfo *root, Plan *plan, Path *path,
          */
         if (subLinkType == ANY_SUBLINK &&
             splan->parParam == NIL &&
-           subplan_is_hashable(plan) &&
+           subplan_is_hashable(plan, unknownEqFalse) &&
             testexpr_is_hashable(splan->testexpr, splan->paramIds))
             splan->useHashTable = true;
  
@@ -711,19 +712,19 @@ convert_testexpr_mutator(Node *node,
   * is suitable for hashing.  We only look at the subquery itself.
   */
  static bool
-subplan_is_hashable(Plan *plan)
+subplan_is_hashable(Plan *plan, bool unknownEqFalse)
  {
-   double      subquery_size;
+   Size        hashtablesize;
  
     /*
-    * The estimated size of the subquery result must fit in hash_mem. (Note:
-    * we use heap tuple overhead here even though the tuples will actually be
-    * stored as MinimalTuples; this provides some fudge factor for hashtable
-    * overhead.)
+    * The estimated size of the hashtable holding the subquery result must
+    * fit in hash_mem.  (Note: reject on equality, to ensure that an estimate
+    * of SIZE_MAX disables hashing regardless of the hash_mem limit.)
      */
-   subquery_size = plan->plan_rows *
-       (MAXALIGN(plan->plan_width) + MAXALIGN(SizeofHeapTupleHeader));
-   if (subquery_size > get_hash_memory_limit())
+   hashtablesize = EstimateSubplanHashTableSpace(plan->plan_rows,
+                                                 plan->plan_width,
+                                                 unknownEqFalse);
+   if (hashtablesize >= get_hash_memory_limit())
         return false;
  
     return true;
@@ -735,19 +736,19 @@ subplan_is_hashable(Plan *plan)
   * Identical to subplan_is_hashable, but work from a Path for the subplan.
   */
  static bool
-subpath_is_hashable(Path *path)
+subpath_is_hashable(Path *path, bool unknownEqFalse)
  {
-   double      subquery_size;
+   Size        hashtablesize;
  
     /*
-    * The estimated size of the subquery result must fit in hash_mem. (Note:
-    * we use heap tuple overhead here even though the tuples will actually be
-    * stored as MinimalTuples; this provides some fudge factor for hashtable
-    * overhead.)
+    * The estimated size of the hashtable holding the subquery result must
+    * fit in hash_mem.  (Note: reject on equality, to ensure that an estimate
+    * of SIZE_MAX disables hashing regardless of the hash_mem limit.)
      */
-   subquery_size = path->rows *
-       (MAXALIGN(path->pathtarget->width) + MAXALIGN(SizeofHeapTupleHeader));
-   if (subquery_size > get_hash_memory_limit())
+   hashtablesize = EstimateSubplanHashTableSpace(path->rows,
+                                                 path->pathtarget->width,
+                                                 unknownEqFalse);
+   if (hashtablesize >= get_hash_memory_limit())
         return false;
  
     return true;
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c

index 44ac5312edda4d89698dfe9fdc26a745cbc168de..e4fd6950fad1d12cbe5625c9155166d0d84ecf77 100644 (file)
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -17,6 +17,7 @@
  #include <math.h>
  
  #include "access/htup_details.h"
+#include "executor/nodeSetOp.h"
  #include "foreign/fdwapi.h"
  #include "miscadmin.h"
  #include "nodes/extensible.h"
@@ -3461,7 +3462,7 @@ create_setop_path(PlannerInfo *root,
     }
     else
     {
-       Size        hashentrysize;
+       Size        hashtablesize;
  
         /*
          * In hashed mode, we must read all the input before we can emit
@@ -3490,11 +3491,12 @@ create_setop_path(PlannerInfo *root,
  
         /*
          * Also disable if it doesn't look like the hashtable will fit into
-        * hash_mem.
+        * hash_mem.  (Note: reject on equality, to ensure that an estimate of
+        * SIZE_MAX disables hashing regardless of the hash_mem limit.)
          */
-       hashentrysize = MAXALIGN(leftpath->pathtarget->width) +
-           MAXALIGN(SizeofMinimalTupleHeader);
-       if (hashentrysize * numGroups > get_hash_memory_limit())
+       hashtablesize = EstimateSetOpHashTableSpace(numGroups,
+                                                   leftpath->pathtarget->width);
+       if (hashtablesize >= get_hash_memory_limit())
             pathnode->path.disabled_nodes++;
     }
     pathnode->path.rows = outputRows;
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h

index 8e7a5453064396905cdd2b8c3785b910647ebd41..086f52cff3d8685cd70e3f2a44425feeaf785765 100644 (file)
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -157,6 +157,9 @@ extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable,
                                          ExprState *eqcomp,
                                          ExprState *hashexpr);
  extern void ResetTupleHashTable(TupleHashTable hashtable);
+extern Size EstimateTupleHashTableSpace(double nentries,
+                                       Size tupleWidth,
+                                       Size additionalsize);
  
  #ifndef FRONTEND
  /*
diff --git a/src/include/executor/nodeSetOp.h b/src/include/executor/nodeSetOp.h

index 024c6ba1fceb80cf8abbb0bae8abfc2001f62943..302936df8be437a23a427adc31a5caded7010b14 100644 (file)
--- a/src/include/executor/nodeSetOp.h
+++ b/src/include/executor/nodeSetOp.h
@@ -20,4 +20,6 @@ extern SetOpState *ExecInitSetOp(SetOp *node, EState *estate, int eflags);
  extern void ExecEndSetOp(SetOpState *node);
  extern void ExecReScanSetOp(SetOpState *node);
  
+extern Size EstimateSetOpHashTableSpace(double nentries, Size tupleWidth);
+
  #endif                         /* NODESETOP_H */
diff --git a/src/include/executor/nodeSubplan.h b/src/include/executor/nodeSubplan.h

index a1cafbcc694d1eece1cb294772bae0b0c7c5e9a6..301c29d1f24693ea389e37dcb5db7390c2c2d6f6 100644 (file)
--- a/src/include/executor/nodeSubplan.h
+++ b/src/include/executor/nodeSubplan.h
@@ -20,6 +20,10 @@ extern SubPlanState *ExecInitSubPlan(SubPlan *subplan, PlanState *parent);
  
  extern Datum ExecSubPlan(SubPlanState *node, ExprContext *econtext, bool *isNull);
  
+extern Size EstimateSubplanHashTableSpace(double nentries,
+                                         Size tupleWidth,
+                                         bool unknownEqFalse);
+
  extern void ExecReScanSetParamPlan(SubPlanState *node, PlanState *parent);
  
  extern void ExecSetParamPlan(SubPlanState *node, ExprContext *econtext);
diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h

index 9622131ede6811d9c0b8cb723a6587524735f79c..031a377da846c6a48df1400c584b4d760bcbb074 100644 (file)
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -125,6 +125,7 @@
  #define SH_ITERATE SH_MAKE_NAME(iterate)
  #define SH_ALLOCATE SH_MAKE_NAME(allocate)
  #define SH_FREE SH_MAKE_NAME(free)
+#define SH_ESTIMATE_SPACE SH_MAKE_NAME(estimate_space)
  #define SH_STAT SH_MAKE_NAME(stat)
  
  /* internal helper functions (no externally visible prototypes) */
@@ -242,7 +243,10 @@ SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);
  /* <element> *<prefix>_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
  SH_SCOPE   SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
  
-/* void <prefix>_stat(<prefix>_hash *tb */
+/* size_t <prefix>_estimate_space(double nentries) */
+SH_SCOPE size_t SH_ESTIMATE_SPACE(double nentries);
+
+/* void <prefix>_stat(<prefix>_hash *tb) */
  SH_SCOPE void SH_STAT(SH_TYPE * tb);
  
  #endif                         /* SH_DECLARE */
@@ -305,7 +309,7 @@ SH_SCOPE void SH_STAT(SH_TYPE * tb);
  
  /*
   * Compute allocation size for hashtable. Result can be passed to
- * SH_UPDATE_PARAMETERS.
+ * SH_UPDATE_PARAMETERS.  (Keep SH_ESTIMATE_SPACE in sync with this!)
   */
  static inline uint64
  SH_COMPUTE_SIZE(uint64 newsize)
@@ -1068,6 +1072,47 @@ SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
     return NULL;
  }
  
+/*
+ * Estimate the amount of space needed for a hashtable with nentries entries.
+ * Return SIZE_MAX if that's too many entries.
+ *
+ * nentries is "double" because this is meant for use by the planner,
+ * which typically works with double rowcount estimates.  So we'd need to
+ * clamp to integer somewhere and that might as well be here.  We do expect
+ * the value not to be NaN or negative, else the result will be garbage.
+ */
+SH_SCOPE size_t
+SH_ESTIMATE_SPACE(double nentries)
+{
+   uint64      size;
+   uint64      space;
+
+   /* scale request by SH_FILLFACTOR, as SH_CREATE does */
+   nentries = nentries / SH_FILLFACTOR;
+
+   /* fail if we'd overrun SH_MAX_SIZE entries */
+   if (nentries >= SH_MAX_SIZE)
+       return SIZE_MAX;
+
+   /* should be safe to convert to uint64 */
+   size = (uint64) nentries;
+
+   /* supporting zero sized hashes would complicate matters */
+   size = Max(size, 2);
+
+   /* round up size to the next power of 2, that's how bucketing works */
+   size = pg_nextpower2_64(size);
+
+   /* calculate space needed for ->data */
+   space = ((uint64) sizeof(SH_ELEMENT_TYPE)) * size;
+
+   /* verify that allocation of ->data is possible on this platform */
+   if (space >= SIZE_MAX / 2)
+       return SIZE_MAX;
+
+   return (size_t) space + sizeof(SH_TYPE);
+}
+
  /*
   * Report some statistics about the state of the hashtable. For
   * debugging/profiling purposes only.
@@ -1195,6 +1240,7 @@ SH_STAT(SH_TYPE * tb)
  #undef SH_ITERATE
  #undef SH_ALLOCATE
  #undef SH_FREE
+#undef SH_ESTIMATE_SPACE
  #undef SH_STAT
  
  /* internal function names */
author	Tom Lane <[email protected]>
	Sun, 2 Nov 2025 21:57:26 +0000 (16:57 -0500)
committer	Tom Lane <[email protected]>
	Sun, 2 Nov 2025 21:57:26 +0000 (16:57 -0500)
src/backend/executor/execGrouping.c		patch \| blob \| blame \| history
src/backend/executor/nodeSetOp.c		patch \| blob \| blame \| history
src/backend/executor/nodeSubplan.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/subselect.c		patch \| blob \| blame \| history
src/backend/optimizer/util/pathnode.c		patch \| blob \| blame \| history
src/include/executor/executor.h		patch \| blob \| blame \| history
src/include/executor/nodeSetOp.h		patch \| blob \| blame \| history
src/include/executor/nodeSubplan.h		patch \| blob \| blame \| history
src/include/lib/simplehash.h		patch \| blob \| blame \| history