Teach estimate_array_length() to use statistics where available.
author: Tom Lane <[email protected]>
Thu, 4 Jan 2024 23:36:19 +0000 (18:36 -0500)
committer: Tom Lane <[email protected]>
Thu, 4 Jan 2024 23:36:19 +0000 (18:36 -0500)
If we have DECHIST statistics about the argument expression, use
the average number of distinct elements as the array length estimate.
(It'd be better to use the average total number of elements, but
that is not currently calculated by compute_array_stats(), and
it's unclear that it'd be worth extra effort to get.)

To do this, we have to change the signature of estimate_array_length
to pass the "root" pointer.  While at it, also change its result
type to "double".  That's probably not really necessary, but it
avoids any risk of overflow of the value extracted from DECHIST.
All existing callers are going to use the result in a "double"
calculation anyway.

Paul Jungwirth, reviewed by Jian He and myself

Discussion: https://postgr.es/m/CA+renyUnM2d+SmrxKpDuAdpiq6FOM=FByvi6aS6yi__qyf6j9A@mail.gmail.com

src/backend/optimizer/path/costsize.c
src/backend/utils/adt/arrayfuncs.c
src/backend/utils/adt/selfuncs.c
src/include/utils/selfuncs.h

index 7cfebc95d69060a221e62c35826164a9bcefb92c..8b76e985296c5f983db2e68c79ed74fc49ef7a1a 100644 (file)
@@ -1256,7 +1256,7 @@ cost_tidscan(Path *path, PlannerInfo *root,
        QualCost        qpqual_cost;
        Cost            cpu_per_tuple;
        QualCost        tid_qual_cost;
-       int                     ntuples;
+       double          ntuples;
        ListCell   *l;
        double          spc_random_page_cost;
 
@@ -1283,7 +1283,7 @@ cost_tidscan(Path *path, PlannerInfo *root,
                        ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
                        Node       *arraynode = (Node *) lsecond(saop->args);
 
-                       ntuples += estimate_array_length(arraynode);
+                       ntuples += estimate_array_length(root, arraynode);
                }
                else if (IsA(qual, CurrentOfExpr))
                {
@@ -4770,7 +4770,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
                Node       *arraynode = (Node *) lsecond(saop->args);
                QualCost        sacosts;
                QualCost        hcosts;
-               int                     estarraylen = estimate_array_length(arraynode);
+               double          estarraylen = estimate_array_length(context->root, arraynode);
 
                set_sa_opfuncid(saop);
                sacosts.startup = sacosts.per_tuple = 0;
@@ -4808,7 +4808,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
                         */
                        context->total.startup += sacosts.startup;
                        context->total.per_tuple += sacosts.per_tuple *
-                               estimate_array_length(arraynode) * 0.5;
+                               estimate_array_length(context->root, arraynode) * 0.5;
                }
        }
        else if (IsA(node, Aggref) ||
@@ -4859,7 +4859,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
                context->total.startup += perelemcost.startup;
                if (perelemcost.per_tuple > 0)
                        context->total.per_tuple += perelemcost.per_tuple *
-                               estimate_array_length((Node *) acoerce->arg);
+                               estimate_array_length(context->root, (Node *) acoerce->arg);
        }
        else if (IsA(node, RowCompareExpr))
        {
index 957d21a0a0d7c496851a748f25da11b2527168ab..f3fee54e37060981976007cd988de0969bf13e27 100644 (file)
@@ -6340,7 +6340,7 @@ array_unnest_support(PG_FUNCTION_ARGS)
                        /* We can use estimated argument values here */
                        arg1 = estimate_expression_value(req->root, linitial(args));
 
-                       req->rows = estimate_array_length(arg1);
+                       req->rows = estimate_array_length(req->root, arg1);
                        ret = (Node *) req;
                }
        }
index 7a3f69f2d9ddf32a386c67cdd65d1f237c8d8096..dbcd98d9851665336ce80ad8bd6e45f551734784 100644 (file)
@@ -2128,10 +2128,11 @@ scalararraysel(PlannerInfo *root,
 /*
  * Estimate number of elements in the array yielded by an expression.
  *
- * It's important that this agree with scalararraysel.
+ * Note: the result is integral, but we use "double" to avoid overflow
+ * concerns.  Most callers will use it in double-type expressions anyway.
  */
-int
-estimate_array_length(Node *arrayexpr)
+double
+estimate_array_length(PlannerInfo *root, Node *arrayexpr)
 {
        /* look through any binary-compatible relabeling of arrayexpr */
        arrayexpr = strip_array_coercion(arrayexpr);
@@ -2152,11 +2153,39 @@ estimate_array_length(Node *arrayexpr)
        {
                return list_length(((ArrayExpr *) arrayexpr)->elements);
        }
-       else
+       else if (arrayexpr)
        {
-               /* default guess --- see also scalararraysel */
-               return 10;
+               /* See if we can find any statistics about it */
+               VariableStatData vardata;
+               AttStatsSlot sslot;
+               double          nelem = 0;
+
+               examine_variable(root, arrayexpr, 0, &vardata);
+               if (HeapTupleIsValid(vardata.statsTuple))
+               {
+                       /*
+                        * Found stats, so use the average element count, which is stored
+                        * in the last stanumbers element of the DECHIST statistics.
+                        * Actually that is the average count of *distinct* elements;
+                        * perhaps we should scale it up somewhat?
+                        */
+                       if (get_attstatsslot(&sslot, vardata.statsTuple,
+                                                                STATISTIC_KIND_DECHIST, InvalidOid,
+                                                                ATTSTATSSLOT_NUMBERS))
+                       {
+                               if (sslot.nnumbers > 0)
+                                       nelem = clamp_row_est(sslot.numbers[sslot.nnumbers - 1]);
+                               free_attstatsslot(&sslot);
+                       }
+               }
+               ReleaseVariableStats(vardata);
+
+               if (nelem > 0)
+                       return nelem;
        }
+
+       /* Else use a default guess --- this should match scalararraysel */
+       return 10;
 }
 
 /*
@@ -6540,7 +6569,7 @@ genericcostestimate(PlannerInfo *root,
                if (IsA(rinfo->clause, ScalarArrayOpExpr))
                {
                        ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
-                       int                     alength = estimate_array_length(lsecond(saop->args));
+                       double          alength = estimate_array_length(root, lsecond(saop->args));
 
                        if (alength > 1)
                                num_sa_scans *= alength;
@@ -6820,7 +6849,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                        {
                                ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
                                Node       *other_operand = (Node *) lsecond(saop->args);
-                               int                     alength = estimate_array_length(other_operand);
+                               double          alength = estimate_array_length(root, other_operand);
 
                                clause_op = saop->opno;
                                found_saop = true;
@@ -7414,7 +7443,7 @@ gincost_scalararrayopexpr(PlannerInfo *root,
        {
                counts->exactEntries++;
                counts->searchEntries++;
-               counts->arrayScans *= estimate_array_length(rightop);
+               counts->arrayScans *= estimate_array_length(root, rightop);
                return true;
        }
 
index 6dd5171d548c1967948f471662db3e1f61db9dbd..2fa4c4fc1b0d09f628af0bb3a9dd487fee0dbf15 100644 (file)
@@ -200,7 +200,7 @@ extern Selectivity scalararraysel(PlannerInfo *root,
                                                                  ScalarArrayOpExpr *clause,
                                                                  bool is_join_clause,
                                                                  int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);
-extern int     estimate_array_length(Node *arrayexpr);
+extern double estimate_array_length(PlannerInfo *root, Node *arrayexpr);
 extern Selectivity rowcomparesel(PlannerInfo *root,
                                                                 RowCompareExpr *clause,
                                                                 int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);