Use Limit instead of Unique to implement DISTINCT, when possible

author David Rowley <[email protected]>

Fri, 28 Oct 2022 10:04:38 +0000 (23:04 +1300)

committer David Rowley <[email protected]>

Fri, 28 Oct 2022 10:04:38 +0000 (23:04 +1300)
author David Rowley <[email protected]>
Fri, 28 Oct 2022 10:04:38 +0000 (23:04 +1300)
committer David Rowley <[email protected]>
Fri, 28 Oct 2022 10:04:38 +0000 (23:04 +1300)
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index 78a81745348d5d86e3c423b416e432985e1aecb9..493a3af0fa3331b2a24af312fa5f5c0b3722e7a7 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4780,11 +4780,46 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
  
             if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
             {
-               add_path(distinct_rel, (Path *)
-                        create_upper_unique_path(root, distinct_rel,
-                                                 path,
-                                                 list_length(root->distinct_pathkeys),
-                                                 numDistinctRows));
+               /*
+                * distinct_pathkeys may have become empty if all of the
+                * pathkeys were determined to be redundant.  If all of the
+                * pathkeys are redundant then each DISTINCT target must only
+                * allow a single value, therefore all resulting tuples must
+                * be identical (or at least indistinguishable by an equality
+                * check).  We can uniquify these tuples simply by just taking
+                * the first tuple.  All we do here is add a path to do "LIMIT
+                * 1" atop of 'path'.  When doing a DISTINCT ON we may still
+                * have a non-NIL sort_pathkeys list, so we must still only do
+                * this with paths which are correctly sorted by
+                * sort_pathkeys.
+                */
+               if (root->distinct_pathkeys == NIL)
+               {
+                   Node       *limitCount;
+
+                   limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+                                                   sizeof(int64),
+                                                   Int64GetDatum(1), false,
+                                                   FLOAT8PASSBYVAL);
+
+                   /*
+                    * If the query already has a LIMIT clause, then we could
+                    * end up with a duplicate LimitPath in the final plan.
+                    * That does not seem worth troubling over too much.
+                    */
+                   add_path(distinct_rel, (Path *)
+                            create_limit_path(root, distinct_rel, path, NULL,
+                                              limitCount, LIMIT_OPTION_COUNT,
+                                              0, 1));
+               }
+               else
+               {
+                   add_path(distinct_rel, (Path *)
+                            create_upper_unique_path(root, distinct_rel,
+                                                     path,
+                                                     list_length(root->distinct_pathkeys),
+                                                     numDistinctRows));
+               }
             }
         }
  
@@ -4805,13 +4840,33 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
             path = (Path *) create_sort_path(root, distinct_rel,
                                              path,
                                              needed_pathkeys,
-                                            -1.0);
+                                            root->distinct_pathkeys == NIL ?
+                                            1.0 : -1.0);
  
-       add_path(distinct_rel, (Path *)
-                create_upper_unique_path(root, distinct_rel,
-                                         path,
-                                         list_length(root->distinct_pathkeys),
-                                         numDistinctRows));
+       /*
+        * As above, use a LimitPath instead of a UniquePath when all of the
+        * distinct_pathkeys are redundant and we're only going to get a
+        * series of tuples all with the same values anyway.
+        */
+       if (root->distinct_pathkeys == NIL)
+       {
+           Node       *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+                                                       sizeof(int64),
+                                                       Int64GetDatum(1), false,
+                                                       FLOAT8PASSBYVAL);
+
+           add_path(distinct_rel, (Path *)
+                    create_limit_path(root, distinct_rel, path, NULL,
+                                      limitCount, LIMIT_OPTION_COUNT, 0, 1));
+       }
+       else
+       {
+           add_path(distinct_rel, (Path *)
+                    create_upper_unique_path(root, distinct_rel,
+                                             path,
+                                             list_length(root->distinct_pathkeys),
+                                             numDistinctRows));
+       }
     }
  
     /*
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out

index 748419cee05fb9b2651da7002f70b7bdd1941606..6ce889d87c1d2f4cf2bc69196bbae07b2ac8dcbb 100644 (file)
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -279,6 +279,60 @@ RESET max_parallel_workers_per_gather;
  RESET min_parallel_table_scan_size;
  RESET parallel_setup_cost;
  RESET parallel_tuple_cost;
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+         QUERY PLAN         
+----------------------------
+ Limit
+   ->  Seq Scan on tenk1
+         Filter: (four = 0)
+(3 rows)
+
+-- Ensure the above gives us the correct result
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+ four 
+------
+    0
+(1 row)
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+                 QUERY PLAN                  
+---------------------------------------------
+ Limit
+   ->  Seq Scan on tenk1
+         Filter: ((two <> 0) AND (four = 0))
+(3 rows)
+
+-- Ensure no rows are returned
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+ four 
+------
+(0 rows)
+
+-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+         QUERY PLAN         
+----------------------------
+ Limit
+   ->  Seq Scan on tenk1
+         Filter: (four = 0)
+(3 rows)
+
+-- Ensure we only get 1 row
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+ four | ?column? | ?column? | ?column? 
+------+----------+----------+----------
+    0 |        1 |        2 |        3
+(1 row)
+
  --
  -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
  -- very own regression file.
diff --git a/src/test/regress/expected/select_distinct_on.out b/src/test/regress/expected/select_distinct_on.out

index 3d989909912eeb8a9634da2a1e145b854d6c7e7d..b2978c1114a093e7b5013fa933c961edec584faa 100644 (file)
--- a/src/test/regress/expected/select_distinct_on.out
+++ b/src/test/regress/expected/select_distinct_on.out
@@ -73,3 +73,53 @@ select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
   0 | -2147483647
  (1 row)
  
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+-- Ensure we also get a LIMIT plan with DISTINCT ON
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 ORDER BY 1;
+            QUERY PLAN            
+----------------------------------
+ Result
+   ->  Limit
+         ->  Seq Scan on tenk1
+               Filter: (four = 0)
+(4 rows)
+
+-- and check the result of the above query is correct
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 ORDER BY 1;
+ four | two 
+------+-----
+    0 |   0
+(1 row)
+
+-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+            QUERY PLAN            
+----------------------------------
+ Limit
+   ->  Sort
+         Sort Key: two
+         ->  Seq Scan on tenk1
+               Filter: (four = 0)
+(5 rows)
+
+-- Same again but use a column that is indexed so that we get an index scan
+-- then a limit
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,hundred
+   FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ Result
+   ->  Limit
+         ->  Index Scan using tenk1_hundred on tenk1
+               Filter: (four = 0)
+(4 rows)
+
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql

index f27ff714f8d397a87d1ea44cd85f27f5635d5461..34020adad1dbf11a72677987f615a0909d1a8150 100644 (file)
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -146,6 +146,32 @@ RESET min_parallel_table_scan_size;
  RESET parallel_setup_cost;
  RESET parallel_tuple_cost;
  
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+
+-- Ensure the above gives us the correct result
+SELECT DISTINCT four FROM tenk1 WHERE four = 0;
+
+-- Ensure we get a plan with a Limit 1
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+
+-- Ensure no rows are returned
+SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
+
+-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+
+-- Ensure we only get 1 row
+SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
+
  --
  -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
  -- very own regression file.
diff --git a/src/test/regress/sql/select_distinct_on.sql b/src/test/regress/sql/select_distinct_on.sql

index 0920bd64b96e08bdc984f2f29370cc961550fe47..261e6140feb151484687cfeaf5e3270682095f71 100644 (file)
--- a/src/test/regress/sql/select_distinct_on.sql
+++ b/src/test/regress/sql/select_distinct_on.sql
@@ -17,3 +17,28 @@ SELECT DISTINCT ON (string4, ten) string4, ten, two
  
  -- bug #5049: early 8.4.x chokes on volatile DISTINCT ON clauses
  select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
+
+--
+-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
+-- all of the distinct_pathkeys have been marked as redundant
+--
+
+-- Ensure we also get a LIMIT plan with DISTINCT ON
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 ORDER BY 1;
+
+-- and check the result of the above query is correct
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 ORDER BY 1;
+
+-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,two
+   FROM tenk1 WHERE four = 0 ORDER BY 1,2;
+
+-- Same again but use a column that is indexed so that we get an index scan
+-- then a limit
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (four) four,hundred
+   FROM tenk1 WHERE four = 0 ORDER BY 1,2;
author	David Rowley <[email protected]>
	Fri, 28 Oct 2022 10:04:38 +0000 (23:04 +1300)
committer	David Rowley <[email protected]>
	Fri, 28 Oct 2022 10:04:38 +0000 (23:04 +1300)
src/backend/optimizer/plan/planner.c		patch \| blob \| blame \| history
src/test/regress/expected/select_distinct.out		patch \| blob \| blame \| history
src/test/regress/expected/select_distinct_on.out		patch \| blob \| blame \| history
src/test/regress/sql/select_distinct.sql		patch \| blob \| blame \| history
src/test/regress/sql/select_distinct_on.sql		patch \| blob \| blame \| history