
Commit 81d7747

alexander-daskalov authored and maropu committed
[MINOR][SQL] Fixed approx_count_distinct rsd param description
### What changes were proposed in this pull request?

In the docs concerning approx_count_distinct I have changed the description of the rsd parameter from **_maximum estimation error allowed_** to **_maximum relative standard deviation allowed_**.

### Why are the changes needed?

"Maximum estimation error allowed" can be misleading. You can set the target relative standard deviation, which affects the estimation error, but on a given run the estimation error can still be above the rsd parameter.

### Does this PR introduce _any_ user-facing change?

This PR should make it easier for users reading the docs to understand that the rsd parameter in approx_count_distinct doesn't cap the estimation error, but just sets the target deviation instead.

### How was this patch tested?

No tests, as no code changes were made.

Closes apache#29424 from Comonut/fix-approx_count_distinct-rsd-param-description.

Authored-by: alexander-daskalov <[email protected]>
Signed-off-by: Takeshi Yamamuro <[email protected]>
(cherry picked from commit 10edeaf)
Signed-off-by: Takeshi Yamamuro <[email protected]>
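The distinction the PR draws can be illustrated with a small simulation. This is a sketch, not Spark code: it assumes the relative error of each `approx_count_distinct` run behaves roughly like a normal draw with standard deviation `rsd` (a simplification of the HyperLogLog++ error behavior). Under that model, a sizable fraction of runs land outside the `rsd` "target", which is exactly why "maximum estimation error" was a misleading label.

```python
import random

def fraction_exceeding_rsd(rsd: float, runs: int = 100_000, seed: int = 42) -> float:
    """Simulate runs whose relative error is ~ N(0, rsd) and count how often
    the absolute error exceeds rsd itself. Illustrative model only."""
    rng = random.Random(seed)
    exceed = sum(1 for _ in range(runs) if abs(rng.gauss(0.0, rsd)) > rsd)
    return exceed / runs

# For a normal distribution, |error| exceeds one standard deviation about
# 32% of the time, so roughly a third of simulated runs miss the rsd target.
share = fraction_exceeding_rsd(rsd=0.05)
```

The takeaway matches the renamed docs: `rsd` is the standard deviation of the estimate's relative error, not an upper bound on it.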
1 parent 89765f5 commit 81d7747

File tree

6 files changed: +11 −10 lines changed


R/pkg/R/functions.R

Lines changed: 1 addition & 1 deletion

```diff
@@ -2199,7 +2199,7 @@ setMethod("pmod", signature(y = "Column"),
             column(jc)
           })
-#' @param rsd maximum estimation error allowed (default = 0.05).
+#' @param rsd maximum relative standard deviation allowed (default = 0.05).
 #'
 #' @rdname column_aggregate_functions
 #' @aliases approx_count_distinct,Column-method
```

python/pyspark/sql/functions.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -335,8 +335,8 @@ def approx_count_distinct(col, rsd=None):
     """Aggregate function: returns a new :class:`Column` for approximate distinct count of
     column `col`.
 
-    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
-        efficient to use :func:`countDistinct`
+    :param rsd: maximum relative standard deviation allowed (default = 0.05).
+        For rsd < 0.01, it is more efficient to use :func:`countDistinct`
 
     >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
     [Row(distinct_ages=2)]
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala

Lines changed: 2 additions & 1 deletion

```diff
@@ -39,7 +39,8 @@ import org.apache.spark.unsafe.Platform
  * and its elements should be sorted into ascending order.
  * Duplicate endpoints are allowed, e.g. (1, 5, 5, 10), and ndv for
  * interval (5, 5] would be 1.
- * @param relativeSD The maximum estimation error allowed in the HyperLogLogPlusPlus algorithm.
+ * @param relativeSD The maximum relative standard deviation allowed
+ *                   in the HyperLogLogPlusPlus algorithm.
  */
 case class ApproxCountDistinctForIntervals(
     child: Expression,
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlus.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -41,13 +41,13 @@ import org.apache.spark.sql.types._
  * https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/view?fullscreen#
  *
  * @param child to estimate the cardinality of.
- * @param relativeSD the maximum estimation error allowed.
+ * @param relativeSD the maximum relative standard deviation allowed.
  */
 // scalastyle:on
 @ExpressionDescription(
   usage = """
     _FUNC_(expr[, relativeSD]) - Returns the estimated cardinality by HyperLogLog++.
-      `relativeSD` defines the maximum estimation error allowed.""",
+      `relativeSD` defines the maximum relative standard deviation allowed.""",
   examples = """
     Examples:
       > SELECT _FUNC_(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1);
```
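For context on why `relativeSD` is a standard deviation: in the classic HyperLogLog analysis the standard error of the estimate is about `1.04 / sqrt(m)` for `m = 2^p` registers, so an implementation can derive a minimum precision `p` from the requested rsd. The sketch below is a back-of-the-envelope illustration of that relation, not Spark's exact sizing code; the minimum-precision floor of 4 is an assumption typical of HLL implementations.

```python
import math

def precision_for_rsd(rsd: float) -> int:
    """Smallest precision p such that the classic HLL standard error
    1.04 / sqrt(2**p) is at or below the requested rsd (sketch only)."""
    p = math.ceil(2.0 * math.log2(1.04 / rsd))
    return max(p, 4)  # assumed floor; real implementations enforce a small minimum p

# The default rsd = 0.05 already needs 2^9 = 512 registers, which is why
# very small rsd values make exact countDistinct the cheaper option.
```

Halving the rsd quadruples the register count, matching the `1/sqrt(m)` error decay.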

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -1578,8 +1578,8 @@ object SQLConf {
   val NDV_MAX_ERROR =
     buildConf("spark.sql.statistics.ndv.maxError")
       .internal()
-      .doc("The maximum estimation error allowed in HyperLogLog++ algorithm when generating " +
-        "column level statistics.")
+      .doc("The maximum relative standard deviation allowed in HyperLogLog++ algorithm " +
+        "when generating column level statistics.")
       .version("2.1.1")
       .doubleConf
       .createWithDefault(0.05)
```

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -262,7 +262,7 @@ object functions {
   /**
    * Aggregate function: returns the approximate number of distinct items in a group.
    *
-   * @param rsd maximum estimation error allowed (default = 0.05)
+   * @param rsd maximum relative standard deviation allowed (default = 0.05)
    *
    * @group agg_funcs
    * @since 2.1.0
@@ -274,7 +274,7 @@ object functions {
   /**
    * Aggregate function: returns the approximate number of distinct items in a group.
    *
-   * @param rsd maximum estimation error allowed (default = 0.05)
+   * @param rsd maximum relative standard deviation allowed (default = 0.05)
    *
    * @group agg_funcs
    * @since 2.1.0
```
