From 7c14f177eb5b52d491f41b217926cc8ca5f0ce4c Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Tue, 22 Sep 2020 12:45:19 -0700
Subject: [PATCH] [SPARK-32306][SQL][DOCS] Clarify the result of `percentile_approx()`

### What changes were proposed in this pull request?
More precise description of the result of the `percentile_approx()` function and its synonym `approx_percentile()`. The proposed sentence clarifies that the function returns **one of the elements** (or an array of elements) of the input column.

### Why are the changes needed?
To improve Spark docs and avoid misunderstanding of the function behavior.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
`./dev/scalastyle`

Closes #29835 from MaxGekk/doc-percentile_approx.

Authored-by: Max Gekk
Signed-off-by: Liang-Chi Hsieh
---
 R/pkg/R/functions.R                                  |  6 ++++--
 python/pyspark/sql/functions.py                      |  4 +++-
 .../aggregate/ApproximatePercentile.scala            | 12 +++++++-----
 .../main/scala/org/apache/spark/sql/functions.scala  |  5 +++--
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 1d75819cb6..2d1667f563 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1417,8 +1417,10 @@ setMethod("quarter",
           })
 
 #' @details
-#' \code{percentile_approx} Returns the approximate percentile value of
-#' numeric column at the given percentage.
+#' \code{percentile_approx} Returns the approximate \code{percentile} of the numeric column
+#' \code{col} which is the smallest value in the ordered \code{col} values (sorted from least to
+#' greatest) such that no more than \code{percentage} of \code{col} values is less than the value
+#' or equal to that value.
 #'
 #' @param percentage Numeric percentage at which percentile should be computed
 #'                   All values should be between 0 and 1.
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index f01bdb0165..14d101a652 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -592,7 +592,9 @@ def nanvl(col1, col2):
 
 @since(3.1)
 def percentile_approx(col, percentage, accuracy=10000):
-    """Returns the approximate percentile value of numeric column col at the given percentage.
+    """Returns the approximate `percentile` of the numeric column `col` which is the smallest value
+    in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`
+    of `col` values is less than the value or equal to that value.
 
     The value of percentage must be between 0.0 and 1.0.
     The accuracy parameter (default: 10000)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
index d06eeeef23..7a1eec1a30 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
@@ -49,11 +49,13 @@ import org.apache.spark.sql.types._
  */
 @ExpressionDescription(
   usage = """
-    _FUNC_(col, percentage [, accuracy]) - Returns the approximate percentile value of numeric
-      column `col` at the given percentage. The value of percentage must be between 0.0
-      and 1.0. The `accuracy` parameter (default: 10000) is a positive numeric literal which
-      controls approximation accuracy at the cost of memory. Higher value of `accuracy` yields
-      better accuracy, `1.0/accuracy` is the relative error of the approximation.
+    _FUNC_(col, percentage [, accuracy]) - Returns the approximate `percentile` of the numeric
+      column `col` which is the smallest value in the ordered `col` values (sorted from least to
+      greatest) such that no more than `percentage` of `col` values is less than the value
+      or equal to that value. The value of percentage must be between 0.0 and 1.0. The `accuracy`
+      parameter (default: 10000) is a positive numeric literal which controls approximation accuracy
+      at the cost of memory. Higher value of `accuracy` yields better accuracy, `1.0/accuracy` is
+      the relative error of the approximation.
       When `percentage` is an array, each value of the percentage array must be between 0.0 and 1.0.
       In this case, returns the approximate percentile array of column `col`
      at the given percentage array.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index b20e8c241e..acf845d6ec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -684,8 +684,9 @@ object functions {
   def min(columnName: String): Column = min(Column(columnName))
 
   /**
-   * Aggregate function: returns and array of the approximate percentile values
-   * of numeric column col at the given percentages.
+   * Aggregate function: returns the approximate `percentile` of the numeric column `col` which
+   * is the smallest value in the ordered `col` values (sorted from least to greatest) such that
+   * no more than `percentage` of `col` values is less than the value or equal to that value.
    *
    * If percentage is an array, each value must be between 0.0 and 1.0.
    * If it is a single floating point value, it must be between 0.0 and 1.0.
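
For reference (not part of the patch): a minimal Scala sketch illustrating the clarified semantics, namely that `percentile_approx` returns one of the elements of the input column rather than an interpolated value. The object name, the column name `value`, and the local `SparkSession` setup are assumptions for illustration only; the `percentile_approx` function itself is the one declared in `org.apache.spark.sql.functions` above (Spark 3.1+).

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, percentile_approx}

object PercentileApproxSketch {
  def main(args: Array[String]): Unit = {
    // Local session just for this sketch; any Spark 3.1+ session works.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("percentile_approx-sketch")
      .getOrCreate()
    import spark.implicits._

    // Ten distinct values. The exact (interpolated) median of 1..10 would be 5.5,
    // but percentile_approx picks an element of the column instead.
    val df = (1 to 10).toDF("value")

    // Median (percentage = 0.5) with the default accuracy of 10000.
    df.select(percentile_approx(col("value"), lit(0.5), lit(10000)).as("approx_median"))
      .show()
    // Should print 5: the smallest ordered element such that no more than 50% of
    // the values fall at or below it, per the clarified documentation.

    spark.stop()
  }
}
```

The same behavior applies when `percentage` is an array of values, in which case each returned entry of the result array is likewise drawn from the input column's elements.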