From 7c14f177eb5b52d491f41b217926cc8ca5f0ce4c Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Tue, 22 Sep 2020 12:45:19 -0700
Subject: [PATCH] [SPARK-32306][SQL][DOCS] Clarify the result of `percentile_approx()`

### What changes were proposed in this pull request?
More precise description of the result of the `percentile_approx()` function and its synonym `approx_percentile()`. The proposed sentence clarifies that the function returns **one of the elements** (or an array of elements) of the input column.

### Why are the changes needed?
To improve Spark docs and avoid misunderstanding of the function behavior.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
`./dev/scalastyle`

Closes #29835 from MaxGekk/doc-percentile_approx.

Authored-by: Max Gekk
Signed-off-by: Liang-Chi Hsieh
---
 R/pkg/R/functions.R                                  |  6 ++++--
 python/pyspark/sql/functions.py                      |  4 +++-
 .../aggregate/ApproximatePercentile.scala            | 12 +++++++-----
 .../main/scala/org/apache/spark/sql/functions.scala  |  5 +++--
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 1d75819cb6..2d1667f563 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1417,8 +1417,10 @@ setMethod("quarter",
           })
 
 #' @details
-#' \code{percentile_approx} Returns the approximate percentile value of
-#' numeric column at the given percentage.
+#' \code{percentile_approx} Returns the approximate \code{percentile} of the numeric column
+#' \code{col} which is the smallest value in the ordered \code{col} values (sorted from least to
+#' greatest) such that no more than \code{percentage} of \code{col} values is less than the value
+#' or equal to that value.
 #'
 #' @param percentage Numeric percentage at which percentile should be computed
 #'                   All values should be between 0 and 1.
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index f01bdb0165..14d101a652 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -592,7 +592,9 @@ def nanvl(col1, col2):
 
 @since(3.1)
 def percentile_approx(col, percentage, accuracy=10000):
-    """Returns the approximate percentile value of numeric column col at the given percentage.
+    """Returns the approximate `percentile` of the numeric column `col` which is the smallest value
+    in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`
+    of `col` values is less than the value or equal to that value.
 
     The value of percentage must be between 0.0 and 1.0.
     The accuracy parameter (default: 10000)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
index d06eeeef23..7a1eec1a30 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala
@@ -49,11 +49,13 @@ import org.apache.spark.sql.types._
  */
 @ExpressionDescription(
   usage = """
-    _FUNC_(col, percentage [, accuracy]) - Returns the approximate percentile value of numeric
-      column `col` at the given percentage. The value of percentage must be between 0.0
-      and 1.0. The `accuracy` parameter (default: 10000) is a positive numeric literal which
-      controls approximation accuracy at the cost of memory. Higher value of `accuracy` yields
-      better accuracy, `1.0/accuracy` is the relative error of the approximation.
+    _FUNC_(col, percentage [, accuracy]) - Returns the approximate `percentile` of the numeric
+      column `col` which is the smallest value in the ordered `col` values (sorted from least to
+      greatest) such that no more than `percentage` of `col` values is less than the value
+      or equal to that value. The value of percentage must be between 0.0 and 1.0. The `accuracy`
+      parameter (default: 10000) is a positive numeric literal which controls approximation accuracy
+      at the cost of memory. Higher value of `accuracy` yields better accuracy, `1.0/accuracy` is
+      the relative error of the approximation.
       When `percentage` is an array, each value of the percentage array must be between 0.0 and 1.0.
       In this case, returns the approximate percentile array of column `col`
      at the given percentage array.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index b20e8c241e..acf845d6ec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -684,8 +684,9 @@ object functions {
   def min(columnName: String): Column = min(Column(columnName))
 
   /**
-   * Aggregate function: returns and array of the approximate percentile values
-   * of numeric column col at the given percentages.
+   * Aggregate function: returns the approximate `percentile` of the numeric column `col` which
+   * is the smallest value in the ordered `col` values (sorted from least to greatest) such that
+   * no more than `percentage` of `col` values is less than the value or equal to that value.
    *
    * If percentage is an array, each value must be between 0.0 and 1.0.
    * If it is a single floating point value, it must be between 0.0 and 1.0.
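
For reference (not part of the patch): a minimal Scala sketch illustrating the clarified semantics, namely that `percentile_approx` returns one of the elements of the input column rather than an interpolated value. The object name, the column name `value`, and the local `SparkSession` setup are assumptions for illustration only; the `percentile_approx` function itself is the one declared in `org.apache.spark.sql.functions` above (Spark 3.1+).

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, percentile_approx}

object PercentileApproxSketch {
  def main(args: Array[String]): Unit = {
    // Local session just for this sketch; any Spark 3.1+ session works.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("percentile_approx-sketch")
      .getOrCreate()
    import spark.implicits._

    // Ten distinct values. The exact (interpolated) median of 1..10 would be 5.5,
    // but percentile_approx picks an element of the column instead.
    val df = (1 to 10).toDF("value")

    // Median (percentage = 0.5) with the default accuracy of 10000.
    df.select(percentile_approx(col("value"), lit(0.5), lit(10000)).as("approx_median"))
      .show()
    // Should print 5: the smallest ordered element such that no more than 50% of
    // the values fall at or below it, per the clarified documentation.

    spark.stop()
  }
}
```

The same behavior applies when `percentage` is an array of values, in which case each returned entry of the result array is likewise drawn from the input column's elements.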