[SPARK-18111][SQL] Wrong ApproximatePercentile answer when multiple records have the minimum value
## What changes were proposed in this pull request? When multiple records share the minimum value, ApproximatePercentile returns a wrong answer. This patch corrects the condition in `QuantileSummaries` that decides whether to add the minimum element. ## How was this patch tested? Added a test case. Author: wangzhenhua <wangzhenhua@huawei.com> Closes #15641 from wzhfy/percentile.
This commit is contained in:
parent
623fc7fc67
commit
cb80edc263
|
@ -264,7 +264,9 @@ object QuantileSummaries {
|
|||
res.prepend(head)
|
||||
// If necessary, add the minimum element:
|
||||
val currHead = currentSamples.head
|
||||
if (currHead.value < head.value) {
|
||||
// don't add the minimum element if `currentSamples` has only one element (both `currHead` and
|
||||
// `head` point to the same element)
|
||||
if (currHead.value <= head.value && currentSamples.length > 1) {
|
||||
res.prepend(currentSamples.head)
|
||||
}
|
||||
res.toArray
|
||||
|
|
|
@ -64,6 +64,17 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSQLContext {
|
|||
}
|
||||
}
|
||||
|
||||
test("percentile_approx, multiple records with the minimum value in a partition") {
|
||||
withTempView(table) {
|
||||
spark.sparkContext.makeRDD(Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5), 4).toDF("col")
|
||||
.createOrReplaceTempView(table)
|
||||
checkAnswer(
|
||||
spark.sql(s"SELECT percentile_approx(col, array(0.5)) FROM $table"),
|
||||
Row(Seq(1.0D))
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
test("percentile_approx, with different accuracies") {
|
||||
|
||||
withTempView(table) {
|
||||
|
|
Loading…
Reference in a new issue