[SPARK-18111][SQL] Wrong ApproximatePercentile answer when multiple records have the minimum value
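## What changes were proposed in this pull request?

When multiple records have the minimum value, the answer of ApproximatePercentile is wrong. This patch changes the check that prepends the minimum sample from `currHead.value < head.value` to `currHead.value <= head.value && currentSamples.length > 1`, so the minimum element is still added when it ties with `head`, but not when `currentSamples` has only one element (in which case `currHead` and `head` are the same element).

## How was this patch tested?

Added a test case.

Author: wangzhenhua <wangzhenhua@huawei.com>

Closes #15641 from wzhfy/percentile.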
This commit is contained in:
parent
623fc7fc67
commit
cb80edc263
|
@@ -264,7 +264,9 @@ object QuantileSummaries {
|
||||||
res.prepend(head)
|
res.prepend(head)
|
||||||
// If necessary, add the minimum element:
|
// If necessary, add the minimum element:
|
||||||
val currHead = currentSamples.head
|
val currHead = currentSamples.head
|
||||||
if (currHead.value < head.value) {
|
// don't add the minimum element if `currentSamples` has only one element (both `currHead` and
|
||||||
|
// `head` point to the same element)
|
||||||
|
if (currHead.value <= head.value && currentSamples.length > 1) {
|
||||||
res.prepend(currentSamples.head)
|
res.prepend(currentSamples.head)
|
||||||
}
|
}
|
||||||
res.toArray
|
res.toArray
|
||||||
|
|
|
@@ -64,6 +64,17 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSQLContext {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("percentile_approx, multiple records with the minimum value in a partition") {
|
||||||
|
withTempView(table) {
|
||||||
|
spark.sparkContext.makeRDD(Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5), 4).toDF("col")
|
||||||
|
.createOrReplaceTempView(table)
|
||||||
|
checkAnswer(
|
||||||
|
spark.sql(s"SELECT percentile_approx(col, array(0.5)) FROM $table"),
|
||||||
|
Row(Seq(1.0D))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
test("percentile_approx, with different accuracies") {
|
test("percentile_approx, with different accuracies") {
|
||||||
|
|
||||||
withTempView(table) {
|
withTempView(table) {
|
||||||
|
|
Loading…
Reference in a new issue