[SPARK-17439][SQL] Fixing compression issues with approximate quantiles and adding more tests
## What changes were proposed in this pull request? This PR build on #14976 and fixes a correctness bug that would cause the wrong quantile to be returned for small target errors. ## How was this patch tested? This PR adds 8 unit tests that were failing without the fix. Author: Timothy Hunter <timhunter@databricks.com> Author: Sean Owen <sowen@cloudera.com> Closes #15002 from thunterdb/ml-1783.
This commit is contained in:
parent
29ba9578f4
commit
180796ecb3
|
@ -17,7 +17,7 @@
|
||||||
|
|
||||||
package org.apache.spark.sql.catalyst.util
|
package org.apache.spark.sql.catalyst.util
|
||||||
|
|
||||||
import scala.collection.mutable.ArrayBuffer
|
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
|
||||||
|
|
||||||
import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats
|
import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats
|
||||||
|
|
||||||
|
@ -61,7 +61,12 @@ class QuantileSummaries(
|
||||||
def insert(x: Double): QuantileSummaries = {
|
def insert(x: Double): QuantileSummaries = {
|
||||||
headSampled += x
|
headSampled += x
|
||||||
if (headSampled.size >= defaultHeadSize) {
|
if (headSampled.size >= defaultHeadSize) {
|
||||||
this.withHeadBufferInserted
|
val result = this.withHeadBufferInserted
|
||||||
|
if (result.sampled.length >= compressThreshold) {
|
||||||
|
result.compress()
|
||||||
|
} else {
|
||||||
|
result
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
this
|
this
|
||||||
}
|
}
|
||||||
|
@ -236,7 +241,7 @@ object QuantileSummaries {
|
||||||
if (currentSamples.isEmpty) {
|
if (currentSamples.isEmpty) {
|
||||||
return Array.empty[Stats]
|
return Array.empty[Stats]
|
||||||
}
|
}
|
||||||
val res: ArrayBuffer[Stats] = ArrayBuffer.empty
|
val res = ListBuffer.empty[Stats]
|
||||||
// Start for the last element, which is always part of the set.
|
// Start for the last element, which is always part of the set.
|
||||||
// The head contains the current new head, that may be merged with the current element.
|
// The head contains the current new head, that may be merged with the current element.
|
||||||
var head = currentSamples.last
|
var head = currentSamples.last
|
||||||
|
@ -258,7 +263,10 @@ object QuantileSummaries {
|
||||||
}
|
}
|
||||||
res.prepend(head)
|
res.prepend(head)
|
||||||
// If necessary, add the minimum element:
|
// If necessary, add the minimum element:
|
||||||
res.prepend(currentSamples.head)
|
val currHead = currentSamples.head
|
||||||
|
if (currHead.value < head.value) {
|
||||||
|
res.prepend(currentSamples.head)
|
||||||
|
}
|
||||||
res.toArray
|
res.toArray
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,6 +40,20 @@ class QuantileSummariesSuite extends SparkFunSuite {
|
||||||
summary.compress()
|
summary.compress()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interleaves compression and insertions.
|
||||||
|
*/
|
||||||
|
private def buildCompressSummary(
|
||||||
|
data: Seq[Double],
|
||||||
|
epsi: Double,
|
||||||
|
threshold: Int): QuantileSummaries = {
|
||||||
|
var summary = new QuantileSummaries(threshold, epsi)
|
||||||
|
data.foreach { x =>
|
||||||
|
summary = summary.insert(x).compress()
|
||||||
|
}
|
||||||
|
summary
|
||||||
|
}
|
||||||
|
|
||||||
private def checkQuantile(quant: Double, data: Seq[Double], summary: QuantileSummaries): Unit = {
|
private def checkQuantile(quant: Double, data: Seq[Double], summary: QuantileSummaries): Unit = {
|
||||||
val approx = summary.query(quant)
|
val approx = summary.query(quant)
|
||||||
// The rank of the approximation.
|
// The rank of the approximation.
|
||||||
|
@ -54,8 +68,8 @@ class QuantileSummariesSuite extends SparkFunSuite {
|
||||||
|
|
||||||
for {
|
for {
|
||||||
(seq_name, data) <- Seq(increasing, decreasing, random)
|
(seq_name, data) <- Seq(increasing, decreasing, random)
|
||||||
epsi <- Seq(0.1, 0.0001)
|
epsi <- Seq(0.1, 0.0001) // With a significant value and with full precision
|
||||||
compression <- Seq(1000, 10)
|
compression <- Seq(1000, 10) // This interleaves n so that we test without and with compression
|
||||||
} {
|
} {
|
||||||
|
|
||||||
test(s"Extremas with epsi=$epsi and seq=$seq_name, compression=$compression") {
|
test(s"Extremas with epsi=$epsi and seq=$seq_name, compression=$compression") {
|
||||||
|
@ -75,6 +89,17 @@ class QuantileSummariesSuite extends SparkFunSuite {
|
||||||
checkQuantile(0.1, data, s)
|
checkQuantile(0.1, data, s)
|
||||||
checkQuantile(0.001, data, s)
|
checkQuantile(0.001, data, s)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test(s"Some quantile values with epsi=$epsi and seq=$seq_name, compression=$compression " +
|
||||||
|
s"(interleaved)") {
|
||||||
|
val s = buildCompressSummary(data, epsi, compression)
|
||||||
|
assert(s.count == data.size, s"Found count=${s.count} but data size=${data.size}")
|
||||||
|
checkQuantile(0.9999, data, s)
|
||||||
|
checkQuantile(0.9, data, s)
|
||||||
|
checkQuantile(0.5, data, s)
|
||||||
|
checkQuantile(0.1, data, s)
|
||||||
|
checkQuantile(0.001, data, s)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tests for merging procedure
|
// Tests for merging procedure
|
||||||
|
|
Loading…
Reference in a new issue