[SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING
## What changes were proposed in this pull request? `colStat.min` and `colStat.max` are empty for string type. Thus, `evaluateInSet` should not return zero when either `colStat.min` or `colStat.max` is empty. ## How was this patch tested? Added a test case. Author: gatorsmile <gatorsmile@gmail.com> Closes #21147 from gatorsmile/cached.
This commit is contained in:
parent
d1eb8d3ddc
commit
ce2f919f8d
|
@ -392,13 +392,13 @@ case class FilterEstimation(plan: Filter) extends Logging {
|
|||
val dataType = attr.dataType
|
||||
var newNdv = ndv
|
||||
|
||||
if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
|
||||
return Some(0.0)
|
||||
}
|
||||
|
||||
// use [min, max] to filter the original hSet
|
||||
dataType match {
|
||||
case _: NumericType | BooleanType | DateType | TimestampType =>
|
||||
if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
|
||||
return Some(0.0)
|
||||
}
|
||||
|
||||
val statsInterval =
|
||||
ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval]
|
||||
val validQuerySet = hSet.filter { v =>
|
||||
|
@ -422,6 +422,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
|
|||
|
||||
// We assume the whole set since there is no min/max information for String/Binary type
|
||||
case StringType | BinaryType =>
|
||||
if (ndv.toDouble == 0) {
|
||||
return Some(0.0)
|
||||
}
|
||||
|
||||
newNdv = ndv.min(BigInt(hSet.size))
|
||||
if (update) {
|
||||
val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount = Some(0))
|
||||
|
|
|
@ -368,6 +368,18 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
|
|||
expectedRowCount = 0)
|
||||
}
|
||||
|
||||
test("evaluateInSet with string") {
|
||||
validateEstimatedStats(
|
||||
Filter(InSet(attrString, Set("A0")),
|
||||
StatsTestPlan(Seq(attrString), 10,
|
||||
AttributeMap(Seq(attrString ->
|
||||
ColumnStat(distinctCount = Some(10), min = None, max = None,
|
||||
nullCount = Some(0), avgLen = Some(2), maxLen = Some(2)))))),
|
||||
Seq(attrString -> ColumnStat(distinctCount = Some(1), min = None, max = None,
|
||||
nullCount = Some(0), avgLen = Some(2), maxLen = Some(2))),
|
||||
expectedRowCount = 1)
|
||||
}
|
||||
|
||||
test("cint NOT IN (3, 4, 5)") {
|
||||
validateEstimatedStats(
|
||||
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
|
||||
|
|
Loading…
Reference in a new issue