[SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING

## What changes were proposed in this pull request?
`colStat.min` and `colStat.max` are empty for string type. Thus, `evaluateInSet` should not return zero merely because either `colStat.min` or `colStat.max` is empty.

## How was this patch tested?
Added a test case.

Author: gatorsmile <gatorsmile@gmail.com>

Closes #21147 from gatorsmile/cached.
This commit is contained in:
gatorsmile 2018-04-26 19:07:13 +08:00 committed by Wenchen Fan
parent d1eb8d3ddc
commit ce2f919f8d
2 changed files with 20 additions and 4 deletions

View file

@ -392,13 +392,13 @@ case class FilterEstimation(plan: Filter) extends Logging {
val dataType = attr.dataType
var newNdv = ndv
if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
return Some(0.0)
}
// use [min, max] to filter the original hSet
dataType match {
case _: NumericType | BooleanType | DateType | TimestampType =>
if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
return Some(0.0)
}
val statsInterval =
ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval]
val validQuerySet = hSet.filter { v =>
@ -422,6 +422,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
// We assume the whole set since there is no min/max information for String/Binary type
case StringType | BinaryType =>
if (ndv.toDouble == 0) {
return Some(0.0)
}
newNdv = ndv.min(BigInt(hSet.size))
if (update) {
val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount = Some(0))

View file

@ -368,6 +368,18 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
expectedRowCount = 0)
}
test("evaluateInSet with string") {
  // The string column has no min/max statistics (both are None), so the IN-set
  // estimate cannot be narrowed by a value range.
  val childStat = ColumnStat(distinctCount = Some(10), min = None, max = None,
    nullCount = Some(0), avgLen = Some(2), maxLen = Some(2))
  // Only the distinct count is expected to change: a one-element set leaves one distinct value.
  val filteredStat = childStat.copy(distinctCount = Some(1))
  val childPlan = StatsTestPlan(Seq(attrString), 10,
    AttributeMap(Seq(attrString -> childStat)))

  validateEstimatedStats(
    Filter(InSet(attrString, Set("A0")), childPlan),
    Seq(attrString -> filteredStat),
    expectedRowCount = 1)
}
test("cint NOT IN (3, 4, 5)") {
validateEstimatedStats(
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),