[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…
## What changes were proposed in this pull request?
The upper bound of group-by columns row number is to multiply distinct counts of group-by columns. However, column with only null value will cause the output row number to be 0 which is incorrect.
Ex:
col1 (distinct: 2, rowCount 2)
col2 (distinct: 0, rowCount 2)
=> group by col1, col2
Actual: output rows: 0
Expected: output rows: 2
## How was this patch tested?
According unit test has been added, plus manual test has been done in our tpcds benchmark environement.
Closes #24286 from pengbo/master.
Lead-authored-by: pengbo <bo.peng1019@gmail.com>
Co-authored-by: mingbo_pb <mingbo.pb@alibaba-inc.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
(cherry picked from commit c58a4fed8d
)
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
parent
ede02b692f
commit
40668c53ed
|
@ -39,8 +39,16 @@ object AggregateEstimation {
|
||||||
// Multiply distinct counts of group-by columns. This is an upper bound, which assumes
|
// Multiply distinct counts of group-by columns. This is an upper bound, which assumes
|
||||||
// the data contains all combinations of distinct values of group-by columns.
|
// the data contains all combinations of distinct values of group-by columns.
|
||||||
var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))(
|
var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))(
|
||||||
(res, expr) => res *
|
(res, expr) => {
|
||||||
childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get)
|
val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
|
||||||
|
val distinctCount = columnStat.distinctCount.get
|
||||||
|
val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
|
||||||
|
1
|
||||||
|
} else {
|
||||||
|
distinctCount
|
||||||
|
}
|
||||||
|
res * distinctValue
|
||||||
|
})
|
||||||
|
|
||||||
outputRows = if (agg.groupingExpressions.isEmpty) {
|
outputRows = if (agg.groupingExpressions.isEmpty) {
|
||||||
// If there's no group-by columns, the output is a single row containing values of aggregate
|
// If there's no group-by columns, the output is a single row containing values of aggregate
|
||||||
|
|
|
@ -38,7 +38,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
|
||||||
attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = Some(20),
|
attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = Some(20),
|
||||||
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
|
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
|
||||||
attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
|
attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
|
||||||
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))
|
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
|
||||||
|
attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
|
||||||
|
nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
|
||||||
))
|
))
|
||||||
|
|
||||||
private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
|
private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
|
||||||
|
@ -92,6 +94,14 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
|
||||||
expectedOutputRowCount = 0)
|
expectedOutputRowCount = 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("group-by column with only null value") {
|
||||||
|
checkAggStats(
|
||||||
|
tableColumns = Seq("key22", "key32"),
|
||||||
|
tableRowCount = 6,
|
||||||
|
groupByColumns = Seq("key22", "key32"),
|
||||||
|
expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
|
||||||
|
}
|
||||||
|
|
||||||
test("non-cbo estimation") {
|
test("non-cbo estimation") {
|
||||||
val attributes = Seq("key12").map(nameToAttr)
|
val attributes = Seq("key12").map(nameToAttr)
|
||||||
val child = StatsTestPlan(
|
val child = StatsTestPlan(
|
||||||
|
|
Loading…
Reference in a new issue