[SPARK-27539][SQL] Fix inaccurate aggregate outputRows estimation with column containing null values
## What changes were proposed in this pull request?
This PR is a follow-up of https://github.com/apache/spark/pull/24286. As gatorsmile pointed out, the estimation for a column that contains null values is inaccurate as well.
```
> select key from test;
2
NULL
1
spark-sql> desc extended test key;
col_name key
data_type int
comment NULL
min 1
max 2
num_nulls 1
distinct_count 2
```
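The distinct count used for the estimation should be distinct_count + 1 when the column contains null values, because the NULLs form one additional group that distinct_count does not include.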
## How was this patch tested?
Existing tests & new UT added.
Closes #24436 from pengbo/aggregation_estimation.
Authored-by: pengbo <bo.peng1019@gmail.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
(cherry picked from commit d9b2ce0f0f)
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
parent
4472a9fb08
commit
42cb4a2ccd
|
@ -42,8 +42,8 @@ object AggregateEstimation {
|
|||
(res, expr) => {
|
||||
val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute])
|
||||
val distinctCount = columnStat.distinctCount.get
|
||||
val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) {
|
||||
1
|
||||
val distinctValue: BigInt = if (columnStat.nullCount.get > 0) {
|
||||
distinctCount + 1
|
||||
} else {
|
||||
distinctCount
|
||||
}
|
||||
|
|
|
@ -40,7 +40,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
|
|||
attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
|
||||
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
|
||||
attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None,
|
||||
nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
|
||||
nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)),
|
||||
attr("key33") -> ColumnStat(distinctCount = Some(2), min = None, max = None,
|
||||
nullCount = Some(2), avgLen = Some(4), maxLen = Some(4))
|
||||
))
|
||||
|
||||
private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1)
|
||||
|
@ -102,6 +104,15 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
|
|||
expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
|
||||
}
|
||||
|
||||
test("group-by column with null value") {
|
||||
checkAggStats(
|
||||
tableColumns = Seq("key21", "key33"),
|
||||
tableRowCount = 6,
|
||||
groupByColumns = Seq("key21", "key33"),
|
||||
expectedOutputRowCount = nameToColInfo("key21")._2.distinctCount.get *
|
||||
(nameToColInfo("key33")._2.distinctCount.get + 1))
|
||||
}
|
||||
|
||||
test("non-cbo estimation") {
|
||||
val attributes = Seq("key12").map(nameToAttr)
|
||||
val child = StatsTestPlan(
|
||||
|
|
Loading…
Reference in a new issue