[SPARK-27403][SQL] Fix updateTableStats to update table stats always with new stats or None

## What changes were proposed in this pull request?

The system should update the table stats automatically when the user sets spark.sql.statistics.size.autoUpdate.enabled to true; currently this property has no effect regardless of whether it is enabled or disabled. This feature is similar to Hive's auto-gather feature, where statistics are automatically computed by default when the feature is enabled.
Reference:
https://cwiki.apache.org/confluence/display/Hive/StatsDev

As part of the fix, the autoSizeUpdateEnabled check is performed first, so that the system automatically calculates the table size and records it in the metastore, as the user expects.

## How was this patch tested?
A unit test was written, and the change was manually verified on a cluster.
Tested with unit tests + some internal tests on real cluster.

Before fix:

![image](https://user-images.githubusercontent.com/12999161/55688682-cd8d4780-5998-11e9-85da-e1a4e34419f6.png)

After fix
![image](https://user-images.githubusercontent.com/12999161/55688654-7d15ea00-5998-11e9-973f-1f4cee27018f.png)

Closes #24315 from sujith71955/master_autoupdate.

Authored-by: s71955 <sujithchacko.2010@gmail.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
s71955 2019-04-11 08:53:00 -07:00 committed by Dongjoon Hyun
parent d33ae2e9ed
commit 239082d966
2 changed files with 28 additions and 10 deletions

View file

@ -42,18 +42,16 @@ object CommandUtils extends Logging {
/**
 * Change statistics after changing data by commands.
 *
 * When `spark.sql.statistics.size.autoUpdate.enabled` is true, the table size is
 * recomputed and persisted to the metastore regardless of whether stats already
 * exist; otherwise any previously recorded stats are cleared, since a
 * data-changing command has made them stale.
 *
 * NOTE: the original text here was a flattened diff that kept both the pre-fix
 * `if (table.stats.nonEmpty)` wrapper and the post-fix `else if` branch, which
 * is not valid Scala; this is the reconstructed post-change version.
 */
def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = {
  val catalog = sparkSession.sessionState.catalog
  if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
    // Recompute the physical size from fresh metadata and record it, so the
    // optimizer sees up-to-date numbers even if no stats existed before.
    val newTable = catalog.getTableMetadata(table.identifier)
    val newSize = CommandUtils.calculateTotalSize(sparkSession, newTable)
    val newStats = CatalogStatistics(sizeInBytes = newSize)
    catalog.alterTableStats(table.identifier, Some(newStats))
  } else if (table.stats.nonEmpty) {
    // Auto-update disabled: drop the now-stale stats rather than leave
    // incorrect values in the metastore.
    catalog.alterTableStats(table.identifier, None)
  }
}
def calculateTotalSize(spark: SparkSession, catalogTable: CatalogTable): BigInt = {
val sessionState = spark.sessionState

View file

@ -337,6 +337,26 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
}
}
test("auto gather stats after insert command") {
  // Stats should be auto-recorded after INSERT iff AUTO_SIZE_UPDATE_ENABLED is on.
  val tableName = "change_stats_insert_datasource_table"
  for (autoUpdate <- Seq(false, true)) {
    withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> autoUpdate.toString) {
      withTable(tableName) {
        sql(s"CREATE TABLE $tableName (i int, j string) USING PARQUET")
        // insert into command
        sql(s"INSERT INTO TABLE $tableName SELECT 1, 'abc'")
        val collectedStats = getCatalogTable(tableName).stats
        if (autoUpdate) {
          // Auto-update on: size stats must have been written to the catalog.
          assert(collectedStats.isDefined)
          assert(collectedStats.get.sizeInBytes >= 0)
        } else {
          // Auto-update off: no stats should be recorded for a fresh table.
          assert(collectedStats.isEmpty)
        }
      }
    }
  }
}
test("invalidation of tableRelationCache after inserts") {
val table = "invalidate_catalog_cache_table"
Seq(false, true).foreach { autoUpdate =>