[SPARK-27403][SQL] Fix updateTableStats
to update table stats always with new stats or None
## What changes were proposed in this pull request? The system shall update the table stats automatically if the user sets spark.sql.statistics.size.autoUpdate.enabled to true; currently this property has no significance whether it is enabled or disabled. This feature is similar to Hive's auto-gather feature, where statistics are automatically computed by default if this feature is enabled. Reference: https://cwiki.apache.org/confluence/display/Hive/StatsDev As part of the fix, the autoSizeUpdateEnabled validation is now done first, so that the system will calculate the table size for the user automatically and record it in the metastore, as the user expects. ## How was this patch tested? A UT was written and the change was manually verified in a cluster. Tested with unit tests + some internal tests on a real cluster. Before fix: ![image](https://user-images.githubusercontent.com/12999161/55688682-cd8d4780-5998-11e9-85da-e1a4e34419f6.png) After fix ![image](https://user-images.githubusercontent.com/12999161/55688654-7d15ea00-5998-11e9-973f-1f4cee27018f.png) Closes #24315 from sujith71955/master_autoupdate. Authored-by: s71955 <sujithchacko.2010@gmail.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
parent
d33ae2e9ed
commit
239082d966
|
@ -42,18 +42,16 @@ object CommandUtils extends Logging {
|
|||
|
||||
/**
 * Change statistics after changing data by commands.
 *
 * When `spark.sql.statistics.size.autoUpdate.enabled` is true, the table size is
 * recomputed and written to the metastore after every data-changing command,
 * regardless of whether stats existed before (this is the SPARK-27403 fix: the
 * auto-update check must come first, not be nested under a non-empty-stats guard).
 * When auto-update is disabled, any existing stats are cleared because they are
 * no longer accurate after the data change.
 */
def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = {
  val catalog = sparkSession.sessionState.catalog
  if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
    // Re-read the metadata so the size is computed against the table's
    // current state, then persist fresh size-only statistics.
    val newTable = catalog.getTableMetadata(table.identifier)
    val newSize = CommandUtils.calculateTotalSize(sparkSession, newTable)
    val newStats = CatalogStatistics(sizeInBytes = newSize)
    catalog.alterTableStats(table.identifier, Some(newStats))
  } else if (table.stats.nonEmpty) {
    // Auto-update is off: the recorded stats are now stale, so drop them
    // rather than leave misleading numbers in the metastore.
    catalog.alterTableStats(table.identifier, None)
  }
}
|
||||
|
||||
def calculateTotalSize(spark: SparkSession, catalogTable: CatalogTable): BigInt = {
|
||||
val sessionState = spark.sessionState
|
||||
|
|
|
@ -337,6 +337,26 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
|
|||
}
|
||||
}
|
||||
|
||||
// Verifies that an INSERT triggers automatic size-stat collection only when
// spark.sql.statistics.size.autoUpdate.enabled is set; with it off, no stats
// should be recorded for a freshly created table.
test("auto gather stats after insert command") {
  val table = "change_stats_insert_datasource_table"
  for (autoUpdate <- Seq(false, true)) {
    withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> autoUpdate.toString) {
      withTable(table) {
        sql(s"CREATE TABLE $table (i int, j string) USING PARQUET")
        // insert into command
        sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'")
        val tableStats = getCatalogTable(table).stats
        if (autoUpdate) {
          // Auto-update on: stats must exist and carry a non-negative size.
          assert(tableStats.isDefined)
          assert(tableStats.get.sizeInBytes >= 0)
        } else {
          // Auto-update off: no stats should have been written.
          assert(tableStats.isEmpty)
        }
      }
    }
  }
}
|
||||
|
||||
test("invalidation of tableRelationCache after inserts") {
|
||||
val table = "invalidate_catalog_cache_table"
|
||||
Seq(false, true).foreach { autoUpdate =>
|
||||
|
|
Loading…
Reference in a new issue