From ac8307d75ca7c9e920b1170fb65755bbb85577e4 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Tue, 26 Jan 2021 15:36:44 +0000 Subject: [PATCH] [SPARK-34215][SQL] Keep tables cached after truncation ### What changes were proposed in this pull request? Invoke `CatalogImpl.refreshTable()` instead of the combination of `SessionCatalog.refreshTable()` + `uncacheQuery()`. This allows clearing cached table data while keeping the table cached. ### Why are the changes needed? 1. To improve user experience with Spark SQL 2. To be consistent with other commands, see https://github.com/apache/spark/pull/31206 ### Does this PR introduce _any_ user-facing change? Yes. Before: ```scala scala> sql("CREATE TABLE tbl (c0 int)") res1: org.apache.spark.sql.DataFrame = [] scala> sql("INSERT INTO tbl SELECT 0") res2: org.apache.spark.sql.DataFrame = [] scala> sql("CACHE TABLE tbl") res3: org.apache.spark.sql.DataFrame = [] scala> sql("SELECT * FROM tbl").show(false) +---+ |c0 | +---+ |0 | +---+ scala> spark.catalog.isCached("tbl") res5: Boolean = true scala> sql("TRUNCATE TABLE tbl") res6: org.apache.spark.sql.DataFrame = [] scala> spark.catalog.isCached("tbl") res7: Boolean = false ``` After: ```scala scala> sql("TRUNCATE TABLE tbl") res6: org.apache.spark.sql.DataFrame = [] scala> spark.catalog.isCached("tbl") res7: Boolean = true ``` ### How was this patch tested? Added a new test to `CachedTableSuite`: ``` $ build/sbt -Phive -Phive-thriftserver "test:testOnly *CachedTableSuite" $ build/sbt -Phive -Phive-thriftserver "test:testOnly *CatalogedDDLSuite" ``` Closes #31308 from MaxGekk/truncate-table-cached. 
Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- docs/sql-migration-guide.md | 1 + .../apache/spark/sql/execution/command/tables.scala | 13 +++---------- .../apache/spark/sql/hive/CachedTableSuite.scala | 13 +++++++++++++ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 55ff8c40c4..da092488c9 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -49,6 +49,7 @@ license: | * `MSCK REPAIR TABLE` * `LOAD DATA` * `REFRESH TABLE` + * `TRUNCATE TABLE` * and the method `spark.catalog.refreshTable` In Spark 3.1 and earlier, table refreshing leaves dependents uncached. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 0d94894645..4979b2d56c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -561,16 +561,9 @@ case class TruncateTableCommand( } } } - // After deleting the data, invalidate the table to make sure we don't keep around a stale - // file relation in the metastore cache. - spark.sessionState.refreshTable(tableName.unquotedString) - // Also try to drop the contents of the table from the columnar cache - try { - spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true) - } catch { - case NonFatal(e) => - log.warn(s"Exception when attempting to uncache table $tableIdentWithDB", e) - } + // After deleting the data, refresh the table to make sure we don't keep around a stale + // file relation in the metastore cache and cached table data in the cache manager. 
+ spark.catalog.refreshTable(tableIdentWithDB) if (table.stats.nonEmpty) { // empty table after truncation diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index d2c1759a41..765cc1807d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -501,4 +501,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } } + + test("SPARK-34215: keep table cached after truncation") { + withTable("tbl") { + sql("CREATE TABLE tbl (c0 int)") + sql("INSERT INTO tbl SELECT 0") + sql("CACHE TABLE tbl") + assert(spark.catalog.isCached("tbl")) + checkAnswer(sql("SELECT * FROM tbl"), Row(0)) + sql("TRUNCATE TABLE tbl") + assert(spark.catalog.isCached("tbl")) + checkAnswer(sql("SELECT * FROM tbl"), Seq.empty) + } + } }