[SPARK-34215][SQL] Keep tables cached after truncation

### What changes were proposed in this pull request?
Invoke `CatalogImpl.refreshTable()` instead of the combination of `SessionCatalog.refreshTable()` + `uncacheQuery()`. This clears the cached table data while keeping the table itself cached.
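
For context, a minimal sketch of the old and new call patterns (the identifiers `spark`, `tableName`, `table`, and `tableIdentWithDB` are taken from the surrounding `TruncateTableCommand` code shown in the diff below):
```scala
// Before: invalidate the relation cache, then drop the table from the
// columnar cache entirely, leaving the table uncached.
spark.sessionState.refreshTable(tableName.unquotedString)
spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true)

// After: a single call that both invalidates stale data and re-caches the
// table, so `spark.catalog.isCached` still returns true after TRUNCATE TABLE.
spark.catalog.refreshTable(tableIdentWithDB)
```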

### Why are the changes needed?
1. To improve user experience with Spark SQL
2. To be consistent with other commands, see https://github.com/apache/spark/pull/31206

### Does this PR introduce _any_ user-facing change?
Yes.

Before:
```scala
scala> sql("CREATE TABLE tbl (c0 int)")
res1: org.apache.spark.sql.DataFrame = []
scala> sql("INSERT INTO tbl SELECT 0")
res2: org.apache.spark.sql.DataFrame = []
scala> sql("CACHE TABLE tbl")
res3: org.apache.spark.sql.DataFrame = []
scala> sql("SELECT * FROM tbl").show(false)
+---+
|c0 |
+---+
|0  |
+---+
scala> spark.catalog.isCached("tbl")
res5: Boolean = true
scala> sql("TRUNCATE TABLE tbl")
res6: org.apache.spark.sql.DataFrame = []
scala> spark.catalog.isCached("tbl")
res7: Boolean = false
```

After:
```scala
scala> sql("TRUNCATE TABLE tbl")
res6: org.apache.spark.sql.DataFrame = []
scala> spark.catalog.isCached("tbl")
res7: Boolean = true
```

### How was this patch tested?
Added a new test to `CachedTableSuite` and ran the affected test suites:
```
$ build/sbt -Phive -Phive-thriftserver "test:testOnly *CachedTableSuite"
$ build/sbt -Phive -Phive-thriftserver "test:testOnly *CatalogedDDLSuite"
```

Closes #31308 from MaxGekk/truncate-table-cached.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
Commit ac8307d75c (parent dd88eff820), committed 2021-01-26 15:36:44 +00:00.
3 changed files with 17 additions and 10 deletions.

@@ -49,6 +49,7 @@ license: |
 * `MSCK REPAIR TABLE`
 * `LOAD DATA`
 * `REFRESH TABLE`
+* `TRUNCATE TABLE`
 * and the method `spark.catalog.refreshTable`
 In Spark 3.1 and earlier, table refreshing leaves dependents uncached.
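
To illustrate the migration note above, a hedged REPL sketch (the view name `v` is illustrative, and the cross-version behavior is inferred from the note, not from this diff):
```scala
// Cache a view that depends on `tbl`.
sql("CACHE TABLE v AS SELECT c0 FROM tbl")
sql("TRUNCATE TABLE tbl")
// Spark 3.2: the refresh keeps dependents cached, so this should return true.
// Spark 3.1 and earlier: the cascading uncache leaves `v` uncached (false).
spark.catalog.isCached("v")
```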

@@ -561,16 +561,9 @@ case class TruncateTableCommand(
         }
       }
     }
-    // After deleting the data, invalidate the table to make sure we don't keep around a stale
-    // file relation in the metastore cache.
-    spark.sessionState.refreshTable(tableName.unquotedString)
-    // Also try to drop the contents of the table from the columnar cache
-    try {
-      spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true)
-    } catch {
-      case NonFatal(e) =>
-        log.warn(s"Exception when attempting to uncache table $tableIdentWithDB", e)
-    }
+    // After deleting the data, refresh the table to make sure we don't keep around a stale
+    // file relation in the metastore cache and cached table data in the cache manager.
+    spark.catalog.refreshTable(tableIdentWithDB)
     if (table.stats.nonEmpty) {
       // empty table after truncation

@@ -501,4 +501,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleton
       }
     }
   }
+
+  test("SPARK-34215: keep table cached after truncation") {
+    withTable("tbl") {
+      sql("CREATE TABLE tbl (c0 int)")
+      sql("INSERT INTO tbl SELECT 0")
+      sql("CACHE TABLE tbl")
+      assert(spark.catalog.isCached("tbl"))
+      checkAnswer(sql("SELECT * FROM tbl"), Row(0))
+      sql("TRUNCATE TABLE tbl")
+      assert(spark.catalog.isCached("tbl"))
+      checkAnswer(sql("SELECT * FROM tbl"), Seq.empty)
+    }
+  }
 }