[SPARK-34215][SQL] Keep tables cached after truncation

### What changes were proposed in this pull request?
Invoke `CatalogImpl.refreshTable()` instead of the combination of `SessionCatalog.refreshTable()` + `uncacheQuery()`. This clears the cached table data while keeping the table itself cached.
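
For context, a minimal sketch of the old and new call patterns (the identifiers `spark`, `tableName`, `table`, and `tableIdentWithDB` are taken from the surrounding `TruncateTableCommand` code shown in the diff below):
```scala
// Before: invalidate the relation cache, then drop the table from the
// columnar cache entirely, leaving the table uncached.
spark.sessionState.refreshTable(tableName.unquotedString)
spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true)

// After: a single call that both invalidates stale data and re-caches the
// table, so `spark.catalog.isCached` still returns true after TRUNCATE TABLE.
spark.catalog.refreshTable(tableIdentWithDB)
```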

### Why are the changes needed?
1. To improve user experience with Spark SQL
2. To be consistent with other commands, see https://github.com/apache/spark/pull/31206

### Does this PR introduce _any_ user-facing change?
Yes.

Before:
```scala
scala> sql("CREATE TABLE tbl (c0 int)")
res1: org.apache.spark.sql.DataFrame = []
scala> sql("INSERT INTO tbl SELECT 0")
res2: org.apache.spark.sql.DataFrame = []
scala> sql("CACHE TABLE tbl")
res3: org.apache.spark.sql.DataFrame = []
scala> sql("SELECT * FROM tbl").show(false)
+---+
|c0 |
+---+
|0  |
+---+
scala> spark.catalog.isCached("tbl")
res5: Boolean = true
scala> sql("TRUNCATE TABLE tbl")
res6: org.apache.spark.sql.DataFrame = []
scala> spark.catalog.isCached("tbl")
res7: Boolean = false
```

After:
```scala
scala> sql("TRUNCATE TABLE tbl")
res6: org.apache.spark.sql.DataFrame = []
scala> spark.catalog.isCached("tbl")
res7: Boolean = true
```

### How was this patch tested?
Added a new test to `CachedTableSuite` and ran the affected test suites:
```
$ build/sbt -Phive -Phive-thriftserver "test:testOnly *CachedTableSuite"
$ build/sbt -Phive -Phive-thriftserver "test:testOnly *CatalogedDDLSuite"
```

Closes #31308 from MaxGekk/truncate-table-cached.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
Commit ac8307d75c (parent dd88eff820), committed 2021-01-26 15:36:44 +00:00.
3 changed files with 17 additions and 10 deletions.

@@ -49,6 +49,7 @@ license: |
 * `MSCK REPAIR TABLE`
 * `LOAD DATA`
 * `REFRESH TABLE`
+* `TRUNCATE TABLE`
 * and the method `spark.catalog.refreshTable`
 In Spark 3.1 and earlier, table refreshing leaves dependents uncached.
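
To illustrate the migration note above, a hedged REPL sketch (the view name `v` is illustrative, and the cross-version behavior is inferred from the note, not from this diff):
```scala
// Cache a view that depends on `tbl`.
sql("CACHE TABLE v AS SELECT c0 FROM tbl")
sql("TRUNCATE TABLE tbl")
// Spark 3.2: the refresh keeps dependents cached, so this should return true.
// Spark 3.1 and earlier: the cascading uncache leaves `v` uncached (false).
spark.catalog.isCached("v")
```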

@@ -561,16 +561,9 @@ case class TruncateTableCommand(
         }
       }
     }
-    // After deleting the data, invalidate the table to make sure we don't keep around a stale
-    // file relation in the metastore cache.
-    spark.sessionState.refreshTable(tableName.unquotedString)
-    // Also try to drop the contents of the table from the columnar cache
-    try {
-      spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true)
-    } catch {
-      case NonFatal(e) =>
-        log.warn(s"Exception when attempting to uncache table $tableIdentWithDB", e)
-    }
+    // After deleting the data, refresh the table to make sure we don't keep around a stale
+    // file relation in the metastore cache and cached table data in the cache manager.
+    spark.catalog.refreshTable(tableIdentWithDB)
     if (table.stats.nonEmpty) {
       // empty table after truncation

@@ -501,4 +501,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleton
       }
     }
   }
+
+  test("SPARK-34215: keep table cached after truncation") {
+    withTable("tbl") {
+      sql("CREATE TABLE tbl (c0 int)")
+      sql("INSERT INTO tbl SELECT 0")
+      sql("CACHE TABLE tbl")
+      assert(spark.catalog.isCached("tbl"))
+      checkAnswer(sql("SELECT * FROM tbl"), Row(0))
+      sql("TRUNCATE TABLE tbl")
+      assert(spark.catalog.isCached("tbl"))
+      checkAnswer(sql("SELECT * FROM tbl"), Seq.empty)
+    }
+  }
 }