[SPARK-19463][SQL] refresh cache after the InsertIntoHadoopFsRelationCommand
## What changes were proposed in this pull request? If we first cache a DataSource table and then insert some data into it, we should refresh the cached data after the insert command completes. ## How was this patch tested? A unit test was added. Author: windpiger <songjun@outlook.com> Closes #16809 from windpiger/refreshCacheAfterInsert.
This commit is contained in:
parent
9734a928a7
commit
ce233f18e3
|
@ -147,7 +147,10 @@ case class InsertIntoHadoopFsRelationCommand(
|
|||
refreshFunction = refreshPartitionsCallback,
|
||||
options = options)
|
||||
|
||||
// refresh cached files in FileIndex
|
||||
fileIndex.foreach(_.refresh())
|
||||
// refresh data cache if table is cached
|
||||
sparkSession.catalog.refreshByPath(outputPath.toString)
|
||||
} else {
|
||||
logInfo("Skipping insertion into a relation that already exists.")
|
||||
}
|
||||
|
|
|
@ -77,8 +77,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
|
|||
val df = spark.read.parquet(path).cache()
|
||||
assert(df.count() == 1000)
|
||||
spark.range(10).write.mode("overwrite").parquet(path)
|
||||
assert(df.count() == 1000)
|
||||
spark.catalog.refreshByPath(path)
|
||||
assert(df.count() == 10)
|
||||
assert(spark.read.parquet(path).count() == 10)
|
||||
}
|
||||
|
@ -91,8 +89,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
|
|||
val df = spark.read.parquet(path).cache()
|
||||
assert(df.count() == 1000)
|
||||
spark.range(10).write.mode("append").parquet(path)
|
||||
assert(df.count() == 1000)
|
||||
spark.catalog.refreshByPath(path)
|
||||
assert(df.count() == 1010)
|
||||
assert(spark.read.parquet(path).count() == 1010)
|
||||
}
|
||||
|
|
|
@ -281,15 +281,15 @@ class InsertSuite extends DataSourceTest with SharedSQLContext {
|
|||
""".stripMargin)
|
||||
// jsonTable should be recached.
|
||||
assertCached(sql("SELECT * FROM jsonTable"))
|
||||
// TODO we need to invalidate the cached data in InsertIntoHadoopFsRelation
|
||||
// // The cached data is the new data.
|
||||
// checkAnswer(
|
||||
// sql("SELECT a, b FROM jsonTable"),
|
||||
// sql("SELECT a * 2, b FROM jt").collect())
|
||||
//
|
||||
// // Verify uncaching
|
||||
// spark.catalog.uncacheTable("jsonTable")
|
||||
// assertCached(sql("SELECT * FROM jsonTable"), 0)
|
||||
|
||||
// The cached data is the new data.
|
||||
checkAnswer(
|
||||
sql("SELECT a, b FROM jsonTable"),
|
||||
sql("SELECT a * 2, b FROM jt").collect())
|
||||
|
||||
// Verify uncaching
|
||||
spark.catalog.uncacheTable("jsonTable")
|
||||
assertCached(sql("SELECT * FROM jsonTable"), 0)
|
||||
}
|
||||
|
||||
test("it's not allowed to insert into a relation that is not an InsertableRelation") {
|
||||
|
|
|
@ -204,13 +204,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
|
|||
assertCached(table("refreshTable"))
|
||||
// Append new data.
|
||||
table("src").write.mode(SaveMode.Append).parquet(tempPath.toString)
|
||||
// We are still using the old data.
|
||||
assertCached(table("refreshTable"))
|
||||
checkAnswer(
|
||||
table("refreshTable"),
|
||||
table("src").collect())
|
||||
// Refresh the table.
|
||||
sql("REFRESH TABLE refreshTable")
|
||||
|
||||
// We are using the new data.
|
||||
assertCached(table("refreshTable"))
|
||||
checkAnswer(
|
||||
|
@ -249,13 +244,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
|
|||
assertCached(table("refreshTable"))
|
||||
// Append new data.
|
||||
table("src").write.mode(SaveMode.Append).parquet(tempPath.toString)
|
||||
// We are still using the old data.
|
||||
assertCached(table("refreshTable"))
|
||||
checkAnswer(
|
||||
table("refreshTable"),
|
||||
table("src").collect())
|
||||
// Refresh the table.
|
||||
sql(s"REFRESH ${tempPath.toString}")
|
||||
|
||||
// We are using the new data.
|
||||
assertCached(table("refreshTable"))
|
||||
checkAnswer(
|
||||
|
|
Loading…
Reference in a new issue