[SPARK-19463][SQL] Refresh cache after InsertIntoHadoopFsRelationCommand

## What changes were proposed in this pull request?

If a DataSource table is cached and data is then inserted into it, the cached data should be refreshed after the insert command completes.

## How was this patch tested?
Unit test added.

Author: windpiger <songjun@outlook.com>

Closes #16809 from windpiger/refreshCacheAfterInsert.
This commit is contained in:
windpiger 2017-02-28 11:59:18 -08:00 committed by Wenchen Fan
parent 9734a928a7
commit ce233f18e3
4 changed files with 14 additions and 25 deletions

View file

@ -147,7 +147,10 @@ case class InsertIntoHadoopFsRelationCommand(
refreshFunction = refreshPartitionsCallback,
options = options)
// refresh cached files in FileIndex
fileIndex.foreach(_.refresh())
// refresh data cache if table is cached
sparkSession.catalog.refreshByPath(outputPath.toString)
} else {
logInfo("Skipping insertion into a relation that already exists.")
}

View file

@ -77,8 +77,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
val df = spark.read.parquet(path).cache()
assert(df.count() == 1000)
spark.range(10).write.mode("overwrite").parquet(path)
assert(df.count() == 1000)
spark.catalog.refreshByPath(path)
assert(df.count() == 10)
assert(spark.read.parquet(path).count() == 10)
}
@ -91,8 +89,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
val df = spark.read.parquet(path).cache()
assert(df.count() == 1000)
spark.range(10).write.mode("append").parquet(path)
assert(df.count() == 1000)
spark.catalog.refreshByPath(path)
assert(df.count() == 1010)
assert(spark.read.parquet(path).count() == 1010)
}

View file

@ -281,15 +281,15 @@ class InsertSuite extends DataSourceTest with SharedSQLContext {
""".stripMargin)
// jsonTable should be recached.
assertCached(sql("SELECT * FROM jsonTable"))
// TODO we need to invalidate the cached data in InsertIntoHadoopFsRelation
// // The cached data is the new data.
// checkAnswer(
// sql("SELECT a, b FROM jsonTable"),
// sql("SELECT a * 2, b FROM jt").collect())
//
// // Verify uncaching
// spark.catalog.uncacheTable("jsonTable")
// assertCached(sql("SELECT * FROM jsonTable"), 0)
// The cached data is the new data.
checkAnswer(
sql("SELECT a, b FROM jsonTable"),
sql("SELECT a * 2, b FROM jt").collect())
// Verify uncaching
spark.catalog.uncacheTable("jsonTable")
assertCached(sql("SELECT * FROM jsonTable"), 0)
}
test("it's not allowed to insert into a relation that is not an InsertableRelation") {

View file

@ -204,13 +204,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
assertCached(table("refreshTable"))
// Append new data.
table("src").write.mode(SaveMode.Append).parquet(tempPath.toString)
// We are still using the old data.
assertCached(table("refreshTable"))
checkAnswer(
table("refreshTable"),
table("src").collect())
// Refresh the table.
sql("REFRESH TABLE refreshTable")
// We are using the new data.
assertCached(table("refreshTable"))
checkAnswer(
@ -249,13 +244,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
assertCached(table("refreshTable"))
// Append new data.
table("src").write.mode(SaveMode.Append).parquet(tempPath.toString)
// We are still using the old data.
assertCached(table("refreshTable"))
checkAnswer(
table("refreshTable"),
table("src").collect())
// Refresh the table.
sql(s"REFRESH ${tempPath.toString}")
// We are using the new data.
assertCached(table("refreshTable"))
checkAnswer(