[SPARK-21150][SQL] Persistent view stored in Hive metastore should be case preserving
## What changes were proposed in this pull request?

This is a regression in Spark 2.2. In Spark 2.2 we introduced a new way to resolve persisted views (https://issues.apache.org/jira/browse/SPARK-18209), but it made persisted views non-case-preserving, because the schema was stored directly in the Hive metastore, which lower-cases column names. We should follow data source tables and store the schema in table properties instead.

## How was this patch tested?

New regression test.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #18360 from cloud-fan/view.
parent: ef1622899f
commit: e862dc9049
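For context, the user-visible symptom (a minimal spark-shell sketch, not taken from the patch itself):

```scala
// Sketch of the regression and the intended behavior:
spark.sql("CREATE VIEW v AS SELECT 1 AS aBc")
spark.table("v").schema.head.name
// Spark 2.2 before this patch: "abc"  (case lost in the Hive metastore)
// with this patch:             "aBc"  (schema restored from table properties)
```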
@@ -159,7 +159,9 @@ case class CreateViewCommand(
         checkCyclicViewReference(analyzedPlan, Seq(viewIdent), viewIdent)
 
         // Handles `CREATE OR REPLACE VIEW v0 AS SELECT ...`
-        catalog.alterTable(prepareTable(sparkSession, analyzedPlan))
+        // Nothing we need to retain from the old view, so just drop and create a new one
+        catalog.dropTable(viewIdent, ignoreIfNotExists = false, purge = false)
+        catalog.createTable(prepareTable(sparkSession, analyzedPlan), ignoreIfExists = false)
       } else {
         // Handles `CREATE VIEW v0 AS SELECT ...`. Throws exception when the target view already
         // exists.
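A note on this hunk: since `prepareTable` regenerates all of the view's metadata from the freshly analyzed plan, nothing from the old view needs to survive a REPLACE, so drop-and-recreate can stand in for `alterTable`. In user-visible terms, `CREATE OR REPLACE VIEW` now behaves like this two-step sequence (illustrative sketch; the view name and query are placeholders):

```scala
// Observably equivalent sequence for typical use: the old definition is dropped
// entirely and the view is re-created from the new query, not altered in place.
spark.sql("DROP VIEW IF EXISTS v0")
spark.sql("CREATE VIEW v0 AS SELECT 1 AS aBc")  // column case is now preserved
```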
@@ -669,4 +669,14 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils {
         "positive."))
     }
   }
+
+  test("permanent view should be case-preserving") {
+    withView("v") {
+      sql("CREATE VIEW v AS SELECT 1 as aBc")
+      assert(spark.table("v").schema.head.name == "aBc")
+
+      sql("CREATE OR REPLACE VIEW v AS SELECT 2 as cBa")
+      assert(spark.table("v").schema.head.name == "cBa")
+    }
+  }
 }
@@ -224,39 +224,36 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
       throw new TableAlreadyExistsException(db = db, table = table)
     }
 
-    if (tableDefinition.tableType == VIEW) {
-      client.createTable(tableDefinition, ignoreIfExists)
-    } else {
-      // Ideally we should not create a managed table with location, but Hive serde table can
-      // specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have
-      // to create the table directory and write out data before we create this table, to avoid
-      // exposing a partial written table.
-      val needDefaultTableLocation = tableDefinition.tableType == MANAGED &&
-        tableDefinition.storage.locationUri.isEmpty
-
-      val tableLocation = if (needDefaultTableLocation) {
-        Some(CatalogUtils.stringToURI(defaultTablePath(tableDefinition.identifier)))
-      } else {
-        tableDefinition.storage.locationUri
-      }
-
-      if (DDLUtils.isHiveTable(tableDefinition)) {
-        val tableWithDataSourceProps = tableDefinition.copy(
-          // We can't leave `locationUri` empty and count on Hive metastore to set a default table
-          // location, because Hive metastore uses hive.metastore.warehouse.dir to generate default
-          // table location for tables in default database, while we expect to use the location of
-          // default database.
-          storage = tableDefinition.storage.copy(locationUri = tableLocation),
-          // Here we follow data source tables and put table metadata like table schema, partition
-          // columns etc. in table properties, so that we can work around the Hive metastore issue
-          // about not case preserving and make Hive serde table support mixed-case column names.
-          properties = tableDefinition.properties ++ tableMetaToTableProps(tableDefinition))
-        client.createTable(tableWithDataSourceProps, ignoreIfExists)
-      } else {
-        createDataSourceTable(
-          tableDefinition.withNewStorage(locationUri = tableLocation),
-          ignoreIfExists)
-      }
+    // Ideally we should not create a managed table with location, but Hive serde table can
+    // specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have
+    // to create the table directory and write out data before we create this table, to avoid
+    // exposing a partial written table.
+    val needDefaultTableLocation = tableDefinition.tableType == MANAGED &&
+      tableDefinition.storage.locationUri.isEmpty
+
+    val tableLocation = if (needDefaultTableLocation) {
+      Some(CatalogUtils.stringToURI(defaultTablePath(tableDefinition.identifier)))
+    } else {
+      tableDefinition.storage.locationUri
+    }
+
+    if (DDLUtils.isDatasourceTable(tableDefinition)) {
+      createDataSourceTable(
+        tableDefinition.withNewStorage(locationUri = tableLocation),
+        ignoreIfExists)
+    } else {
+      val tableWithDataSourceProps = tableDefinition.copy(
+        // We can't leave `locationUri` empty and count on Hive metastore to set a default table
+        // location, because Hive metastore uses hive.metastore.warehouse.dir to generate default
+        // table location for tables in default database, while we expect to use the location of
+        // default database.
+        storage = tableDefinition.storage.copy(locationUri = tableLocation),
+        // Here we follow data source tables and put table metadata like table schema, partition
+        // columns etc. in table properties, so that we can work around the Hive metastore issue
+        // about not case preserving and make Hive serde table and view support mixed-case column
+        // names.
+        properties = tableDefinition.properties ++ tableMetaToTableProps(tableDefinition))
+      client.createTable(tableWithDataSourceProps, ignoreIfExists)
     }
   }
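The key idea in this hunk is `tableMetaToTableProps`: the view's schema is serialized into table properties, which the Hive metastore stores verbatim (unlike column names, which it lower-cases). A minimal sketch of the encoding convention, assuming Spark's `spark.sql.sources.schema.numParts` / `spark.sql.sources.schema.part.N` key names; the split threshold is an assumption for illustration, not the actual Spark constant:

```scala
import org.apache.spark.sql.types.StructType

// Sketch, not the actual Spark implementation: serialize the schema to its JSON
// form (which preserves column-name case) and split it across several properties
// so each value stays under the metastore's limit on property-value length.
def schemaToProps(schema: StructType, threshold: Int = 4000): Map[String, String] = {
  val parts = schema.json.grouped(threshold).toSeq
  Map("spark.sql.sources.schema.numParts" -> parts.size.toString) ++
    parts.zipWithIndex.map { case (part, i) =>
      s"spark.sql.sources.schema.part.$i" -> part
    }
}
```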
@@ -679,16 +676,21 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configuration)
 
     var table = inputTable
 
-    if (table.tableType != VIEW) {
-      table.properties.get(DATASOURCE_PROVIDER) match {
-        // No provider in table properties, which means this is a Hive serde table.
-        case None =>
-          table = restoreHiveSerdeTable(table)
-
-        // This is a regular data source table.
-        case Some(provider) =>
-          table = restoreDataSourceTable(table, provider)
-      }
+    table.properties.get(DATASOURCE_PROVIDER) match {
+      case None if table.tableType == VIEW =>
+        // If this is a view created by Spark 2.2 or higher versions, we should restore its schema
+        // from table properties.
+        if (table.properties.contains(DATASOURCE_SCHEMA_NUMPARTS)) {
+          table = table.copy(schema = getSchemaFromTableProperties(table))
+        }
+
+      // No provider in table properties, which means this is a Hive serde table.
+      case None =>
+        table = restoreHiveSerdeTable(table)
+
+      // This is a regular data source table.
+      case Some(provider) =>
+        table = restoreDataSourceTable(table, provider)
     }
 
     // Restore Spark's statistics from information in Metastore.
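Conversely, `getSchemaFromTableProperties` (guarded here by the `DATASOURCE_SCHEMA_NUMPARTS` check, so views written by older Spark versions are left untouched) reassembles the case-preserving schema on read. A simplified sketch under the same assumed key names, with no error handling:

```scala
import org.apache.spark.sql.types.{DataType, StructType}

// Sketch: rebuild the schema by concatenating the split JSON parts written at
// create time and parsing the result back into a StructType.
def schemaFromProps(props: Map[String, String]): Option[StructType] =
  props.get("spark.sql.sources.schema.numParts").map { n =>
    val json = (0 until n.toInt)
      .map(i => props(s"spark.sql.sources.schema.part.$i"))
      .mkString
    DataType.fromJson(json).asInstanceOf[StructType]
  }
```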