[SPARK-25418][SQL] The metadata of DataSource table should not include Hive-generated storage properties.
## What changes were proposed in this pull request?

When Hive support is enabled, the Hive catalog puts extra storage properties into table metadata even for DataSource tables, but we should not include them.

## How was this patch tested?

Modified an existing test.

Closes #22410 from ueshin/issues/SPARK-25418/hive_metadata.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: gatorsmile <gatorsmile@gmail.com>
This commit is contained in the following branches/tags:
parent
9deddbb13e
commit
a81ef9e1f9
|
@ -28,6 +28,7 @@ import scala.util.control.NonFatal
|
|||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException
|
||||
import org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT
|
||||
import org.apache.thrift.TException
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkException}
|
||||
|
@ -806,6 +807,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
|
|||
updateLocationInStorageProps(table, newPath = None).copy(
|
||||
locationUri = tableLocation.map(CatalogUtils.stringToURI(_)))
|
||||
}
|
||||
val storageWithoutHiveGeneratedProperties = storageWithLocation.copy(
|
||||
properties = storageWithLocation.properties.filterKeys(!HIVE_GENERATED_STORAGE_PROPERTIES(_)))
|
||||
val partitionProvider = table.properties.get(TABLE_PARTITION_PROVIDER)
|
||||
|
||||
val schemaFromTableProps = getSchemaFromTableProperties(table)
|
||||
|
@ -814,7 +817,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
|
|||
|
||||
table.copy(
|
||||
provider = Some(provider),
|
||||
storage = storageWithLocation,
|
||||
storage = storageWithoutHiveGeneratedProperties,
|
||||
schema = reorderedSchema,
|
||||
partitionColumnNames = partColumnNames,
|
||||
bucketSpec = getBucketSpecFromTableProperties(table),
|
||||
|
@ -1309,6 +1312,8 @@ object HiveExternalCatalog {
|
|||
|
||||
val CREATED_SPARK_VERSION = SPARK_SQL_PREFIX + "create.version"
|
||||
|
||||
val HIVE_GENERATED_STORAGE_PROPERTIES = Set(SERIALIZATION_FORMAT)
|
||||
|
||||
// When storing data source tables in hive metastore, we need to set data schema to empty if the
|
||||
// schema is hive-incompatible. However we need a hack to preserve existing behavior. Before
|
||||
// Spark 2.0, we do not set a default serde here (this was done in Hive), and so if the user
|
||||
|
|
|
@ -72,7 +72,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA
|
|||
outputFormat = serde.get.outputFormat,
|
||||
serde = serde.get.serde,
|
||||
compressed = false,
|
||||
properties = Map("serialization.format" -> "1"))
|
||||
properties = Map.empty)
|
||||
} else {
|
||||
CatalogStorageFormat(
|
||||
locationUri = Some(catalog.defaultTablePath(name)),
|
||||
|
|
Loading…
Reference in a new issue