[SPARK-25418][SQL] The metadata of DataSource table should not include Hive-generated storage properties.

## What changes were proposed in this pull request?

When Hive support is enabled, the Hive catalog puts extra storage properties (such as `serialization.format`) into the table metadata even for DataSource tables, but we should not include them.
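
As a concrete illustration of the symptom, here is a minimal sketch (not part of the patch; the table name `t` is made up) assuming a `SparkSession` built with `.enableHiveSupport()`:

```scala
// Minimal sketch, assuming a Hive-enabled SparkSession bound to `spark`;
// the table name `t` is made up for illustration.
spark.sql("CREATE TABLE t (id INT) USING parquet")

// Before this patch, the "Storage Properties" row could show the
// Hive-generated entry [serialization.format=1]; after it, the row is empty.
spark.sql("DESCRIBE FORMATTED t").show(numRows = 100, truncate = false)
```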

## How was this patch tested?

Modified an existing test in `HiveCatalogedDDLSuite` so that the expected storage properties no longer include the Hive-generated `serialization.format`.

Closes #22410 from ueshin/issues/SPARK-25418/hive_metadata.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: gatorsmile <gatorsmile@gmail.com>
parent 9deddbb13e
commit a81ef9e1f9
2 changed files with 7 additions and 2 deletions

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala

@@ -28,6 +28,7 @@ import scala.util.control.NonFatal
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.hive.ql.metadata.HiveException
+import org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT
 import org.apache.thrift.TException
 
 import org.apache.spark.{SparkConf, SparkException}
@@ -806,6 +807,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       updateLocationInStorageProps(table, newPath = None).copy(
         locationUri = tableLocation.map(CatalogUtils.stringToURI(_)))
     }
+    val storageWithoutHiveGeneratedProperties = storageWithLocation.copy(
+      properties = storageWithLocation.properties.filterKeys(!HIVE_GENERATED_STORAGE_PROPERTIES(_)))
     val partitionProvider = table.properties.get(TABLE_PARTITION_PROVIDER)
 
     val schemaFromTableProps = getSchemaFromTableProperties(table)
@@ -814,7 +817,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     table.copy(
       provider = Some(provider),
-      storage = storageWithLocation,
+      storage = storageWithoutHiveGeneratedProperties,
       schema = reorderedSchema,
       partitionColumnNames = partColumnNames,
       bucketSpec = getBucketSpecFromTableProperties(table),
@@ -1309,6 +1312,8 @@ object HiveExternalCatalog {
   val CREATED_SPARK_VERSION = SPARK_SQL_PREFIX + "create.version"
 
+  val HIVE_GENERATED_STORAGE_PROPERTIES = Set(SERIALIZATION_FORMAT)
+
   // When storing data source tables in hive metastore, we need to set data schema to empty if the
   // schema is hive-incompatible. However we need a hack to preserve existing behavior. Before
   // Spark 2.0, we do not set a default serde here (this was done in Hive), and so if the user
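
To see the mechanics of the new filter in isolation, here is a plain-Scala sketch (the sample map and its keys are made up for illustration):

```scala
// Mirrors the filterKeys expression added in restoreDataSourceTable.
val HIVE_GENERATED_STORAGE_PROPERTIES = Set("serialization.format")

val props = Map(
  "serialization.format" -> "1",  // injected by the Hive metastore
  "someKey" -> "someValue")       // a property Spark should keep

// A Set[String] is also a String => Boolean, so negating its apply method
// keeps exactly the keys that are not Hive-generated.
val filtered = props.filterKeys(!HIVE_GENERATED_STORAGE_PROPERTIES(_))
// filtered now contains only "someKey" -> "someValue".
// (On Scala 2.13+, prefer props.view.filterKeys(...).toMap.)
```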

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

@@ -72,7 +72,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA
         outputFormat = serde.get.outputFormat,
         serde = serde.get.serde,
         compressed = false,
-        properties = Map("serialization.format" -> "1"))
+        properties = Map.empty)
     } else {
       CatalogStorageFormat(
         locationUri = Some(catalog.defaultTablePath(name)),
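
As a rough sketch of what the adjusted expectation pins down (the table name `t2` is made up, and `sessionState` is Spark-internal API that is public but marked unstable):

```scala
// Sketch only: a DataSource table restored through the Hive catalog should
// expose no Hive-generated storage properties after this patch.
import org.apache.spark.sql.catalyst.TableIdentifier

spark.sql("CREATE TABLE t2 (id INT) USING parquet")
val props = spark.sessionState.catalog
  .getTableMetadata(TableIdentifier("t2"))
  .storage.properties
assert(!props.contains("serialization.format"))
```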