diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index d4df35c8ec..03874d005a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -363,11 +363,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
         (None, message)
 
       // our bucketing is un-compatible with hive(different hash function)
-      case _ if table.bucketSpec.nonEmpty =>
+      case Some(serde) if table.bucketSpec.nonEmpty =>
         val message =
           s"Persisting bucketed data source table $qualifiedTableName into " +
-            "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. "
-        (None, message)
+            "Hive metastore in Spark SQL specific format, which is NOT compatible with " +
+            "Hive bucketed table. But Hive can read this table as a non-bucketed table."
+        (Some(newHiveCompatibleMetastoreTable(serde)), message)
 
       case Some(serde) =>
         val message =
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index deb0a10857..007694543d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTableType
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias
 import org.apache.spark.sql.hive.test.TestHiveSingleton
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils}
 import org.apache.spark.sql.types._
 
@@ -284,4 +284,40 @@ class DataSourceWithHiveMetastoreCatalogSuite
       }
     }
   }
+
+  test("SPARK-27592 set the bucketed data source table SerDe correctly") {
+    val provider = "parquet"
+    withTable("t") {
+      spark.sql(
+        s"""
+          |CREATE TABLE t
+          |USING $provider
+          |CLUSTERED BY (c1)
+          |SORTED BY (c1)
+          |INTO 2 BUCKETS
+          |AS SELECT 1 AS c1, 2 AS c2
+        """.stripMargin)
+
+      val metadata = sessionState.catalog.getTableMetadata(TableIdentifier("t", Some("default")))
+
+      val hiveSerDe = HiveSerDe.sourceToSerDe(provider).get
+      assert(metadata.storage.serde === hiveSerDe.serde)
+      assert(metadata.storage.inputFormat === hiveSerDe.inputFormat)
+      assert(metadata.storage.outputFormat === hiveSerDe.outputFormat)
+
+      // It's a bucketed table at Spark side
+      assert(sql("DESC FORMATTED t").collect().containsSlice(
+        Seq(Row("Num Buckets", "2", ""), Row("Bucket Columns", "[`c1`]", ""))
+      ))
+      checkAnswer(table("t"), Row(1, 2))
+
+      // It's not a bucketed table at Hive side
+      val hiveSide = sparkSession.metadataHive.runSqlHive("DESC FORMATTED t")
+      assert(hiveSide.contains("Num Buckets: \t-1 \t "))
+      assert(hiveSide.contains("Bucket Columns: \t[] \t "))
+      assert(hiveSide.contains("\tspark.sql.sources.schema.numBuckets\t2 "))
+      assert(hiveSide.contains("\tspark.sql.sources.schema.bucketCol.0\tc1 "))
+      assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1\t2"))
+    }
+  }
 }
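For reviewers who want to exercise the changed code path outside the test harness, the new Hive-compatible metadata can be inspected from any Spark build with Hive support. A minimal sketch, assuming a local Hive metastore; the `Spark27592Demo` driver name and local-mode setup are illustrative and not part of this patch:

```scala
import org.apache.spark.sql.SparkSession

object Spark27592Demo {
  def main(args: Array[String]): Unit = {
    // Hypothetical standalone driver; enableHiveSupport() assumes a Hive
    // metastore is reachable (an embedded Derby one is created locally).
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("SPARK-27592-demo")
      .enableHiveSupport()
      .getOrCreate()

    // Create a bucketed Parquet data source table via CTAS, the same DDL the
    // new test uses.
    spark.sql(
      """CREATE TABLE t
        |USING parquet
        |CLUSTERED BY (c1) SORTED BY (c1) INTO 2 BUCKETS
        |AS SELECT 1 AS c1, 2 AS c2""".stripMargin)

    // With this patch the metastore entry carries the Parquet SerDe and
    // input/output formats, so Hive reads `t` as a plain non-bucketed Parquet
    // table, while Spark still reports Num Buckets = 2 here.
    spark.sql("DESC FORMATTED t").show(100, truncate = false)

    spark.stop()
  }
}
```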