[SPARK-36197][SQL] Use PartitionDesc instead of TableDesc for reading hive partitioned tables
### What changes were proposed in this pull request? A hive partition can have different `PartitionDesc`s from `TableDesc` for describing Serde/InputFormatClass/OutputFormatClass, for a hive partitioned table, we shall respect those in `PartitionDesc`. ### Why are the changes needed? in many cases, that Spark reads hive tables could result in surprise because of this issue. ### Does this PR introduce _any_ user-facing change? yes, hive partition table that contains different serde/input/output could be recognized by Spark ### How was this patch tested? new test added Closes #33406 from yaooqinn/SPARK-36197. Authored-by: Kent Yao <yao@apache.org> Signed-off-by: Kent Yao <yao@apache.org>
This commit is contained in:
parent
8396a70ddc
commit
ef80356614
|
@ -26,7 +26,7 @@ import org.apache.hadoop.fs.{Path, PathFilter}
|
|||
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants._
|
||||
import org.apache.hadoop.hive.ql.exec.Utilities
|
||||
import org.apache.hadoop.hive.ql.metadata.{Partition => HivePartition, Table => HiveTable}
|
||||
import org.apache.hadoop.hive.ql.plan.TableDesc
|
||||
import org.apache.hadoop.hive.ql.plan.{PartitionDesc, TableDesc}
|
||||
import org.apache.hadoop.hive.serde2.Deserializer
|
||||
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.AvroTableProperties
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorConverters, StructObjectInspector}
|
||||
|
@ -201,7 +201,7 @@ class HadoopTableReader(
|
|||
|
||||
val hivePartitionRDDs = verifyPartitionPath(partitionToDeserializer)
|
||||
.map { case (partition, partDeserializer) =>
|
||||
val partDesc = Utilities.getPartitionDesc(partition)
|
||||
val partDesc = Utilities.getPartitionDescFromTableDesc(tableDesc, partition, true)
|
||||
val partPath = partition.getDataLocation
|
||||
val inputPathStr = applyFilterIfNeeded(partPath, filterOpt)
|
||||
// Get partition field info
|
||||
|
@ -245,7 +245,8 @@ class HadoopTableReader(
|
|||
|
||||
// Create local references so that the outer object isn't serialized.
|
||||
val localTableDesc = tableDesc
|
||||
createHadoopRDD(localTableDesc, inputPathStr).mapPartitions { iter =>
|
||||
|
||||
createHadoopRDD(partDesc, inputPathStr).mapPartitions { iter =>
|
||||
val hconf = broadcastedHiveConf.value.value
|
||||
val deserializer = localDeserializer.getConstructor().newInstance()
|
||||
// SPARK-13709: For SerDes like AvroSerDe, some essential information (e.g. Avro schema
|
||||
|
@ -314,6 +315,15 @@ class HadoopTableReader(
|
|||
}
|
||||
}
|
||||
|
||||
private def createHadoopRDD(partitionDesc: PartitionDesc, inputPathStr: String): RDD[Writable] = {
|
||||
val inputFormatClazz = partitionDesc.getInputFileFormatClass
|
||||
if (classOf[newInputClass[_, _]].isAssignableFrom(inputFormatClazz)) {
|
||||
createNewHadoopRDD(partitionDesc, inputPathStr)
|
||||
} else {
|
||||
createOldHadoopRDD(partitionDesc, inputPathStr)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HadoopRDD based on the broadcasted HiveConf and other job properties that will be
|
||||
* applied locally on each executor.
|
||||
|
@ -322,7 +332,24 @@ class HadoopTableReader(
|
|||
val initializeJobConfFunc = HadoopTableReader.initializeLocalJobConfFunc(path, tableDesc) _
|
||||
val inputFormatClass = tableDesc.getInputFileFormatClass
|
||||
.asInstanceOf[Class[oldInputClass[Writable, Writable]]]
|
||||
createOldHadoopRDD(inputFormatClass, initializeJobConfFunc)
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HadoopRDD based on the broadcasted HiveConf and other job properties that will be
|
||||
* applied locally on each executor.
|
||||
*/
|
||||
private def createOldHadoopRDD(partitionDesc: PartitionDesc, path: String): RDD[Writable] = {
|
||||
val initializeJobConfFunc =
|
||||
HadoopTableReader.initializeLocalJobConfFunc(path, partitionDesc.getTableDesc) _
|
||||
val inputFormatClass = partitionDesc.getInputFileFormatClass
|
||||
.asInstanceOf[Class[oldInputClass[Writable, Writable]]]
|
||||
createOldHadoopRDD(inputFormatClass, initializeJobConfFunc)
|
||||
}
|
||||
|
||||
private def createOldHadoopRDD(
|
||||
inputFormatClass: Class[oldInputClass[Writable, Writable]],
|
||||
initializeJobConfFunc: JobConf => Unit): RDD[Writable] = {
|
||||
val rdd = new HadoopRDD(
|
||||
sparkSession.sparkContext,
|
||||
_broadcastedHadoopConf.asInstanceOf[Broadcast[SerializableConfiguration]],
|
||||
|
@ -345,13 +372,26 @@ class HadoopTableReader(
|
|||
HadoopTableReader.initializeLocalJobConfFunc(path, tableDesc)(newJobConf)
|
||||
val inputFormatClass = tableDesc.getInputFileFormatClass
|
||||
.asInstanceOf[Class[newInputClass[Writable, Writable]]]
|
||||
createNewHadoopRDD(inputFormatClass, newJobConf)
|
||||
}
|
||||
|
||||
private def createNewHadoopRDD(partDesc: PartitionDesc, path: String): RDD[Writable] = {
|
||||
val newJobConf = new JobConf(hadoopConf)
|
||||
HadoopTableReader.initializeLocalJobConfFunc(path, partDesc.getTableDesc)(newJobConf)
|
||||
val inputFormatClass = partDesc.getInputFileFormatClass
|
||||
.asInstanceOf[Class[newInputClass[Writable, Writable]]]
|
||||
createNewHadoopRDD(inputFormatClass, newJobConf)
|
||||
}
|
||||
|
||||
private def createNewHadoopRDD(
|
||||
inputFormatClass: Class[newInputClass[Writable, Writable]],
|
||||
jobConf: JobConf): RDD[Writable] = {
|
||||
val rdd = new NewHadoopRDD(
|
||||
sparkSession.sparkContext,
|
||||
inputFormatClass,
|
||||
classOf[Writable],
|
||||
classOf[Writable],
|
||||
newJobConf
|
||||
jobConf
|
||||
)
|
||||
|
||||
// Only take the value (skip the key) because Hive works only with values.
|
||||
|
|
|
@ -2614,6 +2614,24 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("SPARK-36197: Use PartitionDesc instead of TableDesc for reading hive partitioned tables") {
|
||||
withTempDir { dir =>
|
||||
val t1Loc = s"file:///$dir/t1"
|
||||
val t2Loc = s"file:///$dir/t2"
|
||||
withTable("t1", "t2") {
|
||||
hiveClient.runSqlHive(
|
||||
s"create table t1(id int) partitioned by(pid int) stored as avro location '$t1Loc'")
|
||||
hiveClient.runSqlHive("insert into t1 partition(pid=1) select 2")
|
||||
hiveClient.runSqlHive(
|
||||
s"create table t2(id int) partitioned by(pid int) stored as textfile location '$t2Loc'")
|
||||
hiveClient.runSqlHive("insert into t2 partition(pid=2) select 2")
|
||||
hiveClient.runSqlHive(s"alter table t1 add partition (pid=2) location '$t2Loc/pid=2'")
|
||||
hiveClient.runSqlHive("alter table t1 partition(pid=2) SET FILEFORMAT textfile")
|
||||
checkAnswer(sql("select pid, id from t1 order by pid"), Seq(Row(1, 2), Row(2, 2)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SlowHiveTest
|
||||
|
|
Loading…
Reference in a new issue