[SPARK-36673][SQL] Fix incorrect schema of nested types of union
### What changes were proposed in this pull request?
This patch proposes to fix incorrect schema of `union`.
### Why are the changes needed?
The current `union` result of nested struct columns is incorrect. By definition of the `union` API, it should resolve columns by position, not by name. Right now, when determining the `output` (i.e., the schema) of a union plan, we use the `merge` API, which actually merges two structs (think of it as concatenating the fields of the two structs where they do not overlap). This merging behavior doesn't match the `union` definition.
So currently we get incorrect schema but the query result is correct. We should fix the incorrect schema.
### Does this PR introduce _any_ user-facing change?
Yes, fixing a bug of incorrect schema.
### How was this patch tested?
Added unit test.
Closes #34025 from viirya/SPARK-36673.
Authored-by: Liang-Chi Hsieh <viirya@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
(cherry picked from commit cdd7ae937d)
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
af7dd18a5e
commit
895218996a
|
@ -307,7 +307,7 @@ case class Union(
|
||||||
children.map(_.output).transpose.map { attrs =>
|
children.map(_.output).transpose.map { attrs =>
|
||||||
val firstAttr = attrs.head
|
val firstAttr = attrs.head
|
||||||
val nullable = attrs.exists(_.nullable)
|
val nullable = attrs.exists(_.nullable)
|
||||||
val newDt = attrs.map(_.dataType).reduce(StructType.merge)
|
val newDt = attrs.map(_.dataType).reduce(StructType.unionLikeMerge)
|
||||||
if (firstAttr.dataType == newDt) {
|
if (firstAttr.dataType == newDt) {
|
||||||
firstAttr.withNullability(nullable)
|
firstAttr.withNullability(nullable)
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -559,52 +559,81 @@ object StructType extends AbstractDataType {
|
||||||
case _ => dt
|
case _ => dt
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This leverages `merge` to merge data types for UNION operator by specializing
|
||||||
|
* the handling of struct types to follow UNION semantics.
|
||||||
|
*/
|
||||||
|
private[sql] def unionLikeMerge(left: DataType, right: DataType): DataType =
|
||||||
|
mergeInternal(left, right, (s1: StructType, s2: StructType) => {
|
||||||
|
val leftFields = s1.fields
|
||||||
|
val rightFields = s2.fields
|
||||||
|
require(leftFields.size == rightFields.size, "To merge nullability, " +
|
||||||
|
"two structs must have same number of fields.")
|
||||||
|
|
||||||
|
val newFields = leftFields.zip(rightFields).map {
|
||||||
|
case (leftField, rightField) =>
|
||||||
|
leftField.copy(
|
||||||
|
dataType = unionLikeMerge(leftField.dataType, rightField.dataType),
|
||||||
|
nullable = leftField.nullable || rightField.nullable)
|
||||||
|
}.toSeq
|
||||||
|
StructType(newFields)
|
||||||
|
})
|
||||||
|
|
||||||
private[sql] def merge(left: DataType, right: DataType): DataType =
|
private[sql] def merge(left: DataType, right: DataType): DataType =
|
||||||
|
mergeInternal(left, right, (s1: StructType, s2: StructType) => {
|
||||||
|
val leftFields = s1.fields
|
||||||
|
val rightFields = s2.fields
|
||||||
|
val newFields = mutable.ArrayBuffer.empty[StructField]
|
||||||
|
|
||||||
|
val rightMapped = fieldsMap(rightFields)
|
||||||
|
leftFields.foreach {
|
||||||
|
case leftField @ StructField(leftName, leftType, leftNullable, _) =>
|
||||||
|
rightMapped.get(leftName)
|
||||||
|
.map { case rightField @ StructField(rightName, rightType, rightNullable, _) =>
|
||||||
|
try {
|
||||||
|
leftField.copy(
|
||||||
|
dataType = merge(leftType, rightType),
|
||||||
|
nullable = leftNullable || rightNullable)
|
||||||
|
} catch {
|
||||||
|
case NonFatal(e) =>
|
||||||
|
throw QueryExecutionErrors.failedMergingFieldsError(leftName, rightName, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.orElse {
|
||||||
|
Some(leftField)
|
||||||
|
}
|
||||||
|
.foreach(newFields += _)
|
||||||
|
}
|
||||||
|
|
||||||
|
val leftMapped = fieldsMap(leftFields)
|
||||||
|
rightFields
|
||||||
|
.filterNot(f => leftMapped.get(f.name).nonEmpty)
|
||||||
|
.foreach { f =>
|
||||||
|
newFields += f
|
||||||
|
}
|
||||||
|
|
||||||
|
StructType(newFields.toSeq)
|
||||||
|
})
|
||||||
|
|
||||||
|
private def mergeInternal(
|
||||||
|
left: DataType,
|
||||||
|
right: DataType,
|
||||||
|
mergeStruct: (StructType, StructType) => StructType): DataType =
|
||||||
(left, right) match {
|
(left, right) match {
|
||||||
case (ArrayType(leftElementType, leftContainsNull),
|
case (ArrayType(leftElementType, leftContainsNull),
|
||||||
ArrayType(rightElementType, rightContainsNull)) =>
|
ArrayType(rightElementType, rightContainsNull)) =>
|
||||||
ArrayType(
|
ArrayType(
|
||||||
merge(leftElementType, rightElementType),
|
mergeInternal(leftElementType, rightElementType, mergeStruct),
|
||||||
leftContainsNull || rightContainsNull)
|
leftContainsNull || rightContainsNull)
|
||||||
|
|
||||||
case (MapType(leftKeyType, leftValueType, leftContainsNull),
|
case (MapType(leftKeyType, leftValueType, leftContainsNull),
|
||||||
MapType(rightKeyType, rightValueType, rightContainsNull)) =>
|
MapType(rightKeyType, rightValueType, rightContainsNull)) =>
|
||||||
MapType(
|
MapType(
|
||||||
merge(leftKeyType, rightKeyType),
|
mergeInternal(leftKeyType, rightKeyType, mergeStruct),
|
||||||
merge(leftValueType, rightValueType),
|
mergeInternal(leftValueType, rightValueType, mergeStruct),
|
||||||
leftContainsNull || rightContainsNull)
|
leftContainsNull || rightContainsNull)
|
||||||
|
|
||||||
case (StructType(leftFields), StructType(rightFields)) =>
|
case (s1: StructType, s2: StructType) => mergeStruct(s1, s2)
|
||||||
val newFields = mutable.ArrayBuffer.empty[StructField]
|
|
||||||
|
|
||||||
val rightMapped = fieldsMap(rightFields)
|
|
||||||
leftFields.foreach {
|
|
||||||
case leftField @ StructField(leftName, leftType, leftNullable, _) =>
|
|
||||||
rightMapped.get(leftName)
|
|
||||||
.map { case rightField @ StructField(rightName, rightType, rightNullable, _) =>
|
|
||||||
try {
|
|
||||||
leftField.copy(
|
|
||||||
dataType = merge(leftType, rightType),
|
|
||||||
nullable = leftNullable || rightNullable)
|
|
||||||
} catch {
|
|
||||||
case NonFatal(e) =>
|
|
||||||
throw QueryExecutionErrors.failedMergingFieldsError(leftName, rightName, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.orElse {
|
|
||||||
Some(leftField)
|
|
||||||
}
|
|
||||||
.foreach(newFields += _)
|
|
||||||
}
|
|
||||||
|
|
||||||
val leftMapped = fieldsMap(leftFields)
|
|
||||||
rightFields
|
|
||||||
.filterNot(f => leftMapped.get(f.name).nonEmpty)
|
|
||||||
.foreach { f =>
|
|
||||||
newFields += f
|
|
||||||
}
|
|
||||||
|
|
||||||
StructType(newFields.toSeq)
|
|
||||||
|
|
||||||
case (DecimalType.Fixed(leftPrecision, leftScale),
|
case (DecimalType.Fixed(leftPrecision, leftScale),
|
||||||
DecimalType.Fixed(rightPrecision, rightScale)) =>
|
DecimalType.Fixed(rightPrecision, rightScale)) =>
|
||||||
|
|
|
@ -684,7 +684,7 @@ case class UnionExec(children: Seq[SparkPlan]) extends SparkPlan {
|
||||||
children.map(_.output).transpose.map { attrs =>
|
children.map(_.output).transpose.map { attrs =>
|
||||||
val firstAttr = attrs.head
|
val firstAttr = attrs.head
|
||||||
val nullable = attrs.exists(_.nullable)
|
val nullable = attrs.exists(_.nullable)
|
||||||
val newDt = attrs.map(_.dataType).reduce(StructType.merge)
|
val newDt = attrs.map(_.dataType).reduce(StructType.unionLikeMerge)
|
||||||
if (firstAttr.dataType == newDt) {
|
if (firstAttr.dataType == newDt) {
|
||||||
firstAttr.withNullability(nullable)
|
firstAttr.withNullability(nullable)
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -1018,6 +1018,64 @@ class DataFrameSetOperationsSuite extends QueryTest with SharedSparkSession {
|
||||||
unionDF = df1.unionByName(df2)
|
unionDF = df1.unionByName(df2)
|
||||||
checkAnswer(unionDF, expected)
|
checkAnswer(unionDF, expected)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("SPARK-36673: Only merge nullability for Unions of struct") {
|
||||||
|
val df1 = spark.range(2).withColumn("nested", struct(expr("id * 5 AS INNER")))
|
||||||
|
val df2 = spark.range(2).withColumn("nested", struct(expr("id * 5 AS inner")))
|
||||||
|
|
||||||
|
val union1 = df1.union(df2)
|
||||||
|
val union2 = df1.unionByName(df2)
|
||||||
|
|
||||||
|
val schema = StructType(Seq(StructField("id", LongType, false),
|
||||||
|
StructField("nested", StructType(Seq(StructField("INNER", LongType, false))), false)))
|
||||||
|
|
||||||
|
Seq(union1, union2).foreach { df =>
|
||||||
|
assert(df.schema == schema)
|
||||||
|
assert(df.queryExecution.optimizedPlan.schema == schema)
|
||||||
|
assert(df.queryExecution.executedPlan.schema == schema)
|
||||||
|
|
||||||
|
checkAnswer(df, Row(0, Row(0)) :: Row(1, Row(5)) :: Row(0, Row(0)) :: Row(1, Row(5)) :: Nil)
|
||||||
|
checkAnswer(df.select("nested.*"), Row(0) :: Row(5) :: Row(0) :: Row(5) :: Nil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("SPARK-36673: Only merge nullability for unionByName of struct") {
|
||||||
|
val df1 = spark.range(2).withColumn("nested", struct(expr("id * 5 AS INNER")))
|
||||||
|
val df2 = spark.range(2).withColumn("nested", struct(expr("id * 5 AS inner")))
|
||||||
|
|
||||||
|
val df = df1.unionByName(df2)
|
||||||
|
|
||||||
|
val schema = StructType(Seq(StructField("id", LongType, false),
|
||||||
|
StructField("nested", StructType(Seq(StructField("INNER", LongType, false))), false)))
|
||||||
|
|
||||||
|
assert(df.schema == schema)
|
||||||
|
assert(df.queryExecution.optimizedPlan.schema == schema)
|
||||||
|
assert(df.queryExecution.executedPlan.schema == schema)
|
||||||
|
|
||||||
|
checkAnswer(df, Row(0, Row(0)) :: Row(1, Row(5)) :: Row(0, Row(0)) :: Row(1, Row(5)) :: Nil)
|
||||||
|
checkAnswer(df.select("nested.*"), Row(0) :: Row(5) :: Row(0) :: Row(5) :: Nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
test("SPARK-36673: Union of structs with different orders") {
|
||||||
|
val df1 = spark.range(2).withColumn("nested",
|
||||||
|
struct(expr("id * 5 AS inner1"), struct(expr("id * 10 AS inner2"))))
|
||||||
|
val df2 = spark.range(2).withColumn("nested",
|
||||||
|
struct(expr("id * 5 AS inner2"), struct(expr("id * 10 AS inner1"))))
|
||||||
|
|
||||||
|
val err1 = intercept[AnalysisException](df1.union(df2).collect())
|
||||||
|
|
||||||
|
assert(err1.message
|
||||||
|
.contains("Union can only be performed on tables with the compatible column types"))
|
||||||
|
|
||||||
|
val df3 = spark.range(2).withColumn("nested",
|
||||||
|
struct(expr("id * 5 AS inner1"), struct(expr("id * 10 AS inner2").cast("string"))))
|
||||||
|
val df4 = spark.range(2).withColumn("nested",
|
||||||
|
struct(expr("id * 5 AS inner2").cast("string"), struct(expr("id * 10 AS inner1"))))
|
||||||
|
|
||||||
|
val err2 = intercept[AnalysisException](df3.union(df4).collect())
|
||||||
|
assert(err2.message
|
||||||
|
.contains("Union can only be performed on tables with the compatible column types"))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
case class UnionClass1a(a: Int, b: Long, nested: UnionClass2)
|
case class UnionClass1a(a: Int, b: Long, nested: UnionClass2)
|
||||||
|
|
Loading…
Reference in a new issue