[SPARK-32400][SQL] Improve test coverage of HiveScriptTransformationExec

### What changes were proposed in this pull request? 1. Extract the common (no serde) test cases into `BaseScriptTransformationSuite` 2. Add more test cases to `BaseScriptTransformationSuite` covering the supported data types and behavior in no serde mode 3. Add more test cases to `HiveScriptTransformationSuite` covering the supported data types and behavior in hive serde mode ### Why are the changes needed? To improve the test coverage of script transformation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added unit tests. Closes #29401 from AngersZhuuuu/SPARK-32400. Authored-by: angerszhu <angers.zhu@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2020-08-12 06:02:42 +00:00 · 2020-08-12 06:02:42 +00:00 · 4cf8c1d07d
parent 2d6eb00256
commit 4cf8c1d07d
6 changed files with 572 additions and 144 deletions
--- a/sql/core/src/test/resources/test_script.py
+++ b/sql/core/src/test/resources/test_script.py
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala
@ -0,0 +1,382 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.spark.sql.execution
 import java.sql.{Date, Timestamp}
 import org.json4s.DefaultFormats
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 import org.scalatest.Assertions._
 import org.scalatest.BeforeAndAfterEach
 import org.scalatest.exceptions.TestFailedException
 import org.apache.spark.{SparkException, TaskContext, TestUtils}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Column
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, GenericInternalRow}
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.CalendarInterval
 abstract class BaseScriptTransformationSuite extends SparkPlanTest with SQLTestUtils
  with BeforeAndAfterEach {
  import testImplicits._
  import ScriptTransformationIOSchema._
  // Handler installed as the JVM-wide default while this suite runs. Tests assert on its
  // `exception` member to verify that nothing escaped to an uncaught-exception handler.
  protected val uncaughtExceptionHandler = new TestUncaughtExceptionHandler
  // The default handler that was active before this suite started; restored in `afterAll`.
  private var defaultUncaughtExceptionHandler: Thread.UncaughtExceptionHandler = _

  /**
   * Remembers the current JVM default uncaught exception handler and replaces it with
   * `uncaughtExceptionHandler` for the duration of the suite.
   */
  protected override def beforeAll(): Unit = {
    super.beforeAll()
    defaultUncaughtExceptionHandler = Thread.getDefaultUncaughtExceptionHandler
    Thread.setDefaultUncaughtExceptionHandler(uncaughtExceptionHandler)
  }

  /**
   * Runs the parent teardown and then puts back the handler saved by `beforeAll`.
   * The restore sits in a `finally` so that a failing parent teardown cannot leave the
   * test handler installed as the JVM default for whatever runs next in the same JVM.
   */
  protected override def afterAll(): Unit = {
    try {
      super.afterAll()
    } finally {
      Thread.setDefaultUncaughtExceptionHandler(defaultUncaughtExceptionHandler)
    }
  }

  /** Resets the state held by `uncaughtExceptionHandler` after every test. */
  override protected def afterEach(): Unit = {
    super.afterEach()
    uncaughtExceptionHandler.cleanStatus()
  }
  /**
   * Chooses how the SPARK-25990 data-type test in this suite renders its expected decimal
   * column: when `true` the decimal is cast straight to string, when `false` it is first
   * cast to `decimal(1, 0)` so that the expected text carries no trailing zeroes.
   */
  def isHive23OrSpark: Boolean
  /**
   * Factory for the script transformation physical operator that the tests in this suite
   * exercise; each concrete suite supplies its own implementation.
   *
   * @param input expressions handed to the operator as its input columns
   * @param script the command line to run, e.g. `cat`
   * @param output attributes describing the operator's output columns
   * @param child the physical plan that produces the rows fed to the operator
   * @param ioschema the IO schema to use, e.g. `defaultIOSchema` or a copy of it
   * @return the operator under test
   */
  def createScriptTransformationExec(
      input: Seq[Expression],
      script: String,
      output: Seq[Attribute],
      child: SparkPlan,
      ioschema: ScriptTransformationIOSchema): BaseScriptTransformationExec
  // Pipes three single-column string rows through `cat` using the default IO schema and
  // expects the same rows back, with nothing recorded by the uncaught exception handler.
  test("cat without SerDe") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))

    val letters = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val expectedRows = letters.collect()
    val throughCat: SparkPlan => SparkPlan = { plan =>
      createScriptTransformationExec(
        input = Seq(letters.col("a").expr),
        script = "cat",
        output = Seq(AttributeReference("a", StringType)()),
        child = plan,
        ioschema = defaultIOSchema)
    }

    checkAnswer(letters, throughCat, expectedRows)
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
  // Places an always-failing operator underneath the script transformation and checks that
  // its "intentional exception" surfaces through checkAnswer as a test failure.
  test("script transformation should not swallow errors from upstream operators (no serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))

    val letters = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val overFailingChild: SparkPlan => SparkPlan = { plan =>
      createScriptTransformationExec(
        input = Seq(letters.col("a").expr),
        script = "cat",
        output = Seq(AttributeReference("a", StringType)()),
        child = ExceptionInjectingOperator(plan),
        ioschema = defaultIOSchema)
    }

    val failure = intercept[TestFailedException] {
      checkAnswer(letters, overFailingChild, letters.collect())
    }
    assert(failure.getMessage().contains("intentional exception"))
    // Before SPARK-25158, uncaughtExceptionHandler will catch IllegalArgumentException
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
  // Sends int, string, double, decimal and timestamp columns through the python test script
  // via SQL TRANSFORM and expects every column back in its string form.
  test("SPARK-25990: TRANSFORM should handle different data types correctly") {
    assume(TestUtils.testCommandAvailable("python"))
    val scriptFilePath = getTestResourcePath("test_script.py")

    // In Hive 1.2, the string representation of a decimal omits trailing zeroes.
    // But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary.
    def decimalToString(c: Column): Column =
      if (isHive23OrSpark) c.cast("string") else c.cast("decimal(1, 0)").cast("string")

    withTempView("v") {
      // Note column d's data type is Decimal(38, 18)
      val df = Seq(
        (1, "1", 1.0, BigDecimal(1.0), new Timestamp(1)),
        (2, "2", 2.0, BigDecimal(2.0), new Timestamp(2)),
        (3, "3", 3.0, BigDecimal(3.0), new Timestamp(3))
      ).toDF("a", "b", "c", "d", "e")
      df.createTempView("v")

      val query = sql(
        s"""
           |SELECT
           |TRANSFORM(a, b, c, d, e)
           |USING 'python $scriptFilePath' AS (a, b, c, d, e)
           |FROM v
        """.stripMargin)

      val expected = df.select(
        'a.cast("string"),
        'b.cast("string"),
        'c.cast("string"),
        decimalToString('d),
        'e.cast("string")
      ).collect()
      checkAnswer(query, identity, expected)
    }
  }
  // Runs the python test script with a schema-less IO schema over five input columns and
  // expects only the first two back, as string columns named `key` and `value`.
  test("SPARK-25990: TRANSFORM should handle schema less correctly (no serde)") {
    assume(TestUtils.testCommandAvailable("python"))
    val scriptFilePath = getTestResourcePath("test_script.py")

    withTempView("v") {
      // Note column d's data type is Decimal(38, 18)
      val df = Seq(
        (1, "1", 1.0, BigDecimal(1.0), new Timestamp(1)),
        (2, "2", 2.0, BigDecimal(2.0), new Timestamp(2)),
        (3, "3", 3.0, BigDecimal(3.0), new Timestamp(3))
      ).toDF("a", "b", "c", "d", "e")

      val schemaLessPlan: SparkPlan => SparkPlan = { plan =>
        createScriptTransformationExec(
          input = Seq("a", "b", "c", "d", "e").map(name => df.col(name).expr),
          script = s"python $scriptFilePath",
          output = Seq(
            AttributeReference("key", StringType)(),
            AttributeReference("value", StringType)()),
          child = plan,
          ioschema = defaultIOSchema.copy(schemaLess = true))
      }
      val expected = df.select(
        'a.cast("string").as("key"),
        'b.cast("string").as("value")
      ).collect()

      checkAnswer(df, schemaLessPlan, expected)
    }
  }
  // Executes a transformation whose script is a command that does not exist and expects a
  // SparkException reporting the subprocess exit status, with nothing left for the
  // uncaught exception handler.
  test("SPARK-30973: TRANSFORM should wait for the termination of the script (no serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))

    val letters = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val failure = intercept[SparkException] {
      val brokenPlan = createScriptTransformationExec(
        input = Seq(letters.col("a").expr),
        script = "some_non_existent_command",
        output = Seq(AttributeReference("a", StringType)()),
        child = letters.queryExecution.sparkPlan,
        ioschema = defaultIOSchema)
      SparkPlanTest.executePlan(brokenPlan, spark.sqlContext)
    }

    assert(failure.getMessage.contains("Subprocess exited with status"))
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
  /**
   * Registers a test that pipes rows covering the basic data types (int, string, float,
   * double, byte, decimal, timestamp, date, boolean and binary) through `cat` and expects
   * them back with their original types.
   *
   * @param serde the IO schema handed to the operator under test
   * @param testName suffix appended to the registered test's name to tell variants apart
   */
  def testBasicInputDataTypesWith(serde: ScriptTransformationIOSchema, testName: String): Unit = {
    test(s"SPARK-32400: TRANSFORM should support basic data types as input ($testName)") {
      // The script is `cat`, so gate on the shell like the other `cat`-based tests in this
      // suite rather than on python, which this test never runs.
      assume(TestUtils.testCommandAvailable("/bin/bash"))
      // Date.valueOf replaces the deprecated Date(year, month, day) constructor, whose year
      // is offset by 1900 and whose month is zero-based.
      val df = Seq(
        (1, "1", 1.0f, 1.0, 11.toByte, BigDecimal(1.0), new Timestamp(1),
          Date.valueOf("2020-07-01"), true),
        (2, "2", 2.0f, 2.0, 22.toByte, BigDecimal(2.0), new Timestamp(2),
          Date.valueOf("2020-07-02"), true),
        (3, "3", 3.0f, 3.0, 33.toByte, BigDecimal(3.0), new Timestamp(3),
          Date.valueOf("2020-07-03"), false)
      ).toDF("a", "b", "c", "d", "e", "f", "g", "h", "i")
        .withColumn("j", lit("abc").cast("binary"))

      checkAnswer(
        df,
        (child: SparkPlan) => createScriptTransformationExec(
          input = Seq("a", "b", "c", "d", "e", "f", "g", "h", "i", "j")
            .map(name => df.col(name).expr),
          script = "cat",
          output = Seq(
            AttributeReference("a", IntegerType)(),
            AttributeReference("b", StringType)(),
            AttributeReference("c", FloatType)(),
            AttributeReference("d", DoubleType)(),
            AttributeReference("e", ByteType)(),
            AttributeReference("f", DecimalType(38, 18))(),
            AttributeReference("g", TimestampType)(),
            AttributeReference("h", DateType)(),
            AttributeReference("i", BooleanType)(),
            AttributeReference("j", BinaryType)()),
          child = child,
          ioschema = serde
        ),
        df.select('a, 'b, 'c, 'd, 'e, 'f, 'g, 'h, 'i, 'j).collect())
    }
  }
  testBasicInputDataTypesWith(defaultIOSchema, "no serde")
  // Pipes interval, array, map, struct and UDT columns through `cat` with the default IO
  // schema. The interval and UDT columns are expected back as their own types; the array,
  // map and struct columns are expected back as strings.
  test("SPARK-32400: TRANSFORM should support more data types (interval, array, map, struct " +
    "and udt) as input (no serde)") {
    // The script is `cat`, so gate on the shell like the other `cat`-based tests in this
    // suite rather than on python, which this test never runs.
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    val df = Seq(
      (new CalendarInterval(7, 1, 1000), Array(0, 1, 2), Map("a" -> 1), (1, 2),
        new SimpleTuple(1, 1L)),
      (new CalendarInterval(7, 2, 2000), Array(3, 4, 5), Map("b" -> 2), (3, 4),
        new SimpleTuple(1, 1L)),
      (new CalendarInterval(7, 3, 3000), Array(6, 7, 8), Map("c" -> 3), (5, 6),
        new SimpleTuple(1, 1L))
    ).toDF("a", "b", "c", "d", "e")

    // Converting script output to ArrayType/MapType/StructType is not supported yet, so
    // those columns are still returned as strings.
    // For a UserDefinedType, if the UDT's deserialize method can convert a string into the
    // user type (as [[SimpleTupleUDT]] does), the column is converted to that UDT;
    // otherwise the column value is null.
    checkAnswer(
      df,
      (child: SparkPlan) => createScriptTransformationExec(
        input = Seq("a", "b", "c", "d", "e").map(name => df.col(name).expr),
        script = "cat",
        output = Seq(
          AttributeReference("a", CalendarIntervalType)(),
          AttributeReference("b", StringType)(),
          AttributeReference("c", StringType)(),
          AttributeReference("d", StringType)(),
          AttributeReference("e", new SimpleTupleUDT)()),
        child = child,
        ioschema = defaultIOSchema
      ),
      df.select('a, 'b.cast("string"), 'c.cast("string"), 'd.cast("string"), 'e).collect())
  }
  // Runs the same timestamp/date round trip through `cat` once with
  // DATETIME_JAVA8API_ENABLED off and once with it on.
  test("SPARK-32400: TRANSFORM should respect DATETIME_JAVA8API_ENABLED (no serde)") {
    // The script is `cat`, so gate on the shell like the other `cat`-based tests in this
    // suite rather than on python, which this test never runs.
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    Array(false, true).foreach { java8ApiEnabled =>
      withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8ApiEnabled.toString) {
        // The view is re-created on every iteration, so it must be dropped in between.
        withTempView("v") {
          // Date.valueOf replaces the deprecated Date(year, month, day) constructor, whose
          // year is offset by 1900 and whose month is zero-based.
          val df = Seq(
            (new Timestamp(1), Date.valueOf("2020-07-01")),
            (new Timestamp(2), Date.valueOf("2020-07-02")),
            (new Timestamp(3), Date.valueOf("2020-07-03"))
          ).toDF("a", "b")
          df.createTempView("v")
          val query = sql(
            """
              |SELECT TRANSFORM (a, b)
              |USING 'cat' AS (a timestamp, b date)
              |FROM v
            """.stripMargin)
          checkAnswer(query, identity, df.select('a, 'b).collect())
        }
      }
    }
  }
 }
 /**
  * Test-only unary operator that exposes its child's output and partitioning unchanged but
  * fails on every row it is asked to produce, throwing an IllegalArgumentException with the
  * message "intentional exception".
  */
 case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExecNode {
  override def output: Seq[Attribute] = child.output
  override def outputPartitioning: Partitioning = child.outputPartitioning
  override protected def doExecute(): RDD[InternalRow] = {
    val upstream = child.execute()
    upstream.map[InternalRow] { _ =>
      assert(TaskContext.get() != null) // Make sure that TaskContext is defined.
      Thread.sleep(1000) // This sleep gives the external process time to start.
      throw new IllegalArgumentException("intentional exception")
    }
  }
 }
// Simple (id, size) value class used as the user-defined-type column in the tests of this
// file. Two instances are equal when both fields match, and `toString` renders the
// instance as a compact JSON object with the keys "id" and "size".
@SQLUserDefinedType(udt = classOf[SimpleTupleUDT])
 private class SimpleTuple(val id: Int, val size: Long) extends Serializable {
  // Derived from the same fields `equals` compares, so equal instances still share a hash
  // while different instances no longer all collapse onto one constant value.
  override def hashCode(): Int = 31 * id + java.lang.Long.hashCode(size)
  override def equals(other: Any): Boolean = other match {
    case v: SimpleTuple => this.id == v.id && this.size == v.size
    case _ => false
  }
  override def toString: String =
    compact(render(
      ("id" -> id) ~
        ("size" -> size)
    ))
 }
 /**
  * User-defined type for [[SimpleTuple]], stored as a struct of a non-nullable int `id` and
  * a non-nullable long `size`. Deserialization accepts either a JSON string carrying "id"
  * and "size" or a two-field row; any other input yields null.
  */
 private class SimpleTupleUDT extends UserDefinedType[SimpleTuple] {
  override def userClass: Class[SimpleTuple] = classOf[SimpleTuple]
  override def asNullable: SimpleTupleUDT = this
  override def sqlType: DataType = StructType(Seq(
    StructField("id", IntegerType, false),
    StructField("size", LongType, false)))
  // Field 0 carries the id, field 1 the size.
  override def serialize(sql: SimpleTuple): Any =
    new GenericInternalRow(Array[Any](sql.id, sql.size))
  override def deserialize(datum: Any): SimpleTuple = datum match {
    case text: String =>
      implicit val jsonFormats = DefaultFormats
      val parsed = parse(text)
      val id = (parsed \ "id").extract[Int]
      val size = (parsed \ "size").extract[Long]
      new SimpleTuple(id, size)
    case row: InternalRow if row.numFields == 2 =>
      new SimpleTuple(row.getInt(0), row.getLong(1))
    case _ => null
  }
  // Every instance of this UDT is interchangeable, so equality and hash depend on the
  // class alone.
  override def hashCode(): Int = getClass.hashCode()
  override def equals(other: Any): Boolean = other match {
    case _: SimpleTupleUDT => true
    case _ => false
  }
 }
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/TestUncaughtExceptionHandler.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/TestUncaughtExceptionHandler.scala
@ -15,7 +15,7 @@
 * limitations under the License.
 */
-package org.apache.spark.sql.hive.execution
+package org.apache.spark.sql.execution
 class TestUncaughtExceptionHandler extends Thread.UncaughtExceptionHandler {
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@ -1063,6 +1063,9 @@ private[hive] trait HiveInspectors {
      case DateType => dateTypeInfo
      case TimestampType => timestampTypeInfo
      case NullType => voidTypeInfo
      case dt =>
        throw new AnalysisException(
          s"${dt.catalogString} cannot be converted to Hive TypeInfo")
    }
  }
 }
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
@ -20,67 +20,44 @@ package org.apache.spark.sql.hive.execution
 import java.sql.Timestamp
 import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
 import org.scalatest.Assertions._
 import org.scalatest.BeforeAndAfterEach
 import org.scalatest.exceptions.TestFailedException
-import org.apache.spark.{SparkException, TaskContext, TestUtils}
+import org.apache.spark.{SparkException, TestUtils}
-import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
-import org.apache.spark.sql.Column
+import org.apache.spark.sql.execution._
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.execution.{ScriptTransformationIOSchema, SparkPlan, SparkPlanTest, UnaryExecNode}
 import org.apache.spark.sql.hive.HiveUtils
 import org.apache.spark.sql.hive.test.TestHiveSingleton
-import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types._
-import org.apache.spark.sql.types.StringType
+import org.apache.spark.unsafe.types.CalendarInterval
 class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with TestHiveSingleton {
  import testImplicits._
 class HiveScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with TestHiveSingleton
  with BeforeAndAfterEach {
  import spark.implicits._
  import ScriptTransformationIOSchema._
-  private val serdeIOSchema = defaultIOSchema.copy(
+  override def isHive23OrSpark: Boolean = HiveUtils.isHive23
  override def createScriptTransformationExec(
      input: Seq[Expression],
      script: String,
      output: Seq[Attribute],
      child: SparkPlan,
      ioschema: ScriptTransformationIOSchema): BaseScriptTransformationExec = {
    HiveScriptTransformationExec(
      input = input,
      script = script,
      output = output,
      child = child,
      ioschema = ioschema
    )
  }
  private val hiveIOSchema: ScriptTransformationIOSchema = {
    defaultIOSchema.copy(
      inputSerdeClass = Some(classOf[LazySimpleSerDe].getCanonicalName),
      outputSerdeClass = Some(classOf[LazySimpleSerDe].getCanonicalName)
    )
  private var defaultUncaughtExceptionHandler: Thread.UncaughtExceptionHandler = _
  private val uncaughtExceptionHandler = new TestUncaughtExceptionHandler
  protected override def beforeAll(): Unit = {
    super.beforeAll()
    defaultUncaughtExceptionHandler = Thread.getDefaultUncaughtExceptionHandler
    Thread.setDefaultUncaughtExceptionHandler(uncaughtExceptionHandler)
  }
  protected override def afterAll(): Unit = {
    super.afterAll()
    Thread.setDefaultUncaughtExceptionHandler(defaultUncaughtExceptionHandler)
  }
  override protected def afterEach(): Unit = {
    super.afterEach()
    uncaughtExceptionHandler.cleanStatus()
  }
  test("cat without SerDe") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    checkAnswer(
      rowsDf,
      (child: SparkPlan) => new HiveScriptTransformationExec(
        input = Seq(rowsDf.col("a").expr),
        script = "cat",
        output = Seq(AttributeReference("a", StringType)()),
        child = child,
        ioschema = defaultIOSchema
      ),
      rowsDf.collect())
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
  test("cat with LazySimpleSerDe") {
@ -89,30 +66,30 @@ class HiveScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with
    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    checkAnswer(
      rowsDf,
-      (child: SparkPlan) => new HiveScriptTransformationExec(
+      (child: SparkPlan) => createScriptTransformationExec(
        input = Seq(rowsDf.col("a").expr),
        script = "cat",
        output = Seq(AttributeReference("a", StringType)()),
        child = child,
-        ioschema = serdeIOSchema
+        ioschema = hiveIOSchema
      ),
      rowsDf.collect())
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
-  test("script transformation should not swallow errors from upstream operators (no serde)") {
+  test("script transformation should not swallow errors from upstream operators (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val e = intercept[TestFailedException] {
      checkAnswer(
        rowsDf,
-        (child: SparkPlan) => new HiveScriptTransformationExec(
+        (child: SparkPlan) => createScriptTransformationExec(
          input = Seq(rowsDf.col("a").expr),
          script = "cat",
          output = Seq(AttributeReference("a", StringType)()),
          child = ExceptionInjectingOperator(child),
-          ioschema = defaultIOSchema
+          ioschema = hiveIOSchema
        ),
        rowsDf.collect())
    }
@ -121,67 +98,65 @@ class HiveScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
-  test("script transformation should not swallow errors from upstream operators (with serde)") {
+  test("SPARK-14400 script transformation should fail for bad script command (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val e = intercept[TestFailedException] {
      checkAnswer(
        rowsDf,
        (child: SparkPlan) => new HiveScriptTransformationExec(
          input = Seq(rowsDf.col("a").expr),
          script = "cat",
          output = Seq(AttributeReference("a", StringType)()),
          child = ExceptionInjectingOperator(child),
          ioschema = serdeIOSchema
        ),
        rowsDf.collect())
    }
    assert(e.getMessage().contains("intentional exception"))
    // Before SPARK-25158, uncaughtExceptionHandler will catch IllegalArgumentException
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
  test("SPARK-14400 script transformation should fail for bad script command") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val e = intercept[SparkException] {
      val plan =
-        new HiveScriptTransformationExec(
+        createScriptTransformationExec(
          input = Seq(rowsDf.col("a").expr),
          script = "some_non_existent_command",
          output = Seq(AttributeReference("a", StringType)()),
          child = rowsDf.queryExecution.sparkPlan,
-          ioschema = serdeIOSchema)
+          ioschema = hiveIOSchema)
      SparkPlanTest.executePlan(plan, hiveContext)
    }
    assert(e.getMessage.contains("Subprocess exited with status"))
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
-  test("SPARK-24339 verify the result after pruning the unused columns") {
+  test("SPARK-24339 verify the result after pruning the unused columns (hive serde)") {
    val rowsDf = Seq(
      ("Bob", 16, 176),
      ("Alice", 32, 164),
      ("David", 60, 192),
-      ("Amy", 24, 180)).toDF("name", "age", "height")
+      ("Amy", 24, 180)
    ).toDF("name", "age", "height")
    checkAnswer(
      rowsDf,
-      (child: SparkPlan) => new HiveScriptTransformationExec(
+      (child: SparkPlan) => createScriptTransformationExec(
        input = Seq(rowsDf.col("name").expr),
        script = "cat",
        output = Seq(AttributeReference("name", StringType)()),
        child = child,
-        ioschema = serdeIOSchema
+        ioschema = hiveIOSchema
      ),
      rowsDf.select("name").collect())
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
-  test("SPARK-25990: TRANSFORM should handle different data types correctly") {
+  test("SPARK-30973: TRANSFORM should wait for the termination of the script (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
    val e = intercept[SparkException] {
      val plan =
        createScriptTransformationExec(
          input = Seq(rowsDf.col("a").expr),
          script = "some_non_existent_command",
          output = Seq(AttributeReference("a", StringType)()),
          child = rowsDf.queryExecution.sparkPlan,
          ioschema = hiveIOSchema)
      SparkPlanTest.executePlan(plan, hiveContext)
    }
    assert(e.getMessage.contains("Subprocess exited with status"))
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
  test("SPARK-25990: TRANSFORM should handle schema less correctly (hive serde)") {
    assume(TestUtils.testCommandAvailable("python"))
    val scriptFilePath = getTestResourcePath("test_script.py")
@ -195,75 +170,142 @@ class HiveScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with
      val query = sql(
        s"""
-          |SELECT
+           |SELECT TRANSFORM(a, b, c, d, e)
-          |TRANSFORM(a, b, c, d, e)
+           |USING 'python ${scriptFilePath}'
          |USING 'python $scriptFilePath' AS (a, b, c, d, e)
           |FROM v
        """.stripMargin)
-      // In Hive 1.2, the string representation of a decimal omits trailing zeroes.
+      // In hive default serde mode, if we don't define output schema, it will choose first
-      // But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary.
+      // two column as output schema (key: String, value: String)
-      val decimalToString: Column => Column = if (HiveUtils.isHive23) {
+      checkAnswer(
-        c => c.cast("string")
+        query,
-      } else {
+        identity,
-        c => c.cast("decimal(1, 0)").cast("string")
+        df.select(
-      }
+          'a.cast("string").as("key"),
-      checkAnswer(query, identity, df.select(
+          'b.cast("string").as("value")).collect())
        'a.cast("string"),
        'b.cast("string"),
        'c.cast("string"),
        decimalToString('d),
        'e.cast("string")).collect())
    }
  }
-  test("SPARK-30973: TRANSFORM should wait for the termination of the script (no serde)") {
+  testBasicInputDataTypesWith(hiveIOSchema, "hive serde")
  test("SPARK-32400: TRANSFORM supports complex data types type (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    withTempView("v") {
      val df = Seq(
        (1, "1", Array(0, 1, 2), Map("a" -> 1)),
        (2, "2", Array(3, 4, 5), Map("b" -> 2))
      ).toDF("a", "b", "c", "d")
        .select('a, 'b, 'c, 'd, struct('a, 'b).as("e"))
      df.createTempView("v")
-    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
+      // Hive serde support ArrayType/MapType/StructType as input and output data type
-    val e = intercept[SparkException] {
+      checkAnswer(
-      val plan =
+        df,
-        new HiveScriptTransformationExec(
+        (child: SparkPlan) => createScriptTransformationExec(
-          input = Seq(rowsDf.col("a").expr),
+          input = Seq(
-          script = "some_non_existent_command",
+            df.col("c").expr,
-          output = Seq(AttributeReference("a", StringType)()),
+            df.col("d").expr,
-          child = rowsDf.queryExecution.sparkPlan,
+            df.col("e").expr),
-          ioschema = defaultIOSchema)
+          script = "cat",
-      SparkPlanTest.executePlan(plan, hiveContext)
+          output = Seq(
            AttributeReference("c", ArrayType(IntegerType))(),
            AttributeReference("d", MapType(StringType, IntegerType))(),
            AttributeReference("e", StructType(
              Seq(
                StructField("col1", IntegerType, false),
                StructField("col2", StringType, true))))()),
          child = child,
          ioschema = hiveIOSchema
        ),
        df.select('c, 'd, 'e).collect())
    }
    assert(e.getMessage.contains("Subprocess exited with status"))
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
-  test("SPARK-30973: TRANSFORM should wait for the termination of the script (with serde)") {
+  test("SPARK-32400: TRANSFORM supports complex data types end to end (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    withTempView("v") {
      val df = Seq(
        (1, "1", Array(0, 1, 2), Map("a" -> 1)),
        (2, "2", Array(3, 4, 5), Map("b" -> 2))
      ).toDF("a", "b", "c", "d")
        .select('a, 'b, 'c, 'd, struct('a, 'b).as("e"))
      df.createTempView("v")
-    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
+      // Hive serde support ArrayType/MapType/StructType as input and output data type
-    val e = intercept[SparkException] {
+      val query = sql(
-      val plan =
+        """
-        new HiveScriptTransformationExec(
+          |SELECT TRANSFORM (c, d, e)
-          input = Seq(rowsDf.col("a").expr),
+          |USING 'cat' AS (c array<int>, d map<string, int>, e struct<col1:int, col2:string>)
-          script = "some_non_existent_command",
+          |FROM v
-          output = Seq(AttributeReference("a", StringType)()),
+        """.stripMargin)
-          child = rowsDf.queryExecution.sparkPlan,
+      checkAnswer(query, identity, df.select('c, 'd, 'e).collect())
-          ioschema = serdeIOSchema)
+    }
-      SparkPlanTest.executePlan(plan, hiveContext)
+  }
  test("SPARK-32400: TRANSFORM doesn't support CalenderIntervalType/UserDefinedType (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    withTempView("v") {
      val df = Seq(
        (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3))),
        (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3)))
      ).toDF("a", "b", "c")
      df.createTempView("v")
      val e1 = intercept[SparkException] {
        val plan = createScriptTransformationExec(
          input = Seq(df.col("a").expr, df.col("b").expr),
          script = "cat",
          output = Seq(
            AttributeReference("a", IntegerType)(),
            AttributeReference("b", CalendarIntervalType)()),
          child = df.queryExecution.sparkPlan,
          ioschema = hiveIOSchema)
        SparkPlanTest.executePlan(plan, hiveContext)
      }.getMessage
      assert(e1.contains("interval cannot be converted to Hive TypeInfo"))
      val e2 = intercept[SparkException] {
        val plan = createScriptTransformationExec(
          input = Seq(df.col("a").expr, df.col("c").expr),
          script = "cat",
          output = Seq(
            AttributeReference("a", IntegerType)(),
            AttributeReference("c", new TestUDT.MyDenseVectorUDT)()),
          child = df.queryExecution.sparkPlan,
          ioschema = hiveIOSchema)
        SparkPlanTest.executePlan(plan, hiveContext)
      }.getMessage
      assert(e2.contains("array<double> cannot be converted to Hive TypeInfo"))
    }
  }
  test("SPARK-32400: TRANSFORM doesn't support" +
    " CalenderIntervalType/UserDefinedType end to end (hive serde)") {
    assume(TestUtils.testCommandAvailable("/bin/bash"))
    withTempView("v") {
      val df = Seq(
        (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3))),
        (1, new CalendarInterval(7, 1, 1000), new TestUDT.MyDenseVector(Array(1, 2, 3)))
      ).toDF("a", "b", "c")
      df.createTempView("v")
      val e1 = intercept[SparkException] {
        sql(
          """
            |SELECT TRANSFORM(a, b) USING 'cat' AS (a, b)
            |FROM v
          """.stripMargin).collect()
      }.getMessage
      assert(e1.contains("interval cannot be converted to Hive TypeInfo"))
      val e2 = intercept[SparkException] {
        sql(
          """
            |SELECT TRANSFORM(a, c) USING 'cat' AS (a, c)
            |FROM v
          """.stripMargin).collect()
      }.getMessage
      assert(e2.contains("array<double> cannot be converted to Hive TypeInfo"))
    }
    assert(e.getMessage.contains("Subprocess exited with status"))
    assert(uncaughtExceptionHandler.exception.isEmpty)
  }
 }
 private case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExecNode {
  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().map { x =>
      assert(TaskContext.get() != null) // Make sure that TaskContext is defined.
      Thread.sleep(1000) // This sleep gives the external process time to start.
      throw new IllegalArgumentException("intentional exception")
    }
  }
  override def output: Seq[Attribute] = child.output
  override def outputPartitioning: Partitioning = child.outputPartitioning
 }
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, Functio
 import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, CatalogUtils, HiveTableRelation}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
 import org.apache.spark.sql.execution.TestUncaughtExceptionHandler
 import org.apache.spark.sql.execution.adaptive.{DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite}
 import org.apache.spark.sql.execution.command.{FunctionsCommand, LoadDataCommand}
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}