[SPARK-32968][SQL] Prune unnecessary columns from CsvToStructs
### What changes were proposed in this pull request?
This patch proposes to do column pruning for the `CsvToStructs` expression when only some of its fields are required.

### Why are the changes needed?
`CsvToStructs` takes a schema parameter that tells the CSV parser which fields to parse. When `CsvToStructs` is followed by `GetStructField`, we can prune the schema so that only the accessed field is parsed.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Unit test

Closes #30912 from viirya/SPARK-32968.

Lead-authored-by: Liang-Chi Hsieh <viirya@gmail.com>
Co-authored-by: Hyukjin Kwon <gurwls223@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
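For context, this is the query shape the pruning targets. A minimal sketch, assuming a local `SparkSession` (the object name `FromCsvPruningExample` is invented for illustration; `from_csv` and `explain` are the standard Spark APIs):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, from_csv}
import org.apache.spark.sql.types.StructType

object FromCsvPruningExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("from_csv pruning")
      .getOrCreate()
    import spark.implicits._

    val schema = StructType.fromDDL("a INT, b INT")
    val parsed = Seq("1,2").toDF("csv")
      .select(from_csv(col("csv"), schema, Map.empty[String, String]).as("parsed"))

    // Only field `a` is accessed; with this patch the optimizer rewrites the
    // plan so CsvToStructs carries a pruned requiredSchema ("a INT" only).
    parsed.select(col("parsed.a")).explain(true)

    spark.stop()
  }
}
```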
parent 2627825647 | commit f9fe742442
`CsvToStructs` gains an optional `requiredSchema` parameter, passes it to the parser, and reports the pruned type:

```diff
@@ -51,7 +51,8 @@ case class CsvToStructs(
     schema: StructType,
     options: Map[String, String],
     child: Expression,
-    timeZoneId: Option[String] = None)
+    timeZoneId: Option[String] = None,
+    requiredSchema: Option[StructType] = None)
   extends UnaryExpression
   with TimeZoneAwareExpression
   with CodegenFallback
@@ -113,7 +114,12 @@ case class CsvToStructs(
     val actualSchema =
       StructType(nullableSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
-    val rawParser = new UnivocityParser(actualSchema, actualSchema, parsedOptions)
+    val actualRequiredSchema =
+      StructType(requiredSchema.map(_.asNullable).getOrElse(nullableSchema)
+        .filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
+    val rawParser = new UnivocityParser(actualSchema,
+      actualRequiredSchema,
+      parsedOptions)
     new FailureSafeParser[String](
       input => rawParser.parse(input),
       mode,
@@ -121,7 +127,7 @@ case class CsvToStructs(
       parsedOptions.columnNameOfCorruptRecord)
   }
 
-  override def dataType: DataType = nullableSchema
+  override def dataType: DataType = requiredSchema.getOrElse(schema).asNullable
 
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = {
     copy(timeZoneId = Option(timeZoneId))
```
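The `dataType` change is what lets downstream operators see the pruned struct. A minimal sketch of that contract, using a toy stand-in rather than the real Catalyst expression (`ToyCsvToStructs` is an invented name, not Spark code):

```scala
import org.apache.spark.sql.types._

// Toy stand-in: the output type follows the pruned schema when one is
// set, and is always nullable, mirroring the new dataType definition.
case class ToyCsvToStructs(schema: StructType, requiredSchema: Option[StructType] = None) {
  def dataType: DataType = requiredSchema.getOrElse(schema).asNullable
}

object ToyCsvToStructsDemo extends App {
  val full = StructType.fromDDL("a INT, b INT")
  val pruned = StructType.fromDDL("a INT")
  assert(ToyCsvToStructs(full).dataType == full.asNullable)                 // unpruned
  assert(ToyCsvToStructs(full, Some(pruned)).dataType == pruned.asNullable) // pruned
}
```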
A new rule, `OptimizeCsvJsonExprs`, replaces the old `OptimizeJsonExprs` (deleted below), keeping the JSON optimizations and adding case 4 for CSV:

```diff
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{ArrayType, StructType}
+
+/**
+ * Simplify redundant csv/json related expressions.
+ *
+ * The optimization includes:
+ * 1. JsonToStructs(StructsToJson(child)) => child.
+ * 2. Prune unnecessary columns from GetStructField/GetArrayStructFields + JsonToStructs.
+ * 3. CreateNamedStruct(JsonToStructs(json).col1, JsonToStructs(json).col2, ...) =>
+ *      If(IsNull(json), nullStruct, KnownNotNull(JsonToStructs(prunedSchema, ..., json)))
+ *    if JsonToStructs(json) is shared among all fields of CreateNamedStruct. `prunedSchema`
+ *    contains all accessed fields in original CreateNamedStruct.
+ * 4. Prune unnecessary columns from GetStructField + CsvToStructs.
+ */
+object OptimizeCsvJsonExprs extends Rule[LogicalPlan] {
+  private def nameOfCorruptRecord = SQLConf.get.getConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD)
+
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case p =>
+      val optimized = if (SQLConf.get.jsonExpressionOptimization) {
+        p.transformExpressions(jsonOptimization)
+      } else {
+        p
+      }
+
+      if (SQLConf.get.csvExpressionOptimization) {
+        optimized.transformExpressions(csvOptimization)
+      } else {
+        optimized
+      }
+  }
+
+  private val jsonOptimization: PartialFunction[Expression, Expression] = {
+    case c: CreateNamedStruct
+        // If we create struct from various fields of the same `JsonToStructs`.
+        if c.valExprs.forall { v =>
+          v.isInstanceOf[GetStructField] &&
+            v.asInstanceOf[GetStructField].child.isInstanceOf[JsonToStructs] &&
+            v.children.head.semanticEquals(c.valExprs.head.children.head)
+        } =>
+      val jsonToStructs = c.valExprs.map(_.children.head)
+      val sameFieldName = c.names.zip(c.valExprs).forall {
+        case (name, valExpr: GetStructField) =>
+          name.toString == valExpr.childSchema(valExpr.ordinal).name
+        case _ => false
+      }
+
+      // Although `CreateNamedStruct` allows duplicated field names, e.g. "a int, a int",
+      // `JsonToStructs` does not support parsing json with duplicated field names.
+      val duplicateFields = c.names.map(_.toString).distinct.length != c.names.length
+
+      // If we create struct from various fields of the same `JsonToStructs` and we don't
+      // alias field names and there is no duplicated field in the struct.
+      if (sameFieldName && !duplicateFields) {
+        val fromJson = jsonToStructs.head.asInstanceOf[JsonToStructs].copy(schema = c.dataType)
+        val nullFields = c.children.grouped(2).flatMap {
+          case Seq(name, value) => Seq(name, Literal(null, value.dataType))
+        }.toSeq
+
+        If(IsNull(fromJson.child), c.copy(children = nullFields), KnownNotNull(fromJson))
+      } else {
+        c
+      }
+
+    case jsonToStructs @ JsonToStructs(_, options1,
+        StructsToJson(options2, child, timeZoneId2), timeZoneId1)
+        if options1.isEmpty && options2.isEmpty && timeZoneId1 == timeZoneId2 &&
+          jsonToStructs.dataType == child.dataType =>
+      // `StructsToJson` only fails when `JacksonGenerator` encounters data types it
+      // cannot convert to JSON. But `StructsToJson.checkInputDataTypes` already
+      // verifies its child's data types is convertible to JSON. But in
+      // `StructsToJson(JsonToStructs(...))` case, we cannot verify input json string
+      // so `JsonToStructs` might throw error in runtime. Thus we cannot optimize
+      // this case similarly.
+      child
+
+    case g @ GetStructField(j @ JsonToStructs(schema: StructType, _, _, _), ordinal, _)
+        if schema.length > 1 =>
+      val prunedSchema = StructType(Seq(schema(ordinal)))
+      g.copy(child = j.copy(schema = prunedSchema), ordinal = 0)
+
+    case g @ GetArrayStructFields(j @ JsonToStructs(schema: ArrayType, _, _, _), _, _, _, _)
+        if schema.elementType.asInstanceOf[StructType].length > 1 =>
+      val prunedSchema = ArrayType(StructType(Seq(g.field)), g.containsNull)
+      g.copy(child = j.copy(schema = prunedSchema), ordinal = 0, numFields = 1)
+  }
+
+  private val csvOptimization: PartialFunction[Expression, Expression] = {
+    case g @ GetStructField(c @ CsvToStructs(schema: StructType, _, _, _, None), ordinal, _)
+        if schema.length > 1 && c.options.isEmpty && schema(ordinal).name != nameOfCorruptRecord =>
+      // When the parse mode is permissive, and corrupt column is not selected, we can prune here
+      // from `GetStructField`. To be more conservative, it does not optimize when any option
+      // is set.
+      val prunedSchema = StructType(Seq(schema(ordinal)))
+      g.copy(child = c.copy(requiredSchema = Some(prunedSchema)), ordinal = 0)
+  }
+}
```
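The CSV branch mirrors the JSON pruning case. As a standalone sketch of the rewrite shape it implements, with toy types standing in for Catalyst's (`ParseCsv` and `GetField` are invented names, not Spark classes):

```scala
// Toy expression tree; not Catalyst classes.
sealed trait Expr
case class ParseCsv(schema: Seq[String], required: Option[Seq[String]] = None) extends Expr
case class GetField(child: Expr, ordinal: Int) extends Expr

object PrunePatternDemo {
  // Mirrors csvOptimization: fires only when nothing has been pruned yet
  // and more than one field exists; the kept field moves to ordinal 0.
  val prune: PartialFunction[Expr, Expr] = {
    case GetField(p @ ParseCsv(schema, None), ordinal) if schema.length > 1 =>
      GetField(p.copy(required = Some(Seq(schema(ordinal)))), 0)
  }

  def main(args: Array[String]): Unit = {
    val before = GetField(ParseCsv(Seq("a", "b")), 1)
    val after = prune.applyOrElse(before, identity[Expr])
    assert(after == GetField(ParseCsv(Seq("a", "b"), Some(Seq("b"))), 0))
  }
}
```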
The previous `OptimizeJsonExprs` rule is removed; its JSON cases move into `OptimizeCsvJsonExprs` above:

```diff
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, StructType}
-
-/**
- * Simplify redundant json related expressions.
- *
- * The optimization includes:
- * 1. JsonToStructs(StructsToJson(child)) => child.
- * 2. Prune unnecessary columns from GetStructField/GetArrayStructFields + JsonToStructs.
- * 3. CreateNamedStruct(JsonToStructs(json).col1, JsonToStructs(json).col2, ...) =>
- *      If(IsNull(json), nullStruct, KnownNotNull(JsonToStructs(prunedSchema, ..., json)))
- *    if JsonToStructs(json) is shared among all fields of CreateNamedStruct. `prunedSchema`
- *    contains all accessed fields in original CreateNamedStruct.
- */
-object OptimizeJsonExprs extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-    case p if SQLConf.get.jsonExpressionOptimization => p.transformExpressions {
-
-      case c: CreateNamedStruct
-          // If we create struct from various fields of the same `JsonToStructs`.
-          if c.valExprs.forall { v =>
-            v.isInstanceOf[GetStructField] &&
-              v.asInstanceOf[GetStructField].child.isInstanceOf[JsonToStructs] &&
-              v.children.head.semanticEquals(c.valExprs.head.children.head)
-          } =>
-        val jsonToStructs = c.valExprs.map(_.children.head)
-        val sameFieldName = c.names.zip(c.valExprs).forall {
-          case (name, valExpr: GetStructField) =>
-            name.toString == valExpr.childSchema(valExpr.ordinal).name
-          case _ => false
-        }
-
-        // Although `CreateNamedStruct` allows duplicated field names, e.g. "a int, a int",
-        // `JsonToStructs` does not support parsing json with duplicated field names.
-        val duplicateFields = c.names.map(_.toString).distinct.length != c.names.length
-
-        // If we create struct from various fields of the same `JsonToStructs` and we don't
-        // alias field names and there is no duplicated field in the struct.
-        if (sameFieldName && !duplicateFields) {
-          val fromJson = jsonToStructs.head.asInstanceOf[JsonToStructs].copy(schema = c.dataType)
-          val nullFields = c.children.grouped(2).flatMap {
-            case Seq(name, value) => Seq(name, Literal(null, value.dataType))
-          }.toSeq
-
-          If(IsNull(fromJson.child), c.copy(children = nullFields), KnownNotNull(fromJson))
-        } else {
-          c
-        }
-
-      case jsonToStructs @ JsonToStructs(_, options1,
-          StructsToJson(options2, child, timeZoneId2), timeZoneId1)
-          if options1.isEmpty && options2.isEmpty && timeZoneId1 == timeZoneId2 &&
-            jsonToStructs.dataType == child.dataType =>
-        // `StructsToJson` only fails when `JacksonGenerator` encounters data types it
-        // cannot convert to JSON. But `StructsToJson.checkInputDataTypes` already
-        // verifies its child's data types is convertible to JSON. But in
-        // `StructsToJson(JsonToStructs(...))` case, we cannot verify input json string
-        // so `JsonToStructs` might throw error in runtime. Thus we cannot optimize
-        // this case similarly.
-        child
-
-      case g @ GetStructField(j @ JsonToStructs(schema: StructType, _, _, _), ordinal, _)
-          if schema.length > 1 =>
-        val prunedSchema = StructType(Seq(schema(ordinal)))
-        g.copy(child = j.copy(schema = prunedSchema), ordinal = 0)
-
-      case g @ GetArrayStructFields(j @ JsonToStructs(schema: ArrayType, _, _, _), _, _, _, _)
-          if schema.elementType.asInstanceOf[StructType].length > 1 =>
-        val prunedSchema = ArrayType(StructType(Seq(g.field)), g.containsNull)
-        g.copy(child = j.copy(schema = prunedSchema), ordinal = 0, numFields = 1)
-
-    }
-  }
-}
```
The optimizer batch registers the renamed rule:

```diff
@@ -114,7 +114,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
       RemoveNoopOperators,
       OptimizeUpdateFields,
       SimplifyExtractValueOps,
-      OptimizeJsonExprs,
+      OptimizeCsvJsonExprs,
       CombineConcats) ++
     extendedOperatorOptimizationRules
```
A new config gates the CSV optimization, mirroring the existing JSON one:

```diff
@@ -1631,6 +1631,14 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
+  val CSV_EXPRESSION_OPTIMIZATION =
+    buildConf("spark.sql.optimizer.enableCsvExpressionOptimization")
+      .doc("Whether to optimize CSV expressions in SQL optimizer. It includes pruning " +
+        "unnecessary columns from from_csv.")
+      .version("3.2.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val FILE_SINK_LOG_DELETION = buildConf("spark.sql.streaming.fileSink.log.deletion")
     .internal()
     .doc("Whether to delete the expired log files in file stream sink.")
@@ -3489,6 +3497,8 @@ class SQLConf extends Serializable with Logging {
 
   def jsonExpressionOptimization: Boolean = getConf(SQLConf.JSON_EXPRESSION_OPTIMIZATION)
 
+  def csvExpressionOptimization: Boolean = getConf(SQLConf.CSV_EXPRESSION_OPTIMIZATION)
+
   def parallelFileListingInStatsComputation: Boolean =
     getConf(SQLConf.PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION)
```
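The flag defaults to true and can be flipped per session. A quick sketch, assuming a running `SparkSession` named `spark`:

```scala
// Disable the CSV expression optimization for the current session.
spark.conf.set("spark.sql.optimizer.enableCsvExpressionOptimization", "false")

// Or equivalently through SQL:
spark.sql("SET spark.sql.optimizer.enableCsvExpressionOptimization=false")
```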
A new planner suite, `OptimizeCsvExprsSuite`, checks the rewrite fires only when expected:

```diff
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._
+
+class OptimizeCsvExprsSuite extends PlanTest with ExpressionEvalHelper {
+
+  private var csvExpressionOptimizeEnabled: Boolean = _
+  protected override def beforeAll(): Unit = {
+    csvExpressionOptimizeEnabled = SQLConf.get.csvExpressionOptimization
+  }
+
+  protected override def afterAll(): Unit = {
+    SQLConf.get.setConf(SQLConf.CSV_EXPRESSION_OPTIMIZATION, csvExpressionOptimizeEnabled)
+  }
+
+  object Optimizer extends RuleExecutor[LogicalPlan] {
+    val batches = Batch("Csv optimization", FixedPoint(10), OptimizeCsvJsonExprs) :: Nil
+  }
+
+  val schema = StructType.fromDDL("a int, b int")
+
+  private val csvAttr = 'csv.string
+  private val testRelation = LocalRelation(csvAttr)
+
+  test("SPARK-32968: prune unnecessary columns from GetStructField + from_csv") {
+    val options = Map.empty[String, String]
+
+    val query1 = testRelation
+      .select(GetStructField(CsvToStructs(schema, options, 'csv), 0))
+    val optimized1 = Optimizer.execute(query1.analyze)
+
+    val prunedSchema1 = StructType.fromDDL("a int")
+    val expected1 = testRelation
+      .select(GetStructField(CsvToStructs(schema, options, 'csv, None, Some(prunedSchema1)), 0))
+      .analyze
+    comparePlans(optimized1, expected1)
+
+    val query2 = testRelation
+      .select(GetStructField(CsvToStructs(schema, options, 'csv), 1))
+    val optimized2 = Optimizer.execute(query2.analyze)
+
+    val prunedSchema2 = StructType.fromDDL("b int")
+    val expected2 = testRelation
+      .select(GetStructField(CsvToStructs(schema, options, 'csv, None, Some(prunedSchema2)), 0))
+      .analyze
+    comparePlans(optimized2, expected2)
+  }
+
+  test("SPARK-32968: don't prune columns if options is not empty") {
+    val options = Map("mode" -> "failfast")
+
+    val query = testRelation
+      .select(GetStructField(CsvToStructs(schema, options, 'csv), 0))
+    val optimized = Optimizer.execute(query.analyze)
+
+    val expected = query.analyze
+    comparePlans(optimized, expected)
+  }
+}
```
The existing JSON suite is updated to the renamed rule:

```diff
@@ -39,7 +39,7 @@ class OptimizeJsonExprsSuite extends PlanTest with ExpressionEvalHelper {
   }
 
   object Optimizer extends RuleExecutor[LogicalPlan] {
-    val batches = Batch("Json optimization", FixedPoint(10), OptimizeJsonExprs) :: Nil
+    val batches = Batch("Json optimization", FixedPoint(10), OptimizeCsvJsonExprs) :: Nil
   }
 
   val schema = StructType.fromDDL("a int, b int")
```
End-to-end tests in `CsvFunctionsSuite` verify that pruning never changes results, including failfast and corrupt-record cases, with the optimization both on and off:

```diff
@@ -250,4 +250,52 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
         """.stripMargin)
     checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]"))
   }
+
+  test("SPARK-32968: Pruning csv field should not change result") {
+    Seq("true", "false").foreach { enabled =>
+      withSQLConf(SQLConf.CSV_EXPRESSION_OPTIMIZATION.key -> enabled) {
+        val df1 = sparkContext.parallelize(Seq("a,b")).toDF("csv")
+          .selectExpr("from_csv(csv, 'a string, b string', map('mode', 'failfast')) as parsed")
+        checkAnswer(df1.selectExpr("parsed.a"), Seq(Row("a")))
+        checkAnswer(df1.selectExpr("parsed.b"), Seq(Row("b")))
+
+        val df2 = sparkContext.parallelize(Seq("a,b")).toDF("csv")
+          .selectExpr("from_csv(csv, 'a string, b string') as parsed")
+        checkAnswer(df2.selectExpr("parsed.a"), Seq(Row("a")))
+        checkAnswer(df2.selectExpr("parsed.b"), Seq(Row("b")))
+      }
+    }
+  }
+
+  test("SPARK-32968: bad csv input with csv pruning optimization") {
+    Seq("true", "false").foreach { enabled =>
+      withSQLConf(SQLConf.CSV_EXPRESSION_OPTIMIZATION.key -> enabled) {
+        val df = sparkContext.parallelize(Seq("1,\u0001\u0000\u0001234")).toDF("csv")
+          .selectExpr("from_csv(csv, 'a int, b int', map('mode', 'failfast')) as parsed")
+
+        val err1 = intercept[SparkException] {
+          df.selectExpr("parsed.a").collect
+        }
+
+        val err2 = intercept[SparkException] {
+          df.selectExpr("parsed.b").collect
+        }
+
+        assert(err1.getMessage.contains("Malformed records are detected in record parsing"))
+        assert(err2.getMessage.contains("Malformed records are detected in record parsing"))
+      }
+    }
+  }
+
+  test("SPARK-32968: csv pruning optimization with corrupt record field") {
+    Seq("true", "false").foreach { enabled =>
+      withSQLConf(SQLConf.CSV_EXPRESSION_OPTIMIZATION.key -> enabled) {
+        val df = sparkContext.parallelize(Seq("a,b,c,d")).toDF("csv")
+          .selectExpr("from_csv(csv, 'a string, b string, _corrupt_record string') as parsed")
+          .selectExpr("parsed._corrupt_record")
+
+        checkAnswer(df, Seq(Row("a,b,c,d")))
+      }
+    }
+  }
 }
```