From 034070a23aa8bcecc351bb2fec413e1662dcbb75 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 4 Nov 2020 12:30:38 +0800 Subject: [PATCH] Revert "[SPARK-33248][SQL] Add a configuration to control the legacy behavior of whether need to pad null value when value size less then schema size" This reverts commit 0c943cd2fbc6f2d25588991613abf469ace0153e. --- docs/sql-migration-guide.md | 2 -- .../org/apache/spark/sql/internal/SQLConf.scala | 15 --------------- .../execution/BaseScriptTransformationExec.scala | 10 ++-------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 319e72172d..fdc764a934 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -51,8 +51,6 @@ license: | - In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z, and loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might lead to shifting of the input timestamps due to rebasing from/to Julian to/from Proleptic Gregorian calendar. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` or/and `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`. - In Spark 3.1, the `schema_of_json` and `schema_of_csv` functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case. - - - In Spark 3.1, when `spark.sql.legacy.transformationPadNullWhenValueLessThenSchema` is true, Spark will pad NULL value when script transformation's output value size less then schema size in default-serde mode(script transformation with row format of `ROW FORMAT DELIMITED`). If false, Spark will keep original behavior to throw `ArrayIndexOutOfBoundsException`. ## Upgrading from Spark SQL 3.0 to 3.0.1 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 8825f4f963..21357a492e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2765,18 +2765,6 @@ object SQLConf { .checkValue(_ > 0, "The timeout value must be positive") .createWithDefault(10L) - val LEGACY_SCRIPT_TRANSFORM_PAD_NULL = - buildConf("spark.sql.legacy.transformationPadNullWhenValueLessThenSchema") - .internal() - .doc("Whether pad null value when transformation output's value size less then " + - "schema size in default-serde mode(script transformation with row format of " + - "`ROW FORMAT DELIMITED`)." + - "When true, Spark will pad NULL value to keep same behavior with hive." + - "When false, Spark keep original behavior to throw `ArrayIndexOutOfBoundsException`") - .version("3.1.0") - .booleanConf - .createWithDefault(true) - val LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP = buildConf("spark.sql.legacy.allowCastNumericToTimestamp") .internal() @@ -3505,9 +3493,6 @@ class SQLConf extends Serializable with Logging { def legacyAllowModifyActiveSession: Boolean = getConf(StaticSQLConf.LEGACY_ALLOW_MODIFY_ACTIVE_SESSION) - def legacyPadNullWhenValueLessThenSchema: Boolean = - getConf(SQLConf.LEGACY_SCRIPT_TRANSFORM_PAD_NULL) - def legacyAllowCastNumericToTimestamp: Boolean = getConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala index f2cddc7ba7..74e5aa716a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala @@ -104,16 +104,10 @@ trait BaseScriptTransformationExec extends UnaryExecNode { val reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)) val outputRowFormat = ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD") - - val padNull = if (conf.legacyPadNullWhenValueLessThenSchema) { - (arr: Array[String], size: Int) => arr.padTo(size, null) - } else { - (arr: Array[String], size: Int) => arr - } val processRowWithoutSerde = if (!ioschema.schemaLess) { prevLine: String => new GenericInternalRow( - padNull(prevLine.split(outputRowFormat), outputFieldWriters.size) + prevLine.split(outputRowFormat).padTo(outputFieldWriters.size, null) .zip(outputFieldWriters) .map { case (data, writer) => writer(data) }) } else { @@ -124,7 +118,7 @@ trait BaseScriptTransformationExec extends UnaryExecNode { val kvWriter = CatalystTypeConverters.createToCatalystConverter(StringType) prevLine: String => new GenericInternalRow( - padNull(prevLine.split(outputRowFormat).slice(0, 2), 2) + prevLine.split(outputRowFormat).slice(0, 2).padTo(2, null) .map(kvWriter)) }