From 6e862792fbc6c0916ad04f1c23dc4acbc5f5a53b Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Mon, 14 Dec 2020 10:22:37 +0900 Subject: [PATCH] [SPARK-33723][SQL] ANSI mode: Casting String to Date should throw exception on parse error ### What changes were proposed in this pull request? Currently, when casting a string as timestamp type in ANSI mode, Spark throws a runtime exception on parsing error. However, casting a string to date returns null whenever the string cannot be parsed. We should throw an exception on parsing error as well. ### Why are the changes needed? This adds missing ANSI mode behavior: casting a string to date should fail on invalid input, consistent with casting a string to timestamp. ### Does this PR introduce _any_ user-facing change? Yes. In ANSI mode, casting a string to date now throws an exception on a parsing error. ### How was this patch tested? Unit tests. Closes #30687 from gengliangwang/castDate. Authored-by: Gengliang Wang Signed-off-by: HyukjinKwon --- docs/sql-ref-ansi-compliance.md | 1 + .../spark/sql/catalyst/expressions/Cast.scala | 27 ++++++++---- .../sql/catalyst/util/DateTimeUtils.scala | 11 +++-- .../sql/catalyst/expressions/CastSuite.scala | 41 +++++++++++++++---- .../resources/sql-tests/inputs/datetime.sql | 5 ++- .../sql-tests/results/ansi/datetime.sql.out | 11 ++++- .../sql-tests/results/datetime-legacy.sql.out | 10 ++++- .../sql-tests/results/datetime.sql.out | 10 ++++- 8 files changed, 92 insertions(+), 24 deletions(-) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 08ba07aa8d..8201fd7072 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -163,6 +163,7 @@ The behavior of some SQL operators can be different under ANSI mode (`spark.sql. - `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices. - `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map. - `CAST(string_col AS TIMESTAMP)`: This operator should fail with an exception if the input string can't be parsed. 
+ - `CAST(string_col AS DATE)`: This operator should fail with an exception if the input string can't be parsed. ### SQL Keywords diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 72bd9ca4d3..e1ece732cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -499,7 +499,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit // DateConverter private[this] def castToDate(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s, zoneId).orNull) + if (ansiEnabled) { + buildCast[UTF8String](_, s => DateTimeUtils.stringToDateAnsi(s, zoneId)) + } else { + buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s, zoneId).orNull) + } case TimestampType => // throw valid precision more than seconds, according to Hive. // Timestamp.nanos is in 0 to 999,999,999, no more than a second. 
@@ -1135,15 +1139,22 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit val intOpt = ctx.freshVariable("intOpt", classOf[Option[Integer]]) val zid = getZoneId() (c, evPrim, evNull) => - code""" - scala.Option<Integer> $intOpt = - org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c, $zid); - if ($intOpt.isDefined()) { - $evPrim = ((Integer) $intOpt.get()).intValue(); + if (ansiEnabled) { + code""" + $evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDateAnsi($c, $zid); + """ } else { - $evNull = true; + code""" + scala.Option<Integer> $intOpt = + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c, $zid); + if ($intOpt.isDefined()) { + $evPrim = ((Integer) $intOpt.get()).intValue(); + } else { + $evNull = true; + } + """ } - """ + case TimestampType => val zid = getZoneId() (c, evPrim, evNull) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 0543ef99f8..780d2bad1b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -365,11 +365,8 @@ object DateTimeUtils { } def stringToTimestampAnsi(s: UTF8String, timeZoneId: ZoneId): Long = { - val timestamp = stringToTimestamp(s, timeZoneId) - if (timestamp.isEmpty) { + stringToTimestamp(s, timeZoneId).getOrElse { throw new DateTimeException(s"Cannot cast $s to TimestampType.") - } else { - timestamp.get } } @@ -466,6 +463,12 @@ object DateTimeUtils { } } + def stringToDateAnsi(s: UTF8String, zoneId: ZoneId): Int = { + stringToDate(s, zoneId).getOrElse { + throw new DateTimeException(s"Cannot cast $s to DateType.") + } + } + // Gets the local date-time parts (year, month, day and time) of the instant expressed as the // number of microseconds since the epoch at the given time zone ID. 
private def getLocalDateTime(micros: Long, zoneId: ZoneId): LocalDateTime = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index e46599dc19..c4dd5c4124 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCoercionSuite import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectList, CollectSet} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.util.DateTimeConstants._ +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.internal.SQLConf @@ -93,12 +94,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis)) checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis)) checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis)) - - checkEvaluation(Cast(Literal("2015-03-18X"), DateType), null) - checkEvaluation(Cast(Literal("2015/03/18"), DateType), null) - checkEvaluation(Cast(Literal("2015.03.18"), DateType), null) - checkEvaluation(Cast(Literal("20150318"), DateType), null) - checkEvaluation(Cast(Literal("2015-031-8"), DateType), null) } test("cast string to timestamp") { @@ -962,7 +957,7 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { test("ANSI mode: cast string to timestamp with parse error") { val activeConf = conf - new ParVector(ALL_TIMEZONES.toVector).foreach { zid => + DateTimeTestUtils.outstandingZoneIds.foreach { 
zid => def checkCastWithParseError(str: String): Unit = { checkExceptionInExpression[DateTimeException]( cast(Literal(str), TimestampType, Option(zid.getId)), @@ -984,6 +979,30 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { } } + test("ANSI mode: cast string to date with parse error") { + val activeConf = conf + DateTimeTestUtils.outstandingZoneIds.foreach { zid => + def checkCastWithParseError(str: String): Unit = { + checkExceptionInExpression[DateTimeException]( + cast(Literal(str), DateType, Option(zid.getId)), + s"Cannot cast $str to DateType.") + } + + SQLConf.withExistingConf(activeConf) { + checkCastWithParseError("12345") + checkCastWithParseError("12345-12-18") + checkCastWithParseError("2015-13-18") + checkCastWithParseError("2015-03-128") + checkCastWithParseError("2015/03/18") + checkCastWithParseError("2015.03.18") + checkCastWithParseError("20150318") + checkCastWithParseError("2015-031-8") + checkCastWithParseError("2015-03-18ABC") + checkCastWithParseError("abdef") + } + } + } + test("SPARK-26218: Fix the corner case of codegen when casting float to Integer") { checkExceptionInExpression[ArithmeticException]( cast(cast(Literal("2147483648"), FloatType), IntegerType), "overflow") @@ -1026,6 +1045,14 @@ class CastSuite extends CastSuiteBase { checkEvaluation(cast(123, DecimalType(2, 0)), null) } + test("cast string to date #2") { + checkEvaluation(Cast(Literal("2015-03-18X"), DateType), null) + checkEvaluation(Cast(Literal("2015/03/18"), DateType), null) + checkEvaluation(Cast(Literal("2015.03.18"), DateType), null) + checkEvaluation(Cast(Literal("20150318"), DateType), null) + checkEvaluation(Cast(Literal("2015-031-8"), DateType), null) + } + test("casting to fixed-precision decimals") { assert(cast(123, DecimalType.USER_DEFAULT).nullable === false) assert(cast(10.03f, DecimalType.SYSTEM_DEFAULT).nullable) diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql 
index e35266a85d..acfd1f50e1 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -161,7 +161,7 @@ select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMM select from_csv('26/October/2015', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')); select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')); --- Timestamp type parse error +-- Datetime types parse error select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); @@ -170,4 +170,5 @@ select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); -select cast("Unparseable" as timestamp) +select cast("Unparseable" as timestamp); +select cast("Unparseable" as date); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out index 18a751f573..400c8d6c3c 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 122 +-- Number of queries: 123 -- !query @@ -1060,3 +1060,12 @@ struct<> -- !query output java.time.DateTimeException Cannot cast Unparseable to TimestampType. + + +-- !query +select cast("Unparseable" as date) +-- !query schema +struct<> +-- !query output +java.time.DateTimeException +Cannot cast Unparseable to DateType. 
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out index be75f6fb99..7e4ea78bf4 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 122 +-- Number of queries: 123 -- !query @@ -1013,3 +1013,11 @@ select cast("Unparseable" as timestamp) struct<CAST(Unparseable AS TIMESTAMP):timestamp> -- !query output NULL + + +-- !query +select cast("Unparseable" as date) +-- !query schema +struct<CAST(Unparseable AS DATE):date> +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 1e963ed16f..01db4c1c11 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 122 +-- Number of queries: 123 -- !query @@ -1021,3 +1021,11 @@ select cast("Unparseable" as timestamp) struct<CAST(Unparseable AS TIMESTAMP):timestamp> -- !query output NULL + + +-- !query +select cast("Unparseable" as date) +-- !query schema +struct<CAST(Unparseable AS DATE):date> +-- !query output +NULL