From ed0e351f05ac6edc132c3a630206b2031c419e1c Mon Sep 17 00:00:00 2001 From: Linhong Liu Date: Thu, 29 Jul 2021 09:16:46 +0800 Subject: [PATCH] [SPARK-36286][SQL] Block some invalid datetime string ### What changes were proposed in this pull request? In PR #32959, we found some weird datetime strings that can be parsed. ([details](https://github.com/apache/spark/pull/32959#discussion_r665015489)) This PR blocks the invalid datetime string. ### Why are the changes needed? bug fix ### Does this PR introduce _any_ user-facing change? Yes, below strings will have different results when cast to datetime. ```sql select cast('12::' as timestamp); -- Before: 2021-07-07 12:00:00, After: NULL select cast('T' as timestamp); -- Before: 2021-07-07 00:00:00, After: NULL ``` ### How was this patch tested? some new test cases Closes #33490 from linhongliu-db/SPARK-35780-block-invalid-format. Authored-by: Linhong Liu Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 7 +++++-- .../apache/spark/sql/catalyst/expressions/CastSuite.scala | 4 ++++ .../spark/sql/catalyst/expressions/CastSuiteBase.scala | 1 - .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 8 +++++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 0825a115e3..36d2b9b16b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -254,7 +254,9 @@ object DateTimeUtils { val maxDigitsYear = 6 // For the nanosecond part, more than 6 digits is allowed, but will be truncated. 
segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || - (segment != 0 && segment != 6 && digits <= 2) + // For the zoneId segment(7), it could be zero digits when it's a region-based zone ID + (segment == 7 && digits <= 2) || + (segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2) } if (s == null || s.trimAll().numBytes() == 0) { return (Array.empty, None, false) } @@ -527,7 +529,8 @@ object DateTimeUtils { def isValidDigits(segment: Int, digits: Int): Boolean = { // An integer is able to represent a date within [+-]5 million years. var maxDigitsYear = 7 - (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2) + (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || + (segment != 0 && digits > 0 && digits <= 2) } if (s == null || s.trimAll().numBytes() == 0) { return None diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 26270e6d30..4e247f5bd7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -576,4 +576,8 @@ class CastSuite extends CastSuiteBase { checkEvaluation(cast(invalidInput, TimestampNTZType), null) } } + + test("SPARK-36286: invalid string cast to timestamp") { + checkEvaluation(cast(Literal("2015-03-18T"), TimestampType), null) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala index dcdc6f9c4d..f01fea8c77 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala @@ -150,7 +150,6 @@ abstract class 
CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { c.set(Calendar.MILLISECOND, 0) checkCastStringToTimestamp("2015-03-18", new Timestamp(c.getTimeInMillis)) checkCastStringToTimestamp("2015-03-18 ", new Timestamp(c.getTimeInMillis)) - checkCastStringToTimestamp("2015-03-18T", new Timestamp(c.getTimeInMillis)) c = Calendar.getInstance(tz) c.set(2015, 2, 18, 12, 3, 17) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 2b7b94175b..9e61cb978a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -147,6 +147,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { assert(toDate("1999 08 01").isEmpty) assert(toDate("1999-08 01").isEmpty) assert(toDate("1999 08").isEmpty) + assert(toDate("1999-08-").isEmpty) assert(toDate("").isEmpty) assert(toDate(" ").isEmpty) } @@ -182,7 +183,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { checkStringToTimestamp("1969-12-31 16:00:00", Option(date(1969, 12, 31, 16, zid = zid))) checkStringToTimestamp("0001", Option(date(1, 1, 1, 0, zid = zid))) checkStringToTimestamp("2015-03", Option(date(2015, 3, 1, zid = zid))) - Seq("2015-03-18", "2015-03-18 ", " 2015-03-18", " 2015-03-18 ", "2015-03-18T").foreach { s => + Seq("2015-03-18", "2015-03-18 ", " 2015-03-18", " 2015-03-18 ").foreach { s => checkStringToTimestamp(s, Option(date(2015, 3, 18, zid = zid))) } @@ -289,6 +290,11 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { checkStringToTimestamp("", None) checkStringToTimestamp(" ", None) checkStringToTimestamp("+", None) + checkStringToTimestamp("T", None) + checkStringToTimestamp("2015-03-18T", None) + checkStringToTimestamp("12::", None) + 
checkStringToTimestamp("2015-03-18T12:03:17-8:", None) + checkStringToTimestamp("2015-03-18T12:03:17-8:30:", None) // Truncating the fractional seconds expected = Option(date(2015, 3, 18, 12, 3, 17, 123456, zid = UTC))