[SPARK-36286][SQL] Block some invalid datetime string

### What changes were proposed in this pull request?
In PR #32959, we found some weird datetime strings that can be parsed. ([details](https://github.com/apache/spark/pull/32959#discussion_r665015489))
This PR blocks those invalid datetime strings so that they are no longer parsed successfully.

### Why are the changes needed?
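Bug fix: malformed strings such as `12::` and `T` were parsed as valid timestamps instead of being rejected.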

### Does this PR introduce _any_ user-facing change?
Yes, the strings below now produce different results when cast to a timestamp.
```sql
select cast('12::' as timestamp); -- Before: 2021-07-07 12:00:00, After: NULL
select cast('T' as timestamp); -- Before: 2021-07-07 00:00:00, After: NULL
```

### How was this patch tested?
Added new test cases to `CastSuite` and `DateTimeUtilsSuite`.

Closes #33490 from linhongliu-db/SPARK-35780-block-invalid-format.

Authored-by: Linhong Liu <linhong.liu@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
Linhong Liu 2021-07-29 09:16:46 +08:00 committed by Wenchen Fan
parent 9c5cb99d6e
commit ed0e351f05
4 changed files with 16 additions and 4 deletions

View file

@ -254,7 +254,9 @@ object DateTimeUtils {
val maxDigitsYear = 6
// For the nanosecond part, more than 6 digits is allowed, but will be truncated.
segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
(segment != 0 && segment != 6 && digits <= 2)
// For the zoneId segment (7), there can be zero digits when it is a region-based zone ID.
(segment == 7 && digits <= 2) ||
(segment != 0 && segment != 6 && segment != 7 && digits > 0 && digits <= 2)
}
if (s == null || s.trimAll().numBytes() == 0) {
return (Array.empty, None, false)
@ -527,7 +529,8 @@ object DateTimeUtils {
/**
 * Reports whether `digits` is an acceptable digit count for the given `segment`.
 *
 * @param segment index of the segment being checked; segment 0 is held to the
 *                `maxDigitsYear` bound, all other segments to a two-digit bound
 * @param digits  number of digits counted for that segment
 * @return true when the digit count is allowed for that segment
 */
def isValidDigits(segment: Int, digits: Int): Boolean = {
// An integer is able to represent a date within [+-]5 million years.
var maxDigitsYear = 7
// NOTE(review): this view lists the diff without +/- markers. The next line is the
// pre-change condition, which accepted zero digits for any non-zero segment.
(segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2)
// Replacement condition: segment 0 needs 4 to maxDigitsYear digits, and every other
// segment needs 1 or 2 digits, so an empty segment is rejected.
(segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
(segment != 0 && digits > 0 && digits <= 2)
}
if (s == null || s.trimAll().numBytes() == 0) {
return None

View file

@ -576,4 +576,8 @@ class CastSuite extends CastSuiteBase {
checkEvaluation(cast(invalidInput, TimestampNTZType), null)
}
}
// A date string that ends in a trailing "T" with nothing after it must not be
// accepted as a timestamp: the cast is expected to evaluate to null.
test("SPARK-36286: invalid string cast to timestamp") {
checkEvaluation(cast(Literal("2015-03-18T"), TimestampType), null)
}
}

View file

@ -150,7 +150,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
c.set(Calendar.MILLISECOND, 0)
checkCastStringToTimestamp("2015-03-18", new Timestamp(c.getTimeInMillis))
checkCastStringToTimestamp("2015-03-18 ", new Timestamp(c.getTimeInMillis))
checkCastStringToTimestamp("2015-03-18T", new Timestamp(c.getTimeInMillis))
c = Calendar.getInstance(tz)
c.set(2015, 2, 18, 12, 3, 17)

View file

@ -147,6 +147,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
assert(toDate("1999 08 01").isEmpty)
assert(toDate("1999-08 01").isEmpty)
assert(toDate("1999 08").isEmpty)
assert(toDate("1999-08-").isEmpty)
assert(toDate("").isEmpty)
assert(toDate(" ").isEmpty)
}
@ -182,7 +183,7 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
checkStringToTimestamp("1969-12-31 16:00:00", Option(date(1969, 12, 31, 16, zid = zid)))
checkStringToTimestamp("0001", Option(date(1, 1, 1, 0, zid = zid)))
checkStringToTimestamp("2015-03", Option(date(2015, 3, 1, zid = zid)))
Seq("2015-03-18", "2015-03-18 ", " 2015-03-18", " 2015-03-18 ", "2015-03-18T").foreach { s =>
Seq("2015-03-18", "2015-03-18 ", " 2015-03-18", " 2015-03-18 ").foreach { s =>
checkStringToTimestamp(s, Option(date(2015, 3, 18, zid = zid)))
}
@ -289,6 +290,11 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
checkStringToTimestamp("", None)
checkStringToTimestamp(" ", None)
checkStringToTimestamp("+", None)
checkStringToTimestamp("T", None)
checkStringToTimestamp("2015-03-18T", None)
checkStringToTimestamp("12::", None)
checkStringToTimestamp("2015-03-18T12:03:17-8:", None)
checkStringToTimestamp("2015-03-18T12:03:17-8:30:", None)
// Truncating the fractional seconds
expected = Option(date(2015, 3, 18, 12, 3, 17, 123456, zid = UTC))