[SPARK-33723][SQL] ANSI mode: Casting String to Date should throw exception on parse error

### What changes were proposed in this pull request?

Currently, when casting a string as timestamp type in ANSI mode, Spark throws a runtime exception on parsing error.
However, when casting a string to date fails to parse, the result is silently null. We should throw an exception on parsing error as well, for consistency.

### Why are the changes needed?

Add missing feature for ANSI mode

### Does this PR introduce _any_ user-facing change?

Yes. In ANSI mode, casting a string to date will throw an exception on a parsing error.

### How was this patch tested?

Unit test

Closes #30687 from gengliangwang/castDate.

Authored-by: Gengliang Wang <gengliang.wang@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
Gengliang Wang 2020-12-14 10:22:37 +09:00 committed by HyukjinKwon
parent 8197ee3b15
commit 6e862792fb
8 changed files with 92 additions and 24 deletions

View file

@ -163,6 +163,7 @@ The behavior of some SQL operators can be different under ANSI mode (`spark.sql.
- `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices.
- `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map.
- `CAST(string_col AS TIMESTAMP)`: This operator should fail with an exception if the input string can't be parsed.
- `CAST(string_col AS DATE)`: This operator should fail with an exception if the input string can't be parsed.
### SQL Keywords

View file

@ -499,7 +499,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
// DateConverter
private[this] def castToDate(from: DataType): Any => Any = from match {
case StringType =>
if (ansiEnabled) {
buildCast[UTF8String](_, s => DateTimeUtils.stringToDateAnsi(s, zoneId))
} else {
buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s, zoneId).orNull)
}
case TimestampType =>
// throw valid precision more than seconds, according to Hive.
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.
@ -1135,6 +1139,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
val intOpt = ctx.freshVariable("intOpt", classOf[Option[Integer]])
val zid = getZoneId()
(c, evPrim, evNull) =>
if (ansiEnabled) {
code"""
$evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDateAnsi($c, $zid);
"""
} else {
code"""
scala.Option<Integer> $intOpt =
org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c, $zid);
@ -1144,6 +1153,8 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
$evNull = true;
}
"""
}
case TimestampType =>
val zid = getZoneId()
(c, evPrim, evNull) =>

View file

@ -365,11 +365,8 @@ object DateTimeUtils {
}
def stringToTimestampAnsi(s: UTF8String, timeZoneId: ZoneId): Long = {
val timestamp = stringToTimestamp(s, timeZoneId)
if (timestamp.isEmpty) {
stringToTimestamp(s, timeZoneId).getOrElse {
throw new DateTimeException(s"Cannot cast $s to TimestampType.")
} else {
timestamp.get
}
}
@ -466,6 +463,12 @@ object DateTimeUtils {
}
}
/**
 * Parses a string into a number of days since the epoch, for ANSI-mode casts.
 *
 * Unlike the lenient `stringToDate` (which callers use with `.orNull`), this
 * variant fails loudly: an unparseable input raises a `DateTimeException`,
 * mirroring the behavior of `stringToTimestampAnsi` for TimestampType.
 */
def stringToDateAnsi(s: UTF8String, zoneId: ZoneId): Int = {
  stringToDate(s, zoneId) match {
    case Some(days) => days
    case None => throw new DateTimeException(s"Cannot cast $s to DateType.")
  }
}
// Gets the local date-time parts (year, month, day and time) of the instant expressed as the
// number of microseconds since the epoch at the given time zone ID.
private def getLocalDateTime(micros: Long, zoneId: ZoneId): LocalDateTime = {

View file

@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCoercionSuite
import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectList, CollectSet}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.catalyst.util.DateTimeConstants._
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
@ -93,12 +94,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis))
checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis))
checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis))
checkEvaluation(Cast(Literal("2015-03-18X"), DateType), null)
checkEvaluation(Cast(Literal("2015/03/18"), DateType), null)
checkEvaluation(Cast(Literal("2015.03.18"), DateType), null)
checkEvaluation(Cast(Literal("20150318"), DateType), null)
checkEvaluation(Cast(Literal("2015-031-8"), DateType), null)
}
test("cast string to timestamp") {
@ -962,7 +957,7 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
test("ANSI mode: cast string to timestamp with parse error") {
val activeConf = conf
new ParVector(ALL_TIMEZONES.toVector).foreach { zid =>
DateTimeTestUtils.outstandingZoneIds.foreach { zid =>
def checkCastWithParseError(str: String): Unit = {
checkExceptionInExpression[DateTimeException](
cast(Literal(str), TimestampType, Option(zid.getId)),
@ -984,6 +979,30 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
}
}
// Verifies that in ANSI mode, casting an unparseable string to DateType raises
// a DateTimeException (with the expected message) under every outstanding zone.
test("ANSI mode: cast string to date with parse error") {
  val activeConf = conf
  // Inputs covering malformed years/months/days, wrong separators, and garbage.
  val invalidDates = Seq(
    "12345",
    "12345-12-18",
    "2015-13-18",
    "2015-03-128",
    "2015/03/18",
    "2015.03.18",
    "20150318",
    "2015-031-8",
    "2015-03-18ABC",
    "abdef")
  DateTimeTestUtils.outstandingZoneIds.foreach { zid =>
    SQLConf.withExistingConf(activeConf) {
      invalidDates.foreach { str =>
        checkExceptionInExpression[DateTimeException](
          cast(Literal(str), DateType, Option(zid.getId)),
          s"Cannot cast $str to DateType.")
      }
    }
  }
}
test("SPARK-26218: Fix the corner case of codegen when casting float to Integer") {
checkExceptionInExpression[ArithmeticException](
cast(cast(Literal("2147483648"), FloatType), IntegerType), "overflow")
@ -1026,6 +1045,14 @@ class CastSuite extends CastSuiteBase {
checkEvaluation(cast(123, DecimalType(2, 0)), null)
}
// Strings that match no supported date pattern must evaluate to null under the
// default (non-ANSI) cast behavior; the ANSI counterpart throws instead.
test("cast string to date #2") {
  Seq("2015-03-18X", "2015/03/18", "2015.03.18", "20150318", "2015-031-8")
    .foreach { s =>
      checkEvaluation(Cast(Literal(s), DateType), null)
    }
}
test("casting to fixed-precision decimals") {
assert(cast(123, DecimalType.USER_DEFAULT).nullable === false)
assert(cast(10.03f, DecimalType.SYSTEM_DEFAULT).nullable)

View file

@ -161,7 +161,7 @@ select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMM
select from_csv('26/October/2015', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy'));
select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy'));
-- Timestamp type parse error
-- Datetime types parse error
select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
@ -170,4 +170,5 @@ select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select cast("Unparseable" as timestamp)
select cast("Unparseable" as timestamp);
select cast("Unparseable" as date);

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 122
-- Number of queries: 123
-- !query
@ -1060,3 +1060,12 @@ struct<>
-- !query output
java.time.DateTimeException
Cannot cast Unparseable to TimestampType.
-- !query
select cast("Unparseable" as date)
-- !query schema
struct<>
-- !query output
java.time.DateTimeException
Cannot cast Unparseable to DateType.

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 122
-- Number of queries: 123
-- !query
@ -1013,3 +1013,11 @@ select cast("Unparseable" as timestamp)
struct<CAST(Unparseable AS TIMESTAMP):timestamp>
-- !query output
NULL
-- !query
select cast("Unparseable" as date)
-- !query schema
struct<CAST(Unparseable AS DATE):date>
-- !query output
NULL

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 122
-- Number of queries: 123
-- !query
@ -1021,3 +1021,11 @@ select cast("Unparseable" as timestamp)
struct<CAST(Unparseable AS TIMESTAMP):timestamp>
-- !query output
NULL
-- !query
select cast("Unparseable" as date)
-- !query schema
struct<CAST(Unparseable AS DATE):date>
-- !query output
NULL