[SPARK-33723][SQL] ANSI mode: Casting String to Date should throw exception on parse error
### What changes were proposed in this pull request? Currently, when casting a string to timestamp type in ANSI mode, Spark throws a runtime exception on a parse error. However, casting a string to date returns null whenever the string cannot be parsed. Casting a string to date should throw an exception on a parse error as well. ### Why are the changes needed? To add a missing feature for ANSI mode. ### Does this PR introduce _any_ user-facing change? Yes. In ANSI mode, casting a string to date will throw an exception on a parse error. ### How was this patch tested? Unit tests. Closes #30687 from gengliangwang/castDate. Authored-by: Gengliang Wang <gengliang.wang@databricks.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
parent
8197ee3b15
commit
6e862792fb
|
@ -163,6 +163,7 @@ The behavior of some SQL operators can be different under ANSI mode (`spark.sql.
|
||||||
- `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices.
|
- `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices.
|
||||||
- `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map.
|
- `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map.
|
||||||
- `CAST(string_col AS TIMESTAMP)`: This operator should fail with an exception if the input string can't be parsed.
|
- `CAST(string_col AS TIMESTAMP)`: This operator should fail with an exception if the input string can't be parsed.
|
||||||
|
- `CAST(string_col AS DATE)`: This operator should fail with an exception if the input string can't be parsed.
|
||||||
|
|
||||||
### SQL Keywords
|
### SQL Keywords
|
||||||
|
|
||||||
|
|
|
@ -499,7 +499,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
|
||||||
// DateConverter
|
// DateConverter
|
||||||
private[this] def castToDate(from: DataType): Any => Any = from match {
|
private[this] def castToDate(from: DataType): Any => Any = from match {
|
||||||
case StringType =>
|
case StringType =>
|
||||||
buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s, zoneId).orNull)
|
if (ansiEnabled) {
|
||||||
|
buildCast[UTF8String](_, s => DateTimeUtils.stringToDateAnsi(s, zoneId))
|
||||||
|
} else {
|
||||||
|
buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s, zoneId).orNull)
|
||||||
|
}
|
||||||
case TimestampType =>
|
case TimestampType =>
|
||||||
// throw valid precision more than seconds, according to Hive.
|
// throw valid precision more than seconds, according to Hive.
|
||||||
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.
|
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.
|
||||||
|
@ -1135,15 +1139,22 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
|
||||||
val intOpt = ctx.freshVariable("intOpt", classOf[Option[Integer]])
|
val intOpt = ctx.freshVariable("intOpt", classOf[Option[Integer]])
|
||||||
val zid = getZoneId()
|
val zid = getZoneId()
|
||||||
(c, evPrim, evNull) =>
|
(c, evPrim, evNull) =>
|
||||||
code"""
|
if (ansiEnabled) {
|
||||||
scala.Option<Integer> $intOpt =
|
code"""
|
||||||
org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c, $zid);
|
$evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDateAnsi($c, $zid);
|
||||||
if ($intOpt.isDefined()) {
|
"""
|
||||||
$evPrim = ((Integer) $intOpt.get()).intValue();
|
|
||||||
} else {
|
} else {
|
||||||
$evNull = true;
|
code"""
|
||||||
|
scala.Option<Integer> $intOpt =
|
||||||
|
org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c, $zid);
|
||||||
|
if ($intOpt.isDefined()) {
|
||||||
|
$evPrim = ((Integer) $intOpt.get()).intValue();
|
||||||
|
} else {
|
||||||
|
$evNull = true;
|
||||||
|
}
|
||||||
|
"""
|
||||||
}
|
}
|
||||||
"""
|
|
||||||
case TimestampType =>
|
case TimestampType =>
|
||||||
val zid = getZoneId()
|
val zid = getZoneId()
|
||||||
(c, evPrim, evNull) =>
|
(c, evPrim, evNull) =>
|
||||||
|
|
|
@ -365,11 +365,8 @@ object DateTimeUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
def stringToTimestampAnsi(s: UTF8String, timeZoneId: ZoneId): Long = {
|
def stringToTimestampAnsi(s: UTF8String, timeZoneId: ZoneId): Long = {
|
||||||
val timestamp = stringToTimestamp(s, timeZoneId)
|
stringToTimestamp(s, timeZoneId).getOrElse {
|
||||||
if (timestamp.isEmpty) {
|
|
||||||
throw new DateTimeException(s"Cannot cast $s to TimestampType.")
|
throw new DateTimeException(s"Cannot cast $s to TimestampType.")
|
||||||
} else {
|
|
||||||
timestamp.get
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -466,6 +463,12 @@ object DateTimeUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def stringToDateAnsi(s: UTF8String, zoneId: ZoneId): Int = {
|
||||||
|
stringToDate(s, zoneId).getOrElse {
|
||||||
|
throw new DateTimeException(s"Cannot cast $s to DateType.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Gets the local date-time parts (year, month, day and time) of the instant expressed as the
|
// Gets the local date-time parts (year, month, day and time) of the instant expressed as the
|
||||||
// number of microseconds since the epoch at the given time zone ID.
|
// number of microseconds since the epoch at the given time zone ID.
|
||||||
private def getLocalDateTime(micros: Long, zoneId: ZoneId): LocalDateTime = {
|
private def getLocalDateTime(micros: Long, zoneId: ZoneId): LocalDateTime = {
|
||||||
|
|
|
@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCoercionSuite
|
||||||
import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectList, CollectSet}
|
import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectList, CollectSet}
|
||||||
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
|
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
|
||||||
import org.apache.spark.sql.catalyst.util.DateTimeConstants._
|
import org.apache.spark.sql.catalyst.util.DateTimeConstants._
|
||||||
|
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
|
||||||
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
|
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
|
||||||
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
|
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
|
@ -93,12 +94,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
|
||||||
checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis))
|
checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis))
|
||||||
checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis))
|
checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis))
|
||||||
checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis))
|
checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis))
|
||||||
|
|
||||||
checkEvaluation(Cast(Literal("2015-03-18X"), DateType), null)
|
|
||||||
checkEvaluation(Cast(Literal("2015/03/18"), DateType), null)
|
|
||||||
checkEvaluation(Cast(Literal("2015.03.18"), DateType), null)
|
|
||||||
checkEvaluation(Cast(Literal("20150318"), DateType), null)
|
|
||||||
checkEvaluation(Cast(Literal("2015-031-8"), DateType), null)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
test("cast string to timestamp") {
|
test("cast string to timestamp") {
|
||||||
|
@ -962,7 +957,7 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
|
||||||
|
|
||||||
test("ANSI mode: cast string to timestamp with parse error") {
|
test("ANSI mode: cast string to timestamp with parse error") {
|
||||||
val activeConf = conf
|
val activeConf = conf
|
||||||
new ParVector(ALL_TIMEZONES.toVector).foreach { zid =>
|
DateTimeTestUtils.outstandingZoneIds.foreach { zid =>
|
||||||
def checkCastWithParseError(str: String): Unit = {
|
def checkCastWithParseError(str: String): Unit = {
|
||||||
checkExceptionInExpression[DateTimeException](
|
checkExceptionInExpression[DateTimeException](
|
||||||
cast(Literal(str), TimestampType, Option(zid.getId)),
|
cast(Literal(str), TimestampType, Option(zid.getId)),
|
||||||
|
@ -984,6 +979,30 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("ANSI mode: cast string to date with parse error") {
|
||||||
|
val activeConf = conf
|
||||||
|
DateTimeTestUtils.outstandingZoneIds.foreach { zid =>
|
||||||
|
def checkCastWithParseError(str: String): Unit = {
|
||||||
|
checkExceptionInExpression[DateTimeException](
|
||||||
|
cast(Literal(str), DateType, Option(zid.getId)),
|
||||||
|
s"Cannot cast $str to DateType.")
|
||||||
|
}
|
||||||
|
|
||||||
|
SQLConf.withExistingConf(activeConf) {
|
||||||
|
checkCastWithParseError("12345")
|
||||||
|
checkCastWithParseError("12345-12-18")
|
||||||
|
checkCastWithParseError("2015-13-18")
|
||||||
|
checkCastWithParseError("2015-03-128")
|
||||||
|
checkCastWithParseError("2015/03/18")
|
||||||
|
checkCastWithParseError("2015.03.18")
|
||||||
|
checkCastWithParseError("20150318")
|
||||||
|
checkCastWithParseError("2015-031-8")
|
||||||
|
checkCastWithParseError("2015-03-18ABC")
|
||||||
|
checkCastWithParseError("abdef")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
test("SPARK-26218: Fix the corner case of codegen when casting float to Integer") {
|
test("SPARK-26218: Fix the corner case of codegen when casting float to Integer") {
|
||||||
checkExceptionInExpression[ArithmeticException](
|
checkExceptionInExpression[ArithmeticException](
|
||||||
cast(cast(Literal("2147483648"), FloatType), IntegerType), "overflow")
|
cast(cast(Literal("2147483648"), FloatType), IntegerType), "overflow")
|
||||||
|
@ -1026,6 +1045,14 @@ class CastSuite extends CastSuiteBase {
|
||||||
checkEvaluation(cast(123, DecimalType(2, 0)), null)
|
checkEvaluation(cast(123, DecimalType(2, 0)), null)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("cast string to date #2") {
|
||||||
|
checkEvaluation(Cast(Literal("2015-03-18X"), DateType), null)
|
||||||
|
checkEvaluation(Cast(Literal("2015/03/18"), DateType), null)
|
||||||
|
checkEvaluation(Cast(Literal("2015.03.18"), DateType), null)
|
||||||
|
checkEvaluation(Cast(Literal("20150318"), DateType), null)
|
||||||
|
checkEvaluation(Cast(Literal("2015-031-8"), DateType), null)
|
||||||
|
}
|
||||||
|
|
||||||
test("casting to fixed-precision decimals") {
|
test("casting to fixed-precision decimals") {
|
||||||
assert(cast(123, DecimalType.USER_DEFAULT).nullable === false)
|
assert(cast(123, DecimalType.USER_DEFAULT).nullable === false)
|
||||||
assert(cast(10.03f, DecimalType.SYSTEM_DEFAULT).nullable)
|
assert(cast(10.03f, DecimalType.SYSTEM_DEFAULT).nullable)
|
||||||
|
|
|
@ -161,7 +161,7 @@ select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMM
|
||||||
select from_csv('26/October/2015', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy'));
|
select from_csv('26/October/2015', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy'));
|
||||||
select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy'));
|
select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy'));
|
||||||
|
|
||||||
-- Timestamp type parse error
|
-- Datetime types parse error
|
||||||
select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
|
select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
|
@ -170,4 +170,5 @@ select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
|
select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
|
select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
|
||||||
select cast("Unparseable" as timestamp)
|
select cast("Unparseable" as timestamp);
|
||||||
|
select cast("Unparseable" as date);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
-- Automatically generated by SQLQueryTestSuite
|
-- Automatically generated by SQLQueryTestSuite
|
||||||
-- Number of queries: 122
|
-- Number of queries: 123
|
||||||
|
|
||||||
|
|
||||||
-- !query
|
-- !query
|
||||||
|
@ -1060,3 +1060,12 @@ struct<>
|
||||||
-- !query output
|
-- !query output
|
||||||
java.time.DateTimeException
|
java.time.DateTimeException
|
||||||
Cannot cast Unparseable to TimestampType.
|
Cannot cast Unparseable to TimestampType.
|
||||||
|
|
||||||
|
|
||||||
|
-- !query
|
||||||
|
select cast("Unparseable" as date)
|
||||||
|
-- !query schema
|
||||||
|
struct<>
|
||||||
|
-- !query output
|
||||||
|
java.time.DateTimeException
|
||||||
|
Cannot cast Unparseable to DateType.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
-- Automatically generated by SQLQueryTestSuite
|
-- Automatically generated by SQLQueryTestSuite
|
||||||
-- Number of queries: 122
|
-- Number of queries: 123
|
||||||
|
|
||||||
|
|
||||||
-- !query
|
-- !query
|
||||||
|
@ -1013,3 +1013,11 @@ select cast("Unparseable" as timestamp)
|
||||||
struct<CAST(Unparseable AS TIMESTAMP):timestamp>
|
struct<CAST(Unparseable AS TIMESTAMP):timestamp>
|
||||||
-- !query output
|
-- !query output
|
||||||
NULL
|
NULL
|
||||||
|
|
||||||
|
|
||||||
|
-- !query
|
||||||
|
select cast("Unparseable" as date)
|
||||||
|
-- !query schema
|
||||||
|
struct<CAST(Unparseable AS DATE):date>
|
||||||
|
-- !query output
|
||||||
|
NULL
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
-- Automatically generated by SQLQueryTestSuite
|
-- Automatically generated by SQLQueryTestSuite
|
||||||
-- Number of queries: 122
|
-- Number of queries: 123
|
||||||
|
|
||||||
|
|
||||||
-- !query
|
-- !query
|
||||||
|
@ -1021,3 +1021,11 @@ select cast("Unparseable" as timestamp)
|
||||||
struct<CAST(Unparseable AS TIMESTAMP):timestamp>
|
struct<CAST(Unparseable AS TIMESTAMP):timestamp>
|
||||||
-- !query output
|
-- !query output
|
||||||
NULL
|
NULL
|
||||||
|
|
||||||
|
|
||||||
|
-- !query
|
||||||
|
select cast("Unparseable" as date)
|
||||||
|
-- !query schema
|
||||||
|
struct<CAST(Unparseable AS DATE):date>
|
||||||
|
-- !query output
|
||||||
|
NULL
|
||||||
|
|
Loading…
Reference in a new issue