[SPARK-34614][SQL] ANSI mode: Casting String to Boolean should throw exception on parse error

### What changes were proposed in this pull request?

In ANSI mode, casting a String to Boolean should throw an exception on a parse error instead of returning null.

### Why are the changes needed?

This change improves ANSI compliance.

### Does this PR introduce _any_ user-facing change?

Yes. In ANSI mode, casting a String value to Boolean now throws an exception when the string cannot be parsed, instead of returning null.

### How was this patch tested?

Unit tests.

Closes #31734 from gengliangwang/ansiCastToBoolean.

Authored-by: Gengliang Wang <gengliang.wang@databricks.com>
Signed-off-by: Gengliang Wang <gengliang.wang@databricks.com>
This commit is contained in:
Gengliang Wang 2021-03-04 19:04:16 +08:00
parent 53e4dba7c4
commit 2b1c170016
4 changed files with 264 additions and 80 deletions

View file

@ -165,6 +165,7 @@ The behavior of some SQL operators can be different under ANSI mode (`spark.sql.
- `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map.
- `CAST(string_col AS TIMESTAMP)`: This operator should fail with an exception if the input string can't be parsed.
- `CAST(string_col AS DATE)`: This operator should fail with an exception if the input string can't be parsed.
- `CAST(string_col AS BOOLEAN)`: This operator should fail with an exception if the input string can't be parsed.
### SQL Keywords

View file

@ -426,9 +426,13 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
true
} else if (StringUtils.isFalseString(s)) {
false
} else {
if (ansiEnabled) {
throw new UnsupportedOperationException(s"invalid input syntax for type boolean: $s")
} else {
null
}
}
})
case TimestampType =>
buildCast[Long](_, t => t != 0)
@ -1349,13 +1353,19 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
case StringType =>
val stringUtils = inline"${StringUtils.getClass.getName.stripSuffix("$")}"
(c, evPrim, evNull) =>
val castFailureCode = if (ansiEnabled) {
val errorMessage = s""""invalid input syntax for type boolean: " + $c"""
s"throw new java.lang.UnsupportedOperationException($errorMessage);"
} else {
s"$evNull = true;"
}
code"""
if ($stringUtils.isTrueString($c)) {
$evPrim = true;
} else if ($stringUtils.isFalseString($c)) {
$evPrim = false;
} else {
$evNull = true;
$castFailureCode
}
"""
case TimestampType =>

View file

@ -351,12 +351,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
checkNullCast(ArrayType(StringType), ArrayType(IntegerType))
{
val ret = cast(array, ArrayType(BooleanType, containsNull = true))
assert(ret.resolved)
checkEvaluation(ret, Seq(null, true, false, null))
}
{
val array = Literal.create(Seq.empty, ArrayType(NullType, containsNull = false))
val ret = cast(array, ArrayType(IntegerType, containsNull = false))
@ -369,11 +363,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
assert(ret.resolved === false)
}
{
val ret = cast(array_notNull, ArrayType(BooleanType, containsNull = true))
assert(ret.resolved)
checkEvaluation(ret, Seq(null, true, false))
}
{
val ret = cast(array_notNull, ArrayType(BooleanType, containsNull = false))
assert(ret.resolved === false)
@ -395,11 +384,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
checkNullCast(MapType(StringType, IntegerType), MapType(StringType, StringType))
{
val ret = cast(map, MapType(StringType, BooleanType, valueContainsNull = true))
assert(ret.resolved)
checkEvaluation(ret, Map("a" -> null, "b" -> true, "c" -> false, "d" -> null))
}
{
val ret = cast(map, MapType(StringType, BooleanType, valueContainsNull = false))
assert(ret.resolved === false)
@ -408,11 +392,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
val ret = cast(map, MapType(IntegerType, StringType, valueContainsNull = true))
assert(ret.resolved === false)
}
{
val ret = cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = true))
assert(ret.resolved)
checkEvaluation(ret, Map("a" -> null, "b" -> true, "c" -> false))
}
{
val ret = cast(map_notNull, MapType(StringType, BooleanType, valueContainsNull = false))
assert(ret.resolved === false)
@ -458,15 +437,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
StructField("b", StringType, nullable = false),
StructField("c", StringType, nullable = false))))
{
val ret = cast(struct, StructType(Seq(
StructField("a", BooleanType, nullable = true),
StructField("b", BooleanType, nullable = true),
StructField("c", BooleanType, nullable = true),
StructField("d", BooleanType, nullable = true))))
assert(ret.resolved)
checkEvaluation(ret, InternalRow(null, true, false, null))
}
{
val ret = cast(struct, StructType(Seq(
StructField("a", BooleanType, nullable = true),
@ -476,14 +446,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
assert(ret.resolved === false)
}
{
val ret = cast(struct_notNull, StructType(Seq(
StructField("a", BooleanType, nullable = true),
StructField("b", BooleanType, nullable = true),
StructField("c", BooleanType, nullable = true))))
assert(ret.resolved)
checkEvaluation(ret, InternalRow(null, true, false))
}
{
val ret = cast(struct_notNull, StructType(Seq(
StructField("a", BooleanType, nullable = true),
@ -571,9 +533,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
checkCast("n", false)
checkCast("no", false)
checkCast("0", false)
checkEvaluation(cast("abc", BooleanType), null)
checkEvaluation(cast("", BooleanType), null)
}
protected def checkInvalidCastFromNumericType(to: DataType): Unit = {
@ -955,6 +914,114 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
"invalid input syntax for type numeric")
}
/**
 * Asserts that evaluating `cast(l, to)` raises an [[UnsupportedOperationException]],
 * passing "invalid input syntax for type boolean" as the expected error message.
 *
 * @param l  the literal to cast
 * @param to the target data type of the cast
 */
protected def checkCastToBooleanError(l: Literal, to: DataType): Unit = {
  // The expected message has nothing to interpolate, so a plain string literal is used.
  checkExceptionInExpression[UnsupportedOperationException](
    cast(l, to), "invalid input syntax for type boolean")
}
// Strings that are not boolean literals, including the empty string, must raise the
// boolean parse error rather than evaluate to a value.
test("ANSI mode: cast string to boolean with parse error") {
  Seq("abc", "").foreach { unparsable =>
    checkCastToBooleanError(Literal(unparsable), BooleanType)
  }
}
// Casting a string array to a boolean array still resolves, but evaluating it must fail
// with the boolean parse error, for both the nullable and the non-nullable input array.
test("cast from array II") {
  val to: DataType = ArrayType(BooleanType, containsNull = true)
  val nullableArray = Literal.create(
    Seq("123", "true", "f", null),
    ArrayType(StringType, containsNull = true))
  val nonNullArray = Literal.create(
    Seq("123", "true", "f"),
    ArrayType(StringType, containsNull = false))

  Seq(nullableArray, nonNullArray).foreach { input =>
    // The cast is legal at analysis time; only evaluation is expected to throw.
    assert(cast(input, to).resolved)
    checkCastToBooleanError(input, to)
  }
}
// Casting string map values to boolean still resolves, but evaluating it must fail with
// the boolean parse error, for both the nullable-value and the non-nullable-value map.
test("cast from map II") {
  val to: DataType = MapType(StringType, BooleanType, valueContainsNull = true)
  val nullableValues = Literal.create(
    Map("a" -> "123", "b" -> "true", "c" -> "f", "d" -> null),
    MapType(StringType, StringType, valueContainsNull = true))
  val nonNullValues = Literal.create(
    Map("a" -> "123", "b" -> "true", "c" -> "f"),
    MapType(StringType, StringType, valueContainsNull = false))

  checkNullCast(MapType(StringType, IntegerType), MapType(StringType, StringType))

  Seq(nullableValues, nonNullValues).foreach { input =>
    // The cast is legal at analysis time; only evaluation is expected to throw.
    assert(cast(input, to).resolved)
    checkCastToBooleanError(input, to)
  }
}
// Casting string struct fields to boolean still resolves, but evaluating it must fail
// with the boolean parse error, for both the nullable and the non-nullable input struct.
test("cast from struct II") {
  // Builds a struct type whose fields all share one data type and one nullability.
  def structOf(names: Seq[String], fieldType: DataType, nullable: Boolean): StructType =
    StructType(names.map(StructField(_, fieldType, nullable)))

  checkNullCast(
    StructType(Seq(StructField("a", StringType), StructField("b", IntegerType))),
    StructType(Seq(StructField("a", StringType), StructField("b", StringType))))

  val nullableStruct = Literal.create(
    InternalRow(
      UTF8String.fromString("123"),
      UTF8String.fromString("true"),
      UTF8String.fromString("f"),
      null),
    structOf(Seq("a", "b", "c", "d"), StringType, nullable = true))
  val nonNullStruct = Literal.create(
    InternalRow(
      UTF8String.fromString("123"),
      UTF8String.fromString("true"),
      UTF8String.fromString("f")),
    structOf(Seq("a", "b", "c"), StringType, nullable = false))

  Seq(
    nullableStruct -> structOf(Seq("a", "b", "c", "d"), BooleanType, nullable = true),
    nonNullStruct -> structOf(Seq("a", "b", "c"), BooleanType, nullable = true)
  ).foreach { case (input, to) =>
    // The cast is legal at analysis time; only evaluation is expected to throw.
    assert(cast(input, to).resolved)
    checkCastToBooleanError(input, to)
  }
}
test("ANSI mode: cast string to timestamp with parse error") {
val activeConf = conf
DateTimeTestUtils.outstandingZoneIds.foreach { zid =>
@ -1185,6 +1252,101 @@ class CastSuite extends CastSuiteBase {
StructType(StructField("a", IntegerType, true) :: Nil)))
}
// In this suite, strings that are not boolean literals (including the empty string)
// are expected to evaluate to null when cast to boolean.
test("cast string to boolean II") {
  Seq("abc", "").foreach { unparsable =>
    checkEvaluation(cast(unparsable, BooleanType), null)
  }
}
// Casting a string array to a nullable boolean array: "123" is expected to become null,
// while "true" and "f" become true and false.
test("cast from array II") {
  val to = ArrayType(BooleanType, containsNull = true)

  // Checks that the cast resolves and evaluates to the expected elements.
  def assertCastsTo(input: Literal, expected: Any): Unit = {
    val ret = cast(input, to)
    assert(ret.resolved)
    checkEvaluation(ret, expected)
  }

  val nullableArray = Literal.create(
    Seq("123", "true", "f", null),
    ArrayType(StringType, containsNull = true))
  val nonNullArray = Literal.create(
    Seq("123", "true", "f"),
    ArrayType(StringType, containsNull = false))

  assertCastsTo(nullableArray, Seq(null, true, false, null))
  assertCastsTo(nonNullArray, Seq(null, true, false))
}
// Casting string map values to nullable booleans: "123" is expected to become null,
// while "true" and "f" become true and false.
test("cast from map II") {
  val to = MapType(StringType, BooleanType, valueContainsNull = true)

  // Checks that the cast resolves and evaluates to the expected entries.
  def assertCastsTo(input: Literal, expected: Any): Unit = {
    val ret = cast(input, to)
    assert(ret.resolved)
    checkEvaluation(ret, expected)
  }

  val nullableValues = Literal.create(
    Map("a" -> "123", "b" -> "true", "c" -> "f", "d" -> null),
    MapType(StringType, StringType, valueContainsNull = true))
  val nonNullValues = Literal.create(
    Map("a" -> "123", "b" -> "true", "c" -> "f"),
    MapType(StringType, StringType, valueContainsNull = false))

  assertCastsTo(nullableValues, Map("a" -> null, "b" -> true, "c" -> false, "d" -> null))
  assertCastsTo(nonNullValues, Map("a" -> null, "b" -> true, "c" -> false))
}
// Casting string struct fields to nullable booleans: "123" is expected to become null,
// while "true" and "f" become true and false.
test("cast from struct II") {
  // Builds a struct type whose fields all share one data type and one nullability.
  def structOf(names: Seq[String], fieldType: DataType, nullable: Boolean): StructType =
    StructType(names.map(StructField(_, fieldType, nullable)))

  // Checks that the cast resolves and evaluates to the expected row.
  def assertCastsTo(input: Literal, to: StructType, expected: Any): Unit = {
    val ret = cast(input, to)
    assert(ret.resolved)
    checkEvaluation(ret, expected)
  }

  checkNullCast(
    StructType(Seq(StructField("a", StringType), StructField("b", IntegerType))),
    StructType(Seq(StructField("a", StringType), StructField("b", StringType))))

  val nullableStruct = Literal.create(
    InternalRow(
      UTF8String.fromString("123"),
      UTF8String.fromString("true"),
      UTF8String.fromString("f"),
      null),
    structOf(Seq("a", "b", "c", "d"), StringType, nullable = true))
  val nonNullStruct = Literal.create(
    InternalRow(
      UTF8String.fromString("123"),
      UTF8String.fromString("true"),
      UTF8String.fromString("f")),
    structOf(Seq("a", "b", "c"), StringType, nullable = false))

  assertCastsTo(
    nullableStruct,
    structOf(Seq("a", "b", "c", "d"), BooleanType, nullable = true),
    InternalRow(null, true, false, null))
  assertCastsTo(
    nonNullStruct,
    structOf(Seq("a", "b", "c"), BooleanType, nullable = true),
    InternalRow(null, true, false))
}
test("SPARK-31227: Non-nullable null type should not coerce to nullable type") {
TypeCoercionSuite.allTypes.foreach { t =>
assert(Cast.canCast(ArrayType(NullType, false), ArrayType(t, false)))

View file

@ -53,9 +53,10 @@ true
-- !query
SELECT boolean('test') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: test
-- !query
@ -69,9 +70,10 @@ false
-- !query
SELECT boolean('foo') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: foo
-- !query
@ -93,9 +95,10 @@ true
-- !query
SELECT boolean('yeah') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: yeah
-- !query
@ -117,57 +120,64 @@ false
-- !query
SELECT boolean('nay') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: nay
-- !query
SELECT boolean('on') AS true
-- !query schema
struct<true:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: on
-- !query
SELECT boolean('off') AS `false`
-- !query schema
struct<false:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: off
-- !query
SELECT boolean('of') AS `false`
-- !query schema
struct<false:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: of
-- !query
SELECT boolean('o') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: o
-- !query
SELECT boolean('on_') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: on_
-- !query
SELECT boolean('off_') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: off_
-- !query
@ -181,9 +191,10 @@ true
-- !query
SELECT boolean('11') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: 11
-- !query
@ -197,17 +208,19 @@ false
-- !query
SELECT boolean('000') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: 000
-- !query
SELECT boolean('') AS error
-- !query schema
struct<error:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean:
-- !query
@ -310,17 +323,19 @@ true false
-- !query
SELECT boolean(string(' tru e ')) AS invalid
-- !query schema
struct<invalid:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean: tru e
-- !query
SELECT boolean(string('')) AS invalid
-- !query schema
struct<invalid:boolean>
struct<>
-- !query output
NULL
java.lang.UnsupportedOperationException
invalid input syntax for type boolean:
-- !query
@ -463,7 +478,8 @@ INSERT INTO BOOLTBL2
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
failed to evaluate expression CAST('XXX' AS BOOLEAN): invalid input syntax for type boolean: XXX; line 2 pos 3
-- !query
@ -471,7 +487,6 @@ SELECT '' AS f_4, BOOLTBL2.* FROM BOOLTBL2
-- !query schema
struct<f_4:string,f1:boolean>
-- !query output
NULL
false
false
false
@ -545,9 +560,6 @@ struct<tf_12_ff_4:string,f1:boolean,f1:boolean>
false false
false false
false false
true NULL
true NULL
true NULL
true false
true false
true false
@ -623,7 +635,7 @@ SELECT '' AS `Not False`, f1
-- !query schema
struct<Not False:string,f1:boolean>
-- !query output
NULL
-- !query
@ -646,7 +658,6 @@ SELECT '' AS `Not True`, f1
-- !query schema
struct<Not True:string,f1:boolean>
-- !query output
NULL
false
false
false