[SPARK-32501][SQL] Convert null to "null" in structs, maps and arrays while casting to strings

### What changes were proposed in this pull request?
Convert `NULL` elements of maps, structs and arrays to the `"null"` string while converting maps/struct/array values to strings. The SQL config `spark.sql.legacy.castComplexTypesToString.enabled` controls the behaviour. When it is `true`, `NULL` elements of structs/maps/arrays will be omitted; otherwise, when it is `false`, `NULL` elements will be converted to `"null"`.

### Why are the changes needed?
1. It is impossible to distinguish an empty string from null, for instance:
```scala
scala> Seq(Seq(""), Seq(null)).toDF().show
+-----+
|value|
+-----+
|   []|
|   []|
+-----+
```
2. Inconsistent NULL conversions for top-level values and nested columns, for instance:
```scala
scala> sql("select named_struct('c', null), null").show
+---------------------+----+
|named_struct(c, NULL)|NULL|
+---------------------+----+
|                   []|null|
+---------------------+----+
```
3. `.show()` is different from conversions to Hive strings, and as a consequence its output is different from `spark-sql` (sql tests):
```sql
spark-sql> select named_struct('c', null) as struct;
{"c":null}
```
```scala
scala> sql("select named_struct('c', null) as struct").show
+------+
|struct|
+------+
|    []|
+------+
```

4. It is impossible to distinguish an empty struct/array from a struct/array containing null in the current implementation:
```scala
scala> Seq[Seq[String]](Seq(), Seq(null)).toDF.show()
+-----+
|value|
+-----+
|   []|
|   []|
+-----+
```

### Does this PR introduce _any_ user-facing change?
Yes, before:
```scala
scala> Seq(Seq(""), Seq(null)).toDF().show
+-----+
|value|
+-----+
|   []|
|   []|
+-----+
```

After:
```scala
scala> Seq(Seq(""), Seq(null)).toDF().show
+------+
| value|
+------+
|    []|
|[null]|
+------+
```

### How was this patch tested?
By existing test suite `CastSuite`.

Closes #29311 from MaxGekk/nested-null-to-string.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
Max Gekk 2020-08-05 12:03:36 +00:00 committed by Wenchen Fan
parent b14a1e2816
commit 3a437ed22b
4 changed files with 60 additions and 26 deletions

View file

@ -34,6 +34,8 @@ license: |
- In Spark 3.1, structs and maps are wrapped by the `{}` brackets in casting them to strings. For instance, the `show()` action and the `CAST` expression use such brackets. In Spark 3.0 and earlier, the `[]` brackets are used for the same purpose. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.castComplexTypesToString.enabled` to `true`.
- In Spark 3.1, NULL elements of structures, arrays and maps are converted to "null" in casting them to strings. In Spark 3.0 or earlier, NULL elements are converted to empty strings. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.castComplexTypesToString.enabled` to `true`.
## Upgrading from Spark SQL 3.0 to 3.0.1
- In Spark 3.0, JSON datasource and JSON function `schema_of_json` infer TimestampType from string values if they match to the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, the timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference.

View file

@ -297,9 +297,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
private lazy val dateFormatter = DateFormatter(zoneId)
private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId)
private val legacyCastToStr = SQLConf.get.getConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING)
// The brackets that are used in casting structs and maps to strings
private val (leftBracket, rightBracket) =
if (SQLConf.get.getConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING)) ("[", "]") else ("{", "}")
private val (leftBracket, rightBracket) = if (legacyCastToStr) ("[", "]") else ("{", "}")
// UDFToString
private[this] def castToString(from: DataType): Any => Any = from match {
@ -321,7 +321,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
var i = 1
while (i < array.numElements) {
builder.append(",")
if (!array.isNullAt(i)) {
if (array.isNullAt(i)) {
if (!legacyCastToStr) builder.append(" null")
} else {
builder.append(" ")
builder.append(toUTF8String(array.get(i, et)).asInstanceOf[UTF8String])
}
@ -342,7 +344,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
val valueToUTF8String = castToString(vt)
builder.append(keyToUTF8String(keyArray.get(0, kt)).asInstanceOf[UTF8String])
builder.append(" ->")
if (!valueArray.isNullAt(0)) {
if (valueArray.isNullAt(0)) {
if (!legacyCastToStr) builder.append(" null")
} else {
builder.append(" ")
builder.append(valueToUTF8String(valueArray.get(0, vt)).asInstanceOf[UTF8String])
}
@ -351,7 +355,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
builder.append(", ")
builder.append(keyToUTF8String(keyArray.get(i, kt)).asInstanceOf[UTF8String])
builder.append(" ->")
if (!valueArray.isNullAt(i)) {
if (valueArray.isNullAt(i)) {
if (!legacyCastToStr) builder.append(" null")
} else {
builder.append(" ")
builder.append(valueToUTF8String(valueArray.get(i, vt))
.asInstanceOf[UTF8String])
@ -369,13 +375,17 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
if (row.numFields > 0) {
val st = fields.map(_.dataType)
val toUTF8StringFuncs = st.map(castToString)
if (!row.isNullAt(0)) {
if (row.isNullAt(0)) {
if (!legacyCastToStr) builder.append(" null")
} else {
builder.append(toUTF8StringFuncs(0)(row.get(0, st(0))).asInstanceOf[UTF8String])
}
var i = 1
while (i < row.numFields) {
builder.append(",")
if (!row.isNullAt(i)) {
if (row.isNullAt(i)) {
if (!legacyCastToStr) builder.append(" null")
} else {
builder.append(" ")
builder.append(toUTF8StringFuncs(i)(row.get(i, st(i))).asInstanceOf[UTF8String])
}
@ -895,6 +905,10 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
"""
}
private def outNullElem(buffer: ExprValue): Block = {
if (legacyCastToStr) code"" else code"""$buffer.append(" null");"""
}
private def writeArrayToStringBuilder(
et: DataType,
array: ExprValue,
@ -917,12 +931,16 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
code"""
|$buffer.append("[");
|if ($array.numElements() > 0) {
| if (!$array.isNullAt(0)) {
| if ($array.isNullAt(0)) {
| ${outNullElem(buffer)}
| } else {
| $buffer.append($elementToStringFunc(${CodeGenerator.getValue(array, et, "0")}));
| }
| for (int $loopIndex = 1; $loopIndex < $array.numElements(); $loopIndex++) {
| $buffer.append(",");
| if (!$array.isNullAt($loopIndex)) {
| if ($array.isNullAt($loopIndex)) {
| ${outNullElem(buffer)}
| } else {
| $buffer.append(" ");
| $buffer.append($elementToStringFunc(${CodeGenerator.getValue(array, et, loopIndex)}));
| }
@ -970,7 +988,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
|if ($map.numElements() > 0) {
| $buffer.append($keyToStringFunc($getMapFirstKey));
| $buffer.append(" ->");
| if (!$map.valueArray().isNullAt(0)) {
| if ($map.valueArray().isNullAt(0)) {
| ${outNullElem(buffer)}
| } else {
| $buffer.append(" ");
| $buffer.append($valueToStringFunc($getMapFirstValue));
| }
@ -978,7 +998,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
| $buffer.append(", ");
| $buffer.append($keyToStringFunc($getMapKeyArray));
| $buffer.append(" ->");
| if (!$map.valueArray().isNullAt($loopIndex)) {
| if ($map.valueArray().isNullAt($loopIndex)) {
| ${outNullElem(buffer)}
| } else {
| $buffer.append(" ");
| $buffer.append($valueToStringFunc($getMapValueArray));
| }
@ -1000,7 +1022,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
val javaType = JavaCode.javaType(ft)
code"""
|${if (i != 0) code"""$buffer.append(",");""" else EmptyBlock}
|if (!$row.isNullAt($i)) {
|if ($row.isNullAt($i)) {
| ${outNullElem(buffer)}
|} else {
| ${if (i != 0) code"""$buffer.append(" ");""" else EmptyBlock}
|
| // Append $i field into the string buffer

View file

@ -2693,8 +2693,10 @@ object SQLConf {
val LEGACY_COMPLEX_TYPES_TO_STRING =
buildConf("spark.sql.legacy.castComplexTypesToString.enabled")
.internal()
.doc("When true, maps and structs are wrapped by [] in casting to strings. " +
"Otherwise, if this is false, which is the default, maps and structs are wrapped by {}.")
.doc("When true, maps and structs are wrapped by [] in casting to strings, and " +
"NULL elements of structs/maps/arrays will be omitted while converting to strings. " +
"Otherwise, if this is false, which is the default, maps and structs are wrapped by {}, " +
"and NULL elements will be converted to \"null\".")
.version("3.1.0")
.booleanConf
.createWithDefault(false)

View file

@ -691,16 +691,22 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(ret1, "[1, 2, 3, 4, 5]")
val ret2 = cast(Literal.create(Array("ab", "cde", "f")), StringType)
checkEvaluation(ret2, "[ab, cde, f]")
Seq(false, true).foreach { omitNull =>
withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> omitNull.toString) {
val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType)
checkEvaluation(ret3, "[ab,, c]")
val ret4 = cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType)
checkEvaluation(ret3, s"[ab,${if (omitNull) "" else " null"}, c]")
}
}
val ret4 =
cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType)
checkEvaluation(ret4, "[ab, cde, f]")
val ret5 = cast(
Literal.create(Array("2014-12-03", "2014-12-04", "2014-12-06").map(Date.valueOf)),
StringType)
checkEvaluation(ret5, "[2014-12-03, 2014-12-04, 2014-12-06]")
val ret6 = cast(
Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00").map(Timestamp.valueOf)),
Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00")
.map(Timestamp.valueOf)),
StringType)
checkEvaluation(ret6, "[2014-12-03 13:01:00, 2014-12-04 15:05:00]")
val ret7 = cast(Literal.create(Array(Array(1, 2, 3), Array(4, 5))), StringType)
@ -713,15 +719,15 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
test("SPARK-22973 Cast map to string") {
Seq(
"false" -> ("{", "}"),
"true" -> ("[", "]")).foreach { case (legacyBrackets, (lb, rb)) =>
withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyBrackets) {
false -> ("{", "}"),
true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) =>
withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) {
val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType)
checkEvaluation(ret1, s"${lb}1 -> a, 2 -> b, 3 -> c$rb")
val ret2 = cast(
Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)),
StringType)
checkEvaluation(ret2, s"${lb}1 -> a, 2 ->, 3 -> c$rb")
checkEvaluation(ret2, s"${lb}1 -> a, 2 ->${if (legacyCast) "" else " null"}, 3 -> c$rb")
val ret3 = cast(
Literal.create(Map(
1 -> Date.valueOf("2014-12-03"),
@ -747,13 +753,13 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
test("SPARK-22981 Cast struct to string") {
Seq(
"false" -> ("{", "}"),
"true" -> ("[", "]")).foreach { case (legacyBrackets, (lb, rb)) =>
withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyBrackets) {
false -> ("{", "}"),
true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) =>
withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) {
val ret1 = cast(Literal.create((1, "a", 0.1)), StringType)
checkEvaluation(ret1, s"${lb}1, a, 0.1$rb")
val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType)
checkEvaluation(ret2, s"${lb}1,, a$rb")
checkEvaluation(ret2, s"${lb}1,${if (legacyCast) "" else " null"}, a$rb")
val ret3 = cast(Literal.create(
(Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType)
checkEvaluation(ret3, s"${lb}2014-12-03, 2014-12-03 15:05:00$rb")