[SPARK-21804][SQL] json_tuple returns null values within repeated columns except the first one
## What changes were proposed in this pull request? When json_tuple extracts values from JSON, it returns null for repeated columns except the first one, as shown below: ``` scala scala> spark.sql("""SELECT json_tuple('{"a":1, "b":2}', 'a', 'b', 'a')""").show() +---+---+----+ | c0| c1| c2| +---+---+----+ | 1| 2|null| +---+---+----+ ``` I think this should be consistent with Hive's implementation: ``` hive> SELECT json_tuple('{"a": 1, "b": 2}', 'a', 'a'); ... 1 1 ``` In this PR, we locate all the matching indices in `fieldNames` instead of returning only the first matched index (i.e., `indexOf`). ## How was this patch tested? Added test in JsonExpressionsSuite. Author: Jen-Ming Chung <jenmingisme@gmail.com> Closes #19017 from jmchung/SPARK-21804.
This commit is contained in:
parent
846bc61cf5
commit
95713eb4f2
|
@ -436,7 +436,8 @@ case class JsonTuple(children: Seq[Expression])
|
||||||
while (parser.nextToken() != JsonToken.END_OBJECT) {
|
while (parser.nextToken() != JsonToken.END_OBJECT) {
|
||||||
if (parser.getCurrentToken == JsonToken.FIELD_NAME) {
|
if (parser.getCurrentToken == JsonToken.FIELD_NAME) {
|
||||||
// check to see if this field is desired in the output
|
// check to see if this field is desired in the output
|
||||||
val idx = fieldNames.indexOf(parser.getCurrentName)
|
val jsonField = parser.getCurrentName
|
||||||
|
var idx = fieldNames.indexOf(jsonField)
|
||||||
if (idx >= 0) {
|
if (idx >= 0) {
|
||||||
// it is, copy the child tree to the correct location in the output row
|
// it is, copy the child tree to the correct location in the output row
|
||||||
val output = new ByteArrayOutputStream()
|
val output = new ByteArrayOutputStream()
|
||||||
|
@ -447,7 +448,14 @@ case class JsonTuple(children: Seq[Expression])
|
||||||
generator => copyCurrentStructure(generator, parser)
|
generator => copyCurrentStructure(generator, parser)
|
||||||
}
|
}
|
||||||
|
|
||||||
row(idx) = UTF8String.fromBytes(output.toByteArray)
|
val jsonValue = UTF8String.fromBytes(output.toByteArray)
|
||||||
|
|
||||||
|
// SPARK-21804: json_tuple returns null values within repeated columns
|
||||||
|
// except the first one; so that we need to check the remaining fields.
|
||||||
|
do {
|
||||||
|
row(idx) = jsonValue
|
||||||
|
idx = fieldNames.indexOf(jsonField, idx + 1)
|
||||||
|
} while (idx >= 0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -373,6 +373,16 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
|
||||||
InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2")))
|
InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2")))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("SPARK-21804: json_tuple returns null values within repeated columns except the first one") {
|
||||||
|
checkJsonTuple(
|
||||||
|
JsonTuple(Literal("""{"f1": 1, "f2": 2}""") ::
|
||||||
|
NonFoldableLiteral("f1") ::
|
||||||
|
NonFoldableLiteral("cast(NULL AS STRING)") ::
|
||||||
|
NonFoldableLiteral("f1") ::
|
||||||
|
Nil),
|
||||||
|
InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("1")))
|
||||||
|
}
|
||||||
|
|
||||||
val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID)
|
val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID)
|
||||||
|
|
||||||
test("from_json") {
|
test("from_json") {
|
||||||
|
|
Loading…
Reference in a new issue