[SPARK-21804][SQL] json_tuple returns null values within repeated columns except the first one

## What changes were proposed in this pull request?

When json_tuple extracts values from JSON, it returns null values for repeated columns except the first one, as shown below:

``` scala
scala> spark.sql("""SELECT json_tuple('{"a":1, "b":2}', 'a', 'b', 'a')""").show()
+---+---+----+
| c0| c1|  c2|
+---+---+----+
|  1|  2|null|
+---+---+----+
```

I think this should be consistent with Hive's implementation:
```
hive> SELECT json_tuple('{"a": 1, "b": 2}', 'a', 'a');
...
1    1
```

In this PR, we locate all the matched indices in `fieldNames` instead of only the first matched index returned by `indexOf`, so every repeated column receives the extracted value.

## How was this patch tested?

Added test in JsonExpressionsSuite.

Author: Jen-Ming Chung <jenmingisme@gmail.com>

Closes #19017 from jmchung/SPARK-21804.
This commit is contained in:
Jen-Ming Chung 2017-08-24 19:24:00 +09:00 committed by hyukjinkwon
parent 846bc61cf5
commit 95713eb4f2
2 changed files with 20 additions and 2 deletions

View file

@@ -436,7 +436,8 @@ case class JsonTuple(children: Seq[Expression])
     while (parser.nextToken() != JsonToken.END_OBJECT) {
       if (parser.getCurrentToken == JsonToken.FIELD_NAME) {
         // check to see if this field is desired in the output
-        val idx = fieldNames.indexOf(parser.getCurrentName)
+        val jsonField = parser.getCurrentName
+        var idx = fieldNames.indexOf(jsonField)
         if (idx >= 0) {
           // it is, copy the child tree to the correct location in the output row
           val output = new ByteArrayOutputStream()
@@ -447,7 +448,14 @@ case class JsonTuple(children: Seq[Expression])
             generator => copyCurrentStructure(generator, parser)
           }

-          row(idx) = UTF8String.fromBytes(output.toByteArray)
+          val jsonValue = UTF8String.fromBytes(output.toByteArray)
+
+          // SPARK-21804: json_tuple returns null values within repeated columns
+          // except the first one; so that we need to check the remaining fields.
+          do {
+            row(idx) = jsonValue
+            idx = fieldNames.indexOf(jsonField, idx + 1)
+          } while (idx >= 0)
         }
       }
     }

View file

@@ -373,6 +373,16 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2")))
   }

+  test("SPARK-21804: json_tuple returns null values within repeated columns except the first one") {
+    checkJsonTuple(
+      JsonTuple(Literal("""{"f1": 1, "f2": 2}""") ::
+        NonFoldableLiteral("f1") ::
+        NonFoldableLiteral("cast(NULL AS STRING)") ::
+        NonFoldableLiteral("f1") ::
+        Nil),
+      InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("1")))
+  }
+
   val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID)

   test("from_json") {