diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index c3757373a3..ee5da1a83a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -436,7 +436,8 @@ case class JsonTuple(children: Seq[Expression]) while (parser.nextToken() != JsonToken.END_OBJECT) { if (parser.getCurrentToken == JsonToken.FIELD_NAME) { // check to see if this field is desired in the output - val idx = fieldNames.indexOf(parser.getCurrentName) + val jsonField = parser.getCurrentName + var idx = fieldNames.indexOf(jsonField) if (idx >= 0) { // it is, copy the child tree to the correct location in the output row val output = new ByteArrayOutputStream() @@ -447,7 +448,14 @@ case class JsonTuple(children: Seq[Expression]) generator => copyCurrentStructure(generator, parser) } - row(idx) = UTF8String.fromBytes(output.toByteArray) + val jsonValue = UTF8String.fromBytes(output.toByteArray) + + // SPARK-21804: json_tuple returns null values within repeated columns + // except the first one; so that we need to check the remaining fields. + do { + row(idx) = jsonValue + idx = fieldNames.indexOf(jsonField, idx + 1) + } while (idx >= 0) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 1cd2b4fc18..9991bda165 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -373,6 +373,16 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2"))) } + test("SPARK-21804: json_tuple returns null values within repeated columns except the first one") { + checkJsonTuple( + JsonTuple(Literal("""{"f1": 1, "f2": 2}""") :: + NonFoldableLiteral("f1") :: + NonFoldableLiteral("cast(NULL AS STRING)") :: + NonFoldableLiteral("f1") :: + Nil), + InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("1"))) + } + val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID) test("from_json") {