diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 685e6e672b..22bd4133d4 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1717,9 +1717,9 @@ test_that("column functions", { df <- as.DataFrame(list(list("col" = "1"))) c <- collect(select(df, schema_of_json('{"name":"Bob"}'))) - expect_equal(c[[1]], "struct") + expect_equal(c[[1]], "STRUCT<`name`: STRING>") c <- collect(select(df, schema_of_json(lit('{"name":"Bob"}')))) - expect_equal(c[[1]], "struct") + expect_equal(c[[1]], "STRUCT<`name`: STRING>") # Test to_json() supports arrays of primitive types and arrays df <- sql("SELECT array(19, 42, 70) as age") diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 124b04fb2b..ee82d9ac47 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -49,6 +49,8 @@ license: | - In Spark 3.1, we remove the built-in Hive 1.2. You need to migrate your custom SerDes to Hive 2.3. See [HIVE-15167](https://issues.apache.org/jira/browse/HIVE-15167) for more details. - In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z, and loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might lead to shifting of the input timestamps due to rebasing from/to Julian to/from Proleptic Gregorian calendar. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` or/and `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`. + + - In Spark 3.1, the `schema_of_json` function returns the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case. ## Upgrading from Spark SQL 3.0 to 3.0.1 diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 22941ab6f1..68639ff7b6 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2937,10 +2937,10 @@ def schema_of_json(json, options={}): >>> df = spark.range(1) >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect() - [Row(json='struct')] + [Row(json='STRUCT<`a`: BIGINT>')] >>> schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'}) >>> df.select(schema.alias("json")).collect() - [Row(json='struct')] + [Row(json='STRUCT<`a`: BIGINT>')] """ if isinstance(json, str): col = _create_column_from_literal(json) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index ef02d2db97..39d9eb5a36 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -741,9 +741,9 @@ case class StructsToJson( examples = """ Examples: > SELECT _FUNC_('[{"col":0}]'); - array> + ARRAY> > SELECT _FUNC_('[{"col":01}]', map('allowNumericLeadingZeros', 'true')); - array> + ARRAY> """, group = "json_funcs", since = "2.4.0") @@ -801,7 +801,7 @@ case class SchemaOfJson( } } - UTF8String.fromString(dt.catalogString) + UTF8String.fromString(dt.sql) } override def prettyName: String = "schema_of_json" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 6f062dcc9a..b3666936e5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -735,17 +735,17 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with test("SPARK-24709: infer schema of json strings") { checkEvaluation(new SchemaOfJson(Literal.create("""{"col":0}""")), - "struct") + "STRUCT<`col`: BIGINT>") checkEvaluation( new SchemaOfJson(Literal.create("""{"col0":["a"], "col1": {"col2": "b"}}""")), - "struct,col1:struct>") + "STRUCT<`col0`: ARRAY, `col1`: STRUCT<`col2`: STRING>>") } test("infer schema of JSON strings by using options") { checkEvaluation( new SchemaOfJson(Literal.create("""{"col":01}"""), CreateMap(Seq(Literal.create("allowNumericLeadingZeros"), Literal.create("true")))), - "struct") + "STRUCT<`col`: BIGINT>") } test("parse date with locale") { @@ -810,7 +810,7 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with } Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach { - checkDecimalInfer(_, """struct""") + checkDecimalInfer(_, """STRUCT<`d`: DECIMAL(7,3)>""") } } diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 34a329627f..3cc45890cf 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -213,7 +213,7 @@ select schema_of_json('{"c1":0, "c2":[1]}') -- !query schema struct -- !query output -struct> +STRUCT<`c1`: BIGINT, `c2`: ARRAY> -- !query @@ -352,7 +352,7 @@ select schema_of_json('{"c1":1}', map('primitivesAsString', 'true')) -- !query schema struct -- !query output -struct +STRUCT<`c1`: STRING> -- !query @@ -360,7 +360,7 @@ select schema_of_json('{"c1":01, "c2":0.1}', map('allowNumericLeadingZeros', 'tr -- !query schema struct -- !query output -struct +STRUCT<`c1`: BIGINT, `c2`: DECIMAL(1,1)> -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 5a1a3550d8..e2a9cf536d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -411,7 +411,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { test("infers schemas using options") { val df = spark.range(1) .select(schema_of_json(lit("{a:1}"), Map("allowUnquotedFieldNames" -> "true").asJava)) - checkAnswer(df, Seq(Row("struct"))) + checkAnswer(df, Seq(Row("STRUCT<`a`: BIGINT>"))) } test("from_json - array of primitive types") { @@ -684,14 +684,14 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { val input = regexp_replace(lit("""{"item_id": 1, "item_price": 0.1}"""), "item_", "") checkAnswer( spark.range(1).select(schema_of_json(input)), - Seq(Row("struct"))) + Seq(Row("STRUCT<`id`: BIGINT, `price`: DOUBLE>"))) } test("SPARK-31065: schema_of_json - null and empty strings as strings") { Seq("""{"id": null}""", """{"id": ""}""").foreach { input => checkAnswer( spark.range(1).select(schema_of_json(input)), - Seq(Row("struct"))) + Seq(Row("STRUCT<`id`: STRING>"))) } } @@ -703,7 +703,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { schema_of_json( lit("""{"id": "a", "drop": {"drop": null}}"""), options.asJava)), - Seq(Row("struct"))) + Seq(Row("STRUCT<`id`: STRING>"))) // Array of structs checkAnswer( @@ -711,7 +711,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { schema_of_json( lit("""[{"id": "a", "drop": {"drop": null}}]"""), options.asJava)), - Seq(Row("array>"))) + Seq(Row("ARRAY>"))) // Other types are not affected. checkAnswer( @@ -719,7 +719,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { schema_of_json( lit("""null"""), options.asJava)), - Seq(Row("string"))) + Seq(Row("STRING"))) } test("optional datetime parser does not affect json time formatting") { @@ -747,4 +747,11 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { val df4 = Seq("""{"c2": [19]}""").toDF("c0") checkAnswer(df4.select(from_json($"c0", MapType(StringType, st))), Row(null)) } + + test("SPARK-33270: infers schema for JSON field with spaces and pass them to from_json") { + val in = Seq("""{"a b": 1}""").toDS() + val out = in.select(from_json('value, schema_of_json("""{"a b": 100}""")) as "parsed") + val expected = new StructType().add("parsed", new StructType().add("a b", LongType)) + assert(out.schema == expected) + } }