diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 22bd4133d4..3a0d359e2a 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1682,9 +1682,9 @@ test_that("column functions", {

   df <- as.DataFrame(list(list("col" = "1")))
   c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
-  expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+  expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
   c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
-  expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+  expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")

   # Test to_json(), from_json(), schema_of_json()
   df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index ee82d9ac47..fdc764a934 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -50,7 +50,7 @@ license: |

   - In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z, and loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might lead to shifting of the input timestamps due to rebasing from/to Julian to/from Proleptic Gregorian calendar. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` or/and `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`.

-  - In Spark 3.1, the `schema_of_json` function returns the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case.
+  - In Spark 3.1, the `schema_of_json` and `schema_of_csv` functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the functions return a catalog string without field quoting and in lower case.

 ## Upgrading from Spark SQL 3.0 to 3.0.1

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 68639ff7b6..69fdf220f1 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2964,9 +2964,9 @@ def schema_of_csv(csv, options={}):

     >>> df = spark.range(1)
     >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()
-    [Row(csv='struct<_c0:int,_c1:string>')]
+    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
     >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()
-    [Row(csv='struct<_c0:int,_c1:string>')]
+    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
     """
     if isinstance(csv, str):
         col = _create_column_from_literal(csv)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
index f9ccf3c8c8..6fad272aa4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
@@ -144,7 +144,7 @@ case class CsvToStructs(
   examples = """
     Examples:
       > SELECT _FUNC_('1,abc');
-       struct<_c0:int,_c1:string>
+       STRUCT<`_c0`: INT, `_c1`: STRING>
   """,
   since = "3.0.0")
 case class SchemaOfCsv(
@@ -186,7 +186,7 @@ case class SchemaOfCsv(
     val inferSchema = new CSVInferSchema(parsedOptions)
     val fieldTypes = inferSchema.inferRowType(startType, row)
     val st = StructType(inferSchema.toStructFields(fieldTypes, header))
-    UTF8String.fromString(st.catalogString)
+    UTF8String.fromString(st.sql)
   }

   override def prettyName: String = "schema_of_csv"
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
index 4a19add23f..7945974a1f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
@@ -158,13 +158,13 @@ class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with P
   }

   test("infer schema of CSV strings") {
-    checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "struct<_c0:int,_c1:string>")
+    checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "STRUCT<`_c0`: INT, `_c1`: STRING>")
   }

   test("infer schema of CSV strings by using options") {
     checkEvaluation(
       new SchemaOfCsv(Literal.create("1|abc"), Map("delimiter" -> "|")),
-      "struct<_c0:int,_c1:string>")
+      "STRUCT<`_c0`: INT, `_c1`: STRING>")
   }

   test("to_csv - struct") {
diff --git a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out
index 1e3173172a..7ba3f71236 100644
--- a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out
@@ -82,7 +82,7 @@ select schema_of_csv('1|abc', map('delimiter', '|'))
 -- !query schema
 struct<schema_of_csv(1|abc):string>
 -- !query output
-struct<_c0:int,_c1:string>
+STRUCT<`_c0`: INT, `_c1`: STRING>


 -- !query
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
index 800e294cca..abccaf1908 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
@@ -80,16 +80,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
   test("schema_of_csv - infers schemas") {
     checkAnswer(
       spark.range(1).select(schema_of_csv(lit("0.1,1"))),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
     checkAnswer(
       spark.range(1).select(schema_of_csv("0.1,1")),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }

   test("schema_of_csv - infers schemas using options") {
     val df = spark.range(1)
       .select(schema_of_csv(lit("0.1 1"), Map("sep" -> " ").asJava))
-    checkAnswer(df, Seq(Row("struct<_c0:double,_c1:int>")))
+    checkAnswer(df, Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }

   test("to_csv - struct") {
@@ -236,7 +236,7 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
     val input = concat_ws(",", lit(0.1), lit(1))
     checkAnswer(
       spark.range(1).select(schema_of_csv(input)),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }

   test("optional datetime parser does not affect csv time formatting") {
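
Note on the core change: `SchemaOfCsv.eval` now renders the inferred `StructType` with `DataType.sql` instead of `DataType.catalogString`, matching the SQL-format output the migration guide already documents for `schema_of_json`. A minimal standalone sketch of the difference between the two renderings follows; the demo object is ours, while `catalogString` and `sql` are existing methods on `org.apache.spark.sql.types.DataType`:

```scala
import org.apache.spark.sql.types._

object SchemaFormatDemo extends App {
  // Mirrors the schema that CSVInferSchema produces for the input "1,abc".
  val st = StructType(Seq(
    StructField("_c0", IntegerType),
    StructField("_c1", StringType)))

  // Pre-patch rendering: lower-case catalog string, unquoted field names.
  println(st.catalogString)  // struct<_c0:int,_c1:string>

  // Post-patch rendering: SQL format with upper-case types and
  // backtick-quoted field names.
  println(st.sql)            // STRUCT<`_c0`: INT, `_c1`: STRING>
}
```

Every expected value updated in the tests above is simply the second rendering of the corresponding inferred struct.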