[SPARK-33281][SQL] Return SQL schema instead of Catalog string from the SchemaOfCsv expression

### What changes were proposed in this pull request?
Return the schema in SQL format instead of a catalog string from the `SchemaOfCsv` expression.

### Why are the changes needed?
To unify the output of `schema_of_json()` and `schema_of_csv()`.

### Does this PR introduce _any_ user-facing change?
Yes. The visible output of `schema_of_csv()` changes, but the function is usually used in combination with `from_csv()`, so the exact format of the schema string should not matter much in practice (see the sketch after the examples below).

Before:
```
> SELECT schema_of_csv('1,abc');
  struct<_c0:int,_c1:string>
```

After:
```
> SELECT schema_of_csv('1,abc');
  STRUCT<`_c0`: INT, `_c1`: STRING>
```
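
Since `schema_of_csv()` is typically composed with `from_csv()`, the round trip is unaffected: `from_csv()` accepts the schema as a `Column` and parses the new SQL-formatted string just as it parsed the old catalog string. A minimal Scala sketch of that pattern (illustrative only; it assumes a running `SparkSession` named `spark`):

```scala
import scala.collection.JavaConverters._
import org.apache.spark.sql.functions.{col, from_csv, schema_of_csv}
import spark.implicits._  // assumes `spark` is an active SparkSession

val df = Seq("1,abc").toDF("csv")

// schema_of_csv("1,abc") evaluates to the new SQL-formatted string
// "STRUCT<`_c0`: INT, `_c1`: STRING>"; from_csv consumes it exactly as
// it consumed the old catalog string "struct<_c0:int,_c1:string>".
val parsed = df.select(
  from_csv(col("csv"), schema_of_csv("1,abc"), Map.empty[String, String].asJava))
```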

### How was this patch tested?
By existing test suites `CsvFunctionsSuite` and `CsvExpressionsSuite`.
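
For reference, the individual suites can be run with the usual Spark developer workflow, e.g. (module names assumed from the standard source layout):

```
build/sbt "sql/testOnly *CsvFunctionsSuite"
build/sbt "catalyst/testOnly *CsvExpressionsSuite"
```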

Closes #30180 from MaxGekk/schema_of_csv-sql-schema.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Commit b409025641 (parent 9d5e48ea95), 2020-10-29 21:02:10 +09:00
7 changed files with 14 additions and 14 deletions

@@ -1682,9 +1682,9 @@ test_that("column functions", {
   df <- as.DataFrame(list(list("col" = "1")))
   c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
-  expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+  expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
   c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
-  expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+  expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
 
   # Test to_json(), from_json(), schema_of_json()
   df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")

@@ -50,7 +50,7 @@ license: |
 - In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z, and loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might lead to shifting of the input timestamps due to rebasing from/to Julian to/from Proleptic Gregorian calendar. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` or/and `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`.
-- In Spark 3.1, the `schema_of_json` function returns the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case.
+- In Spark 3.1, the `schema_of_json` and `schema_of_csv` functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the functions return a catalog string without field quoting and in lower case.
 
 ## Upgrading from Spark SQL 3.0 to 3.0.1

@@ -2964,9 +2964,9 @@ def schema_of_csv(csv, options={}):
     >>> df = spark.range(1)
     >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()
-    [Row(csv='struct<_c0:int,_c1:string>')]
+    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
     >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()
-    [Row(csv='struct<_c0:int,_c1:string>')]
+    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
     """
     if isinstance(csv, str):
         col = _create_column_from_literal(csv)

@@ -144,7 +144,7 @@ case class CsvToStructs(
   examples = """
     Examples:
       > SELECT _FUNC_('1,abc');
-       struct<_c0:int,_c1:string>
+       STRUCT<`_c0`: INT, `_c1`: STRING>
   """,
   since = "3.0.0")
 case class SchemaOfCsv(
@@ -186,7 +186,7 @@ case class SchemaOfCsv(
     val inferSchema = new CSVInferSchema(parsedOptions)
     val fieldTypes = inferSchema.inferRowType(startType, row)
     val st = StructType(inferSchema.toStructFields(fieldTypes, header))
-    UTF8String.fromString(st.catalogString)
+    UTF8String.fromString(st.sql)
   }
 
   override def prettyName: String = "schema_of_csv"
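
The heart of the patch is the one-line switch above from `catalogString` to `sql`, both of which are existing methods on Spark's `StructType`. A quick Scala sketch of the two renderings (output strings taken from the tests in this PR):

```scala
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val st = StructType(Seq(
  StructField("_c0", IntegerType),
  StructField("_c1", StringType)))

st.catalogString  // old behavior: "struct<_c0:int,_c1:string>"
st.sql            // new behavior: "STRUCT<`_c0`: INT, `_c1`: STRING>"
```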

@@ -158,13 +158,13 @@ class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with P
   }
 
   test("infer schema of CSV strings") {
-    checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "struct<_c0:int,_c1:string>")
+    checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "STRUCT<`_c0`: INT, `_c1`: STRING>")
   }
 
   test("infer schema of CSV strings by using options") {
     checkEvaluation(
       new SchemaOfCsv(Literal.create("1|abc"), Map("delimiter" -> "|")),
-      "struct<_c0:int,_c1:string>")
+      "STRUCT<`_c0`: INT, `_c1`: STRING>")
   }
 
   test("to_csv - struct") {

@@ -82,7 +82,7 @@ select schema_of_csv('1|abc', map('delimiter', '|'))
 -- !query schema
 struct<schema_of_csv(1|abc):string>
 -- !query output
-struct<_c0:int,_c1:string>
+STRUCT<`_c0`: INT, `_c1`: STRING>
 
 
 -- !query

@@ -80,16 +80,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
   test("schema_of_csv - infers schemas") {
     checkAnswer(
       spark.range(1).select(schema_of_csv(lit("0.1,1"))),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
     checkAnswer(
       spark.range(1).select(schema_of_csv("0.1,1")),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }
 
   test("schema_of_csv - infers schemas using options") {
     val df = spark.range(1)
       .select(schema_of_csv(lit("0.1 1"), Map("sep" -> " ").asJava))
-    checkAnswer(df, Seq(Row("struct<_c0:double,_c1:int>")))
+    checkAnswer(df, Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }
 
   test("to_csv - struct") {
@@ -236,7 +236,7 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
     val input = concat_ws(",", lit(0.1), lit(1))
     checkAnswer(
       spark.range(1).select(schema_of_csv(input)),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }
 
   test("optional datetime parser does not affect csv time formatting") {