[SPARK-33281][SQL] Return SQL schema instead of Catalog string from the SchemaOfCsv expression

### What changes were proposed in this pull request?
Return the schema in SQL format instead of a catalog string from the `SchemaOfCsv` expression.

### Why are the changes needed?
To unify the output of `schema_of_json()` and `schema_of_csv()`.

### Does this PR introduce _any_ user-facing change?
Yes. The visible output of `schema_of_csv()` changes, but the function is usually used in combination with `from_csv()`, so the exact format of the schema string should not matter much in practice (see the sketch after the examples below).

Before:
```
> SELECT schema_of_csv('1,abc');
  struct<_c0:int,_c1:string>
```

After:
```
> SELECT schema_of_csv('1,abc');
  STRUCT<`_c0`: INT, `_c1`: STRING>
```
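
Since `schema_of_csv()` is typically composed with `from_csv()`, the round trip is unaffected: `from_csv()` accepts the schema as a `Column` and parses the new SQL-formatted string just as it parsed the old catalog string. A minimal Scala sketch of that pattern (illustrative only; it assumes a running `SparkSession` named `spark`):

```scala
import scala.collection.JavaConverters._
import org.apache.spark.sql.functions.{col, from_csv, schema_of_csv}
import spark.implicits._  // assumes `spark` is an active SparkSession

val df = Seq("1,abc").toDF("csv")

// schema_of_csv("1,abc") evaluates to the new SQL-formatted string
// "STRUCT<`_c0`: INT, `_c1`: STRING>"; from_csv consumes it exactly as
// it consumed the old catalog string "struct<_c0:int,_c1:string>".
val parsed = df.select(
  from_csv(col("csv"), schema_of_csv("1,abc"), Map.empty[String, String].asJava))
```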

### How was this patch tested?
By existing test suites `CsvFunctionsSuite` and `CsvExpressionsSuite`.
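
For reference, the individual suites can be run with the usual Spark developer workflow, e.g. (module names assumed from the standard source layout):

```
build/sbt "sql/testOnly *CsvFunctionsSuite"
build/sbt "catalyst/testOnly *CsvExpressionsSuite"
```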

Closes #30180 from MaxGekk/schema_of_csv-sql-schema.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Commit b409025641 (parent 9d5e48ea95), 2020-10-29 21:02:10 +09:00
7 changed files with 14 additions and 14 deletions

@@ -1682,9 +1682,9 @@ test_that("column functions", {
   df <- as.DataFrame(list(list("col" = "1")))
   c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
-  expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+  expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
   c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
-  expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
+  expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
 
   # Test to_json(), from_json(), schema_of_json()
   df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")

@@ -50,7 +50,7 @@ license: |
 - In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z, and loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might lead to shifting of the input timestamps due to rebasing from/to Julian to/from Proleptic Gregorian calendar. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` or/and `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`.
-- In Spark 3.1, the `schema_of_json` function returns the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case.
+- In Spark 3.1, the `schema_of_json` and `schema_of_csv` functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the functions return a catalog string without field quoting and in lower case.
 
 ## Upgrading from Spark SQL 3.0 to 3.0.1

@@ -2964,9 +2964,9 @@ def schema_of_csv(csv, options={}):
     >>> df = spark.range(1)
     >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()
-    [Row(csv='struct<_c0:int,_c1:string>')]
+    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
     >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()
-    [Row(csv='struct<_c0:int,_c1:string>')]
+    [Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
     """
     if isinstance(csv, str):
         col = _create_column_from_literal(csv)

@@ -144,7 +144,7 @@ case class CsvToStructs(
   examples = """
     Examples:
       > SELECT _FUNC_('1,abc');
-       struct<_c0:int,_c1:string>
+       STRUCT<`_c0`: INT, `_c1`: STRING>
   """,
   since = "3.0.0")
 case class SchemaOfCsv(
@@ -186,7 +186,7 @@ case class SchemaOfCsv(
     val inferSchema = new CSVInferSchema(parsedOptions)
     val fieldTypes = inferSchema.inferRowType(startType, row)
     val st = StructType(inferSchema.toStructFields(fieldTypes, header))
-    UTF8String.fromString(st.catalogString)
+    UTF8String.fromString(st.sql)
   }
 
   override def prettyName: String = "schema_of_csv"
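
The heart of the patch is the one-line switch above from `catalogString` to `sql`, both of which are existing methods on Spark's `StructType`. A quick Scala sketch of the two renderings (output strings taken from the tests in this PR):

```scala
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val st = StructType(Seq(
  StructField("_c0", IntegerType),
  StructField("_c1", StringType)))

st.catalogString  // old behavior: "struct<_c0:int,_c1:string>"
st.sql            // new behavior: "STRUCT<`_c0`: INT, `_c1`: STRING>"
```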

@@ -158,13 +158,13 @@ class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with P
   }
 
   test("infer schema of CSV strings") {
-    checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "struct<_c0:int,_c1:string>")
+    checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "STRUCT<`_c0`: INT, `_c1`: STRING>")
   }
 
   test("infer schema of CSV strings by using options") {
     checkEvaluation(
       new SchemaOfCsv(Literal.create("1|abc"), Map("delimiter" -> "|")),
-      "struct<_c0:int,_c1:string>")
+      "STRUCT<`_c0`: INT, `_c1`: STRING>")
   }
 
   test("to_csv - struct") {

@@ -82,7 +82,7 @@ select schema_of_csv('1|abc', map('delimiter', '|'))
 -- !query schema
 struct<schema_of_csv(1|abc):string>
 -- !query output
-struct<_c0:int,_c1:string>
+STRUCT<`_c0`: INT, `_c1`: STRING>
 
 
 -- !query

@@ -80,16 +80,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
   test("schema_of_csv - infers schemas") {
     checkAnswer(
       spark.range(1).select(schema_of_csv(lit("0.1,1"))),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
     checkAnswer(
       spark.range(1).select(schema_of_csv("0.1,1")),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }
 
   test("schema_of_csv - infers schemas using options") {
     val df = spark.range(1)
       .select(schema_of_csv(lit("0.1 1"), Map("sep" -> " ").asJava))
-    checkAnswer(df, Seq(Row("struct<_c0:double,_c1:int>")))
+    checkAnswer(df, Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }
 
   test("to_csv - struct") {
@@ -236,7 +236,7 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
     val input = concat_ws(",", lit(0.1), lit(1))
     checkAnswer(
       spark.range(1).select(schema_of_csv(input)),
-      Seq(Row("struct<_c0:double,_c1:int>")))
+      Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
   }
 
   test("optional datetime parser does not affect csv time formatting") {