[SPARK-33281][SQL] Return SQL schema instead of Catalog string from the SchemaOfCsv
expression
### What changes were proposed in this pull request? Return schema in SQL format instead of Catalog string from the SchemaOfCsv expression. ### Why are the changes needed? To unify output of the `schema_of_json()` and `schema_of_csv()`. ### Does this PR introduce _any_ user-facing change? Yes, they can but `schema_of_csv()` is usually used in combination with `from_csv()`, so, the format of schema shouldn't be much matter. Before: ``` > SELECT schema_of_csv('1,abc'); struct<_c0:int,_c1:string> ``` After: ``` > SELECT schema_of_csv('1,abc'); STRUCT<`_c0`: INT, `_c1`: STRING> ``` ### How was this patch tested? By existing test suites `CsvFunctionsSuite` and `CsvExpressionsSuite`. Closes #30180 from MaxGekk/schema_of_csv-sql-schema. Authored-by: Max Gekk <max.gekk@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
parent
9d5e48ea95
commit
b409025641
|
@ -1682,9 +1682,9 @@ test_that("column functions", {
|
|||
|
||||
df <- as.DataFrame(list(list("col" = "1")))
|
||||
c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
|
||||
expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
|
||||
expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
|
||||
c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
|
||||
expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
|
||||
expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
|
||||
|
||||
# Test to_json(), from_json(), schema_of_json()
|
||||
df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
|
||||
|
|
|
@ -50,7 +50,7 @@ license: |
|
|||
|
||||
- In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z, and loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might lead to shifting of the input timestamps due to rebasing from/to Julian to/from Proleptic Gregorian calendar. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` or/and `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`.
|
||||
|
||||
- In Spark 3.1, the `schema_of_json` function returns the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case.
|
||||
- In Spark 3.1, the `schema_of_json` and `schema_of_csv` functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the functions return a catalog string without field quoting and in lower case.
|
||||
|
||||
## Upgrading from Spark SQL 3.0 to 3.0.1
|
||||
|
||||
|
|
|
@ -2964,9 +2964,9 @@ def schema_of_csv(csv, options={}):
|
|||
|
||||
>>> df = spark.range(1)
|
||||
>>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()
|
||||
[Row(csv='struct<_c0:int,_c1:string>')]
|
||||
[Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
|
||||
>>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()
|
||||
[Row(csv='struct<_c0:int,_c1:string>')]
|
||||
[Row(csv='STRUCT<`_c0`: INT, `_c1`: STRING>')]
|
||||
"""
|
||||
if isinstance(csv, str):
|
||||
col = _create_column_from_literal(csv)
|
||||
|
|
|
@ -144,7 +144,7 @@ case class CsvToStructs(
|
|||
examples = """
|
||||
Examples:
|
||||
> SELECT _FUNC_('1,abc');
|
||||
struct<_c0:int,_c1:string>
|
||||
STRUCT<`_c0`: INT, `_c1`: STRING>
|
||||
""",
|
||||
since = "3.0.0")
|
||||
case class SchemaOfCsv(
|
||||
|
@ -186,7 +186,7 @@ case class SchemaOfCsv(
|
|||
val inferSchema = new CSVInferSchema(parsedOptions)
|
||||
val fieldTypes = inferSchema.inferRowType(startType, row)
|
||||
val st = StructType(inferSchema.toStructFields(fieldTypes, header))
|
||||
UTF8String.fromString(st.catalogString)
|
||||
UTF8String.fromString(st.sql)
|
||||
}
|
||||
|
||||
override def prettyName: String = "schema_of_csv"
|
||||
|
|
|
@ -158,13 +158,13 @@ class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with P
|
|||
}
|
||||
|
||||
test("infer schema of CSV strings") {
|
||||
checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "struct<_c0:int,_c1:string>")
|
||||
checkEvaluation(new SchemaOfCsv(Literal.create("1,abc")), "STRUCT<`_c0`: INT, `_c1`: STRING>")
|
||||
}
|
||||
|
||||
test("infer schema of CSV strings by using options") {
|
||||
checkEvaluation(
|
||||
new SchemaOfCsv(Literal.create("1|abc"), Map("delimiter" -> "|")),
|
||||
"struct<_c0:int,_c1:string>")
|
||||
"STRUCT<`_c0`: INT, `_c1`: STRING>")
|
||||
}
|
||||
|
||||
test("to_csv - struct") {
|
||||
|
|
|
@ -82,7 +82,7 @@ select schema_of_csv('1|abc', map('delimiter', '|'))
|
|||
-- !query schema
|
||||
struct<schema_of_csv(1|abc):string>
|
||||
-- !query output
|
||||
struct<_c0:int,_c1:string>
|
||||
STRUCT<`_c0`: INT, `_c1`: STRING>
|
||||
|
||||
|
||||
-- !query
|
||||
|
|
|
@ -80,16 +80,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
|
|||
test("schema_of_csv - infers schemas") {
|
||||
checkAnswer(
|
||||
spark.range(1).select(schema_of_csv(lit("0.1,1"))),
|
||||
Seq(Row("struct<_c0:double,_c1:int>")))
|
||||
Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
|
||||
checkAnswer(
|
||||
spark.range(1).select(schema_of_csv("0.1,1")),
|
||||
Seq(Row("struct<_c0:double,_c1:int>")))
|
||||
Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
|
||||
}
|
||||
|
||||
test("schema_of_csv - infers schemas using options") {
|
||||
val df = spark.range(1)
|
||||
.select(schema_of_csv(lit("0.1 1"), Map("sep" -> " ").asJava))
|
||||
checkAnswer(df, Seq(Row("struct<_c0:double,_c1:int>")))
|
||||
checkAnswer(df, Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
|
||||
}
|
||||
|
||||
test("to_csv - struct") {
|
||||
|
@ -236,7 +236,7 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
|
|||
val input = concat_ws(",", lit(0.1), lit(1))
|
||||
checkAnswer(
|
||||
spark.range(1).select(schema_of_csv(input)),
|
||||
Seq(Row("struct<_c0:double,_c1:int>")))
|
||||
Seq(Row("STRUCT<`_c0`: DOUBLE, `_c1`: INT>")))
|
||||
}
|
||||
|
||||
test("optional datetime parser does not affect csv time formatting") {
|
||||
|
|
Loading…
Reference in a new issue