From de44e9cfa07e32d293d68355916ac0dbd31d5c54 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 7 Sep 2020 05:11:30 +0000 Subject: [PATCH] [SPARK-32785][SQL] Interval with dangling parts should not results null ### What changes were proposed in this pull request? bugfix for incomplete interval values, e.g. interval '1', interval '1 day 2', currently these cases will result null, but actually we should fail them with IllegalArgumentsException ### Why are the changes needed? correctness ### Does this PR introduce _any_ user-facing change? yes, incomplete intervals will throw exception now #### before ``` bin/spark-sql -S -e "select interval '1', interval '+', interval '1 day -'" NULL NULL NULL ``` #### after ``` -- !query select interval '1' -- !query schema struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException Cannot parse the INTERVAL value: 1(line 1, pos 7) == SQL == select interval '1' ``` ### How was this patch tested? unit tests added Closes #29635 from yaooqinn/SPARK-32785. Authored-by: Kent Yao Signed-off-by: Wenchen Fan --- docs/sql-migration-guide.md | 2 + .../sql/catalyst/util/IntervalUtils.scala | 5 +- .../catalyst/util/IntervalUtilsSuite.scala | 13 +++ .../resources/sql-tests/inputs/interval.sql | 8 ++ .../sql-tests/results/ansi/interval.sql.out | 100 +++++++++++++++++- .../sql-tests/results/interval.sql.out | 100 +++++++++++++++++- 6 files changed, 225 insertions(+), 3 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 0e03eac410..de60aed748 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -40,6 +40,8 @@ license: | - In Spark 3.1, `path` option cannot coexist when the following methods are called with path parameter(s): `DataFrameReader.load()`, `DataFrameWriter.save()`, `DataStreamReader.load()`, or `DataStreamWriter.start()`. In addition, `paths` option cannot coexist for `DataFrameReader.load()`. For example, `spark.read.format("csv").option("path", "/tmp").load("/tmp2")` or `spark.read.option("path", "/tmp").csv("/tmp2")` will throw `org.apache.spark.sql.AnalysisException`. In Spark version 3.0 and below, `path` option is overwritten if one path parameter is passed to above methods; `path` option is added to the overall paths if multiple path parameters are passed to `DataFrameReader.load()`. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.pathOptionBehavior.enabled` to `true`. + - In Spark 3.1, incomplete interval literals, e.g. `INTERVAL '1'`, `INTERVAL '1 DAY 2'` will fail with IllegalArgumentException. In Spark 3.0, they result `NULL`s. + ## Upgrading from Spark SQL 3.0 to 3.0.1 - In Spark 3.0, JSON datasource and JSON function `schema_of_json` infer TimestampType from string values if they match to the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, the timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index b1d4c119fa..f716ca1777 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -736,7 +736,10 @@ object IntervalUtils { val result = state match { case UNIT_SUFFIX | UNIT_END | TRIM_BEFORE_SIGN => new CalendarInterval(months, days, microseconds) - case _ => null + case TRIM_BEFORE_VALUE => throwIAE(s"expect a number after '$currentWord' but hit EOL") + case VALUE | VALUE_FRACTIONAL_PART => + throwIAE(s"expect a unit name after '$currentWord' but hit EOL") + case _ => throwIAE(s"unknown error when parsing '$currentWord'") } result diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala index d560d1048b..c313b54687 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala @@ -77,6 +77,19 @@ class IntervalUtilsSuite extends SparkFunSuite with SQLHelper { } } + test("string to interval: interval with dangling parts should not results null") { + checkFromInvalidString("+", "expect a number after '+' but hit EOL") + checkFromInvalidString("-", "expect a number after '-' but hit EOL") + checkFromInvalidString("+ 2", "expect a unit name after '2' but hit EOL") + checkFromInvalidString("- 1", "expect a unit name after '1' but hit EOL") + checkFromInvalidString("1", "expect a unit name after '1' but hit EOL") + checkFromInvalidString("1.2", "expect a unit name after '1.2' but hit EOL") + checkFromInvalidString("1 day 2", "expect a unit name after '2' but hit EOL") + checkFromInvalidString("1 day 2.2", "expect a unit name after '2.2' but hit EOL") + checkFromInvalidString("1 day -", "expect a number after '-' but hit EOL") + checkFromInvalidString("-.", "expect a unit name after '-.' but hit EOL") + } + test("string to interval: multiple units") { Seq( "-1 MONTH 1 day -1 microseconds" -> new CalendarInterval(-1, 1, -1), diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index 2f6a2c9fef..e925c4508f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -198,3 +198,11 @@ SELECT to_json(from_json('{"a":"1 days"}', 'a interval')), to_json(map('a', interval 25 month 100 day 130 minute)), from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval'); + +select interval '+'; +select interval '+.'; +select interval '1'; +select interval '1.2'; +select interval '- 2'; +select interval '1 day -'; +select interval '1 day 1'; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index cf8a72ed01..33d918bbeb 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 100 +-- Number of queries: 107 -- !query @@ -1024,3 +1024,101 @@ SELECT struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> -- !query output {"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} + + +-- !query +select interval '+' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +(line 1, pos 7) + +== SQL == +select interval '+' +-------^^^ + + +-- !query +select interval '+.' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +.(line 1, pos 7) + +== SQL == +select interval '+.' +-------^^^ + + +-- !query +select interval '1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1(line 1, pos 7) + +== SQL == +select interval '1' +-------^^^ + + +-- !query +select interval '1.2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1.2(line 1, pos 7) + +== SQL == +select interval '1.2' +-------^^^ + + +-- !query +select interval '- 2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: - 2(line 1, pos 7) + +== SQL == +select interval '- 2' +-------^^^ + + +-- !query +select interval '1 day -' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day -(line 1, pos 7) + +== SQL == +select interval '1 day -' +-------^^^ + + +-- !query +select interval '1 day 1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day 1(line 1, pos 7) + +== SQL == +select interval '1 day 1' +-------^^^ diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out index 99b2a6ff7e..898be09a40 100644 --- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 100 +-- Number of queries: 107 -- !query @@ -1012,3 +1012,101 @@ SELECT struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> -- !query output {"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} + + +-- !query +select interval '+' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +(line 1, pos 7) + +== SQL == +select interval '+' +-------^^^ + + +-- !query +select interval '+.' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +.(line 1, pos 7) + +== SQL == +select interval '+.' +-------^^^ + + +-- !query +select interval '1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1(line 1, pos 7) + +== SQL == +select interval '1' +-------^^^ + + +-- !query +select interval '1.2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1.2(line 1, pos 7) + +== SQL == +select interval '1.2' +-------^^^ + + +-- !query +select interval '- 2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: - 2(line 1, pos 7) + +== SQL == +select interval '- 2' +-------^^^ + + +-- !query +select interval '1 day -' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day -(line 1, pos 7) + +== SQL == +select interval '1 day -' +-------^^^ + + +-- !query +select interval '1 day 1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day 1(line 1, pos 7) + +== SQL == +select interval '1 day 1' +-------^^^