From de21f28f8a0a41dd7eb8ed1ff8b35a6d7538958b Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 25 Nov 2019 14:37:04 +0800 Subject: [PATCH] [SPARK-29986][SQL] casting string to date/timestamp/interval should trim all whitespaces ### What changes were proposed in this pull request? A Java-like string trim method trims all whitespace characters that are less than or equal to 0x20. Currently, our UTF8String handles the space (=0x20) ONLY. This is not suitable for many cases in Spark, like trim for interval strings, dates, timestamps, and PostgreSQL-like cast of string to boolean. ### Why are the changes needed? Improve the whitespace handling in UTF8String, also with some bugs fixed. ### Does this PR introduce any user-facing change? Yes, a string with a `control character` at either end can be converted to date/timestamp and interval now. ### How was this patch tested? Added unit tests. Closes #26626 from yaooqinn/SPARK-29986. Authored-by: Kent Yao Signed-off-by: Wenchen Fan --- .../apache/spark/unsafe/types/UTF8String.java | 32 +++++++- .../spark/unsafe/types/UTF8StringSuite.java | 1 + docs/sql-migration-guide.md | 2 + .../postgreSQL/PostgreCastToBoolean.scala | 2 +- .../sql/catalyst/util/DateTimeUtils.scala | 4 +- .../sql/catalyst/util/IntervalUtils.scala | 2 +- .../resources/sql-tests/inputs/datetime.sql | 3 + .../resources/sql-tests/inputs/interval.sql | 5 +- .../sql-tests/results/ansi/interval.sql.out | 80 ++++++++++--------- .../sql-tests/results/datetime.sql.out | 18 ++++- .../sql-tests/results/interval.sql.out | 10 ++- 11 files changed, 114 insertions(+), 45 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index deecd4f015..3754a1a037 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -538,14 +538,42 @@ public final class UTF8String implements Comparable,
Externalizable, public UTF8String trim() { int s = 0; // skip all of the space (0x20) in the left side - while (s < this.numBytes && getByte(s) == 0x20) s++; + while (s < this.numBytes && getByte(s) == ' ') s++; if (s == this.numBytes) { // Everything trimmed return EMPTY_UTF8; } // skip all of the space (0x20) in the right side int e = this.numBytes - 1; - while (e > s && getByte(e) == 0x20) e--; + while (e > s && getByte(e) == ' ') e--; + if (s == 0 && e == numBytes - 1) { + // Nothing trimmed + return this; + } + return copyUTF8String(s, e); + } + + /** + * Trims whitespaces (<= ASCII 32) from both ends of this string. + * + * Note that this method is the same as Java's {@link String#trim}, and different from + * {@link UTF8String#trim()} which removes only spaces(= ASCII 32) from both ends. + * + * @return A UTF8String whose value is this UTF8String, with any leading and trailing white + * space removed, or this UTF8String if it has no leading or trailing whitespace. + * + */ + public UTF8String trimAll() { + int s = 0; + // skip all of the whitespaces (<=0x20) in the left side + while (s < this.numBytes && getByte(s) <= ' ') s++; + if (s == this.numBytes) { + // Everything trimmed + return EMPTY_UTF8; + } + // skip all of the whitespaces (<=0x20) in the right side + int e = this.numBytes - 1; + while (e > s && getByte(e) <= ' ') e--; if (s == 0 && e == numBytes - 1) { // Nothing trimmed return this; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index dbede9bc7f..8f933877f8 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -227,6 +227,7 @@ public class UTF8StringSuite { @Test public void trims() { assertEquals(fromString("1"), fromString("1").trim()); + assertEquals(fromString("1"),
fromString("1\t").trimAll()); assertEquals(fromString("hello"), fromString(" hello ").trim()); assertEquals(fromString("hello "), fromString(" hello ").trimLeft()); diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 6fc78893e6..74ba694800 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -224,6 +224,8 @@ license: | - Since Spark 3.0, when casting string value to integral types, including tinyint, smallint, int and bigint type, the leading and trailing white spaces(<= ACSII 32) will be trimmed before convert to integral values, e.g. `cast(' 1 ' as int)` results `1`. In Spark version 2.4 and earlier, the result will be `null`. + - Since Spark 3.0, when casting string value to date, timestamp and interval values, the leading and trailing white spaces(<= ASCII 32) will be trimmed before casting, e.g. `cast('2019-10-10\t' as date)` results the date value `2019-10-10`. In Spark version 2.4 and earlier, only the trailing space will be removed, thus, the result is `null`.
+ ## Upgrading from Spark SQL 2.4 to 2.4.1 - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/postgreSQL/PostgreCastToBoolean.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/postgreSQL/PostgreCastToBoolean.scala index 20559ba3cd..02bc6f0d0d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/postgreSQL/PostgreCastToBoolean.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/postgreSQL/PostgreCastToBoolean.scala @@ -42,7 +42,7 @@ case class PostgreCastToBoolean(child: Expression, timeZoneId: Option[String]) override def castToBoolean(from: DataType): Any => Any = from match { case StringType => buildCast[UTF8String](_, str => { - val s = str.trim().toLowerCase() + val s = str.trimAll().toLowerCase() if (StringUtils.isTrueString(s)) { true } else if (StringUtils.isFalseString(s)) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index f9f5e83d77..6800abb2ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -205,7 +205,7 @@ object DateTimeUtils { val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0) var i = 0 var currentSegmentValue = 0 - val bytes = s.trim.getBytes + val bytes = s.trimAll().getBytes val specialTimestamp = convertSpecialTimestamp(bytes, timeZoneId) if (specialTimestamp.isDefined) return specialTimestamp var j = 0 @@ -372,7 +372,7 @@ object DateTimeUtils { val segments: Array[Int] = Array[Int](1, 1, 1) var i = 0 var currentSegmentValue = 0 - val bytes = s.trim.getBytes + val bytes = s.trimAll().getBytes val specialDate = 
convertSpecialDate(bytes, zoneId) if (specialDate.isDefined) return specialDate var j = 0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index 9418d8eec3..b8c7e4a9ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -465,7 +465,7 @@ object IntervalUtils { throwIAE("interval string cannot be null") } // scalastyle:off caselocale .toLowerCase - val s = input.trim.toLowerCase + val s = input.trimAll().toLowerCase // scalastyle:on val bytes = s.getBytes if (bytes.isEmpty) { diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index 0e22af1fbd..de2040c651 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -36,3 +36,6 @@ select date '2001-10-01' - 7; select date '2001-10-01' - date '2001-09-28'; select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678'; select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01'; + +select date '2019-01-01\t'; +select timestamp '2019-01-01\t'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index 94d5aae9bc..886e242460 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -256,4 +256,7 @@ from interval_arithmetic; select interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second, interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second -from interval_arithmetic; \ No newline at end of file +from interval_arithmetic; + +-- control characters as white spaces +select 
interval '\t interval 1 day'; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index bceb6bd1d2..56628f615c 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 130 +-- Number of queries: 131 -- !query 0 @@ -1116,34 +1116,42 @@ struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10 -- !query 118 -select 1 year 2 days +select interval '\t interval 1 day' -- !query 118 schema -struct +struct -- !query 118 output -1 years 2 days +1 days -- !query 119 -select '10-9' year to month +select 1 year 2 days -- !query 119 schema -struct +struct -- !query 119 output -10 years 9 months +1 years 2 days -- !query 120 -select '20 15:40:32.99899999' day to second +select '10-9' year to month -- !query 120 schema -struct +struct -- !query 120 output -20 days 15 hours 40 minutes 32.998999 seconds +10 years 9 months -- !query 121 -select 30 day day +select '20 15:40:32.99899999' day to second -- !query 121 schema -struct<> +struct -- !query 121 output +20 days 15 hours 40 minutes 32.998999 seconds + + +-- !query 122 +select 30 day day +-- !query 122 schema +struct<> +-- !query 122 output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'day'(line 1, pos 14) @@ -1153,27 +1161,27 @@ select 30 day day --------------^^^ --- !query 122 +-- !query 123 select date'2012-01-01' - '2-2' year to month --- !query 122 schema +-- !query 123 schema struct --- !query 122 output +-- !query 123 output 2009-11-01 --- !query 123 +-- !query 124 select 1 month - 1 day --- !query 123 schema +-- !query 124 schema struct --- !query 123 output +-- !query 124 output 1 months -1 days --- !query 124 +-- !query 125 select 1 year to month --- !query 124 schema +-- !query 
125 schema struct<> --- !query 124 output +-- !query 125 output org.apache.spark.sql.catalyst.parser.ParseException The value of from-to unit must be a string(line 1, pos 7) @@ -1183,11 +1191,11 @@ select 1 year to month -------^^^ --- !query 125 +-- !query 126 select '1' year to second --- !query 125 schema +-- !query 126 schema struct<> --- !query 125 output +-- !query 126 output org.apache.spark.sql.catalyst.parser.ParseException Intervals FROM year TO second are not supported.(line 1, pos 7) @@ -1197,11 +1205,11 @@ select '1' year to second -------^^^ --- !query 126 +-- !query 127 select 1 year '2-1' year to month --- !query 126 schema +-- !query 127 schema struct<> --- !query 126 output +-- !query 127 output org.apache.spark.sql.catalyst.parser.ParseException Can only have a single from-to unit in the interval literal syntax(line 1, pos 14) @@ -1211,11 +1219,11 @@ select 1 year '2-1' year to month --------------^^^ --- !query 127 +-- !query 128 select (-30) day --- !query 127 schema +-- !query 128 schema struct<> --- !query 127 output +-- !query 128 output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'day'(line 1, pos 13) @@ -1225,11 +1233,11 @@ select (-30) day -------------^^^ --- !query 128 +-- !query 129 select (a + 1) day --- !query 128 schema +-- !query 129 schema struct<> --- !query 128 output +-- !query 129 output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'day'(line 1, pos 15) @@ -1239,11 +1247,11 @@ select (a + 1) day ---------------^^^ --- !query 129 +-- !query 130 select 30 day day day --- !query 129 schema +-- !query 130 schema struct<> --- !query 129 output +-- !query 130 output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'day'(line 1, pos 14) diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index b2c6b878f4..a269420a9a 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 17 +-- Number of queries: 19 -- !query 0 @@ -145,3 +145,19 @@ select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01' struct -- !query 16 output -2078 hours -48 minutes -47.654322 seconds + + +-- !query 17 +select date '2019-01-01\t' +-- !query 17 schema +struct +-- !query 17 output +2019-01-01 + + +-- !query 18 +select timestamp '2019-01-01\t' +-- !query 18 schema +struct +-- !query 18 output +2019-01-01 00:00:00 diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out index bda5fc6d8b..bf7b2184b6 100644 --- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 118 +-- Number of queries: 119 -- !query 0 @@ -1097,3 +1097,11 @@ from interval_arithmetic struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval,(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' - INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval> -- !query 117 output 109 days 20 hours 30 minutes 40.246912 seconds 89 days 2 hours 14 minutes 26 seconds + + +-- !query 118 +select interval '\t interval 1 day' +-- !query 118 schema +struct +-- !query 118 output +1 days