[SPARK-29986][SQL] casting string to date/timestamp/interval should trim all whitespaces
### What changes were proposed in this pull request? A Java-like string trim method trims all whitespace characters that are less than or equal to 0x20. Currently, our UTF8String handles only the space character (0x20). This is not suitable for many cases in Spark, such as trimming interval strings, dates, timestamps, and PostgreSQL-like casting of strings to boolean. ### Why are the changes needed? To improve whitespace handling in UTF8String, along with some bug fixes. ### Does this PR introduce any user-facing change? Yes, strings with a `control character` at either end can now be converted to date/timestamp and interval values. ### How was this patch tested? Added unit tests. Closes #26626 from yaooqinn/SPARK-29986. Authored-by: Kent Yao <yaooqinn@hotmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
456cfe6e46
commit
de21f28f8a
|
@ -538,14 +538,42 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
|
|||
public UTF8String trim() {
|
||||
int s = 0;
|
||||
// skip all of the space (0x20) in the left side
|
||||
while (s < this.numBytes && getByte(s) == 0x20) s++;
|
||||
while (s < this.numBytes && getByte(s) == ' ') s++;
|
||||
if (s == this.numBytes) {
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
// skip all of the space (0x20) in the right side
|
||||
int e = this.numBytes - 1;
|
||||
while (e > s && getByte(e) == 0x20) e--;
|
||||
while (e > s && getByte(e) == ' ') e--;
|
||||
if (s == 0 && e == numBytes - 1) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
}
|
||||
return copyUTF8String(s, e);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims whitespaces (<= ASCII 32) from both ends of this string.
|
||||
*
|
||||
* Note that, this method is the same as java's {@link String#trim}, and different from
|
||||
 * {@link UTF8String#trim()} which removes only spaces (= ASCII 32) from both ends.
|
||||
*
|
||||
* @return A UTF8String whose value is this UTF8String, with any leading and trailing white
|
||||
* space removed, or this UTF8String if it has no leading or trailing whitespace.
|
||||
*
|
||||
*/
|
||||
public UTF8String trimAll() {
|
||||
int s = 0;
|
||||
// skip all of the whitespaces (<=0x20) in the left side
|
||||
while (s < this.numBytes && getByte(s) <= ' ') s++;
|
||||
if (s == this.numBytes) {
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
// skip all of the whitespaces (<=0x20) in the right side
|
||||
int e = this.numBytes - 1;
|
||||
while (e > s && getByte(e) <= ' ') e--;
|
||||
if (s == 0 && e == numBytes - 1) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
|
|
|
@ -227,6 +227,7 @@ public class UTF8StringSuite {
|
|||
@Test
|
||||
public void trims() {
|
||||
assertEquals(fromString("1"), fromString("1").trim());
|
||||
assertEquals(fromString("1"), fromString("1\t").trimAll());
|
||||
|
||||
assertEquals(fromString("hello"), fromString(" hello ").trim());
|
||||
assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
|
||||
|
|
|
@ -224,6 +224,8 @@ license: |
|
|||
|
||||
- Since Spark 3.0, when casting string values to integral types, including tinyint, smallint, int and bigint type, the leading and trailing whitespaces (<= ASCII 32) will be trimmed before converting to integral values, e.g. `cast(' 1 ' as int)` results in `1`. In Spark version 2.4 and earlier, the result will be `null`.
|
||||
|
||||
- Since Spark 3.0, when casting string values to date, timestamp and interval values, the leading and trailing whitespaces (<= ASCII 32) will be trimmed before casting, e.g. `cast('2019-10-10\t' as date)` results in the date value `2019-10-10`. In Spark version 2.4 and earlier, only the trailing space will be removed, thus, the result is `null`.
|
||||
|
||||
## Upgrading from Spark SQL 2.4 to 2.4.1
|
||||
|
||||
- The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was
|
||||
|
|
|
@ -42,7 +42,7 @@ case class PostgreCastToBoolean(child: Expression, timeZoneId: Option[String])
|
|||
override def castToBoolean(from: DataType): Any => Any = from match {
|
||||
case StringType =>
|
||||
buildCast[UTF8String](_, str => {
|
||||
val s = str.trim().toLowerCase()
|
||||
val s = str.trimAll().toLowerCase()
|
||||
if (StringUtils.isTrueString(s)) {
|
||||
true
|
||||
} else if (StringUtils.isFalseString(s)) {
|
||||
|
|
|
@ -205,7 +205,7 @@ object DateTimeUtils {
|
|||
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
|
||||
var i = 0
|
||||
var currentSegmentValue = 0
|
||||
val bytes = s.trim.getBytes
|
||||
val bytes = s.trimAll().getBytes
|
||||
val specialTimestamp = convertSpecialTimestamp(bytes, timeZoneId)
|
||||
if (specialTimestamp.isDefined) return specialTimestamp
|
||||
var j = 0
|
||||
|
@ -372,7 +372,7 @@ object DateTimeUtils {
|
|||
val segments: Array[Int] = Array[Int](1, 1, 1)
|
||||
var i = 0
|
||||
var currentSegmentValue = 0
|
||||
val bytes = s.trim.getBytes
|
||||
val bytes = s.trimAll().getBytes
|
||||
val specialDate = convertSpecialDate(bytes, zoneId)
|
||||
if (specialDate.isDefined) return specialDate
|
||||
var j = 0
|
||||
|
|
|
@ -465,7 +465,7 @@ object IntervalUtils {
|
|||
throwIAE("interval string cannot be null")
|
||||
}
|
||||
// scalastyle:off caselocale .toLowerCase
|
||||
val s = input.trim.toLowerCase
|
||||
val s = input.trimAll().toLowerCase
|
||||
// scalastyle:on
|
||||
val bytes = s.getBytes
|
||||
if (bytes.isEmpty) {
|
||||
|
|
|
@ -36,3 +36,6 @@ select date '2001-10-01' - 7;
|
|||
select date '2001-10-01' - date '2001-09-28';
|
||||
select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678';
|
||||
select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01';
|
||||
|
||||
select date '2019-01-01\t';
|
||||
select timestamp '2019-01-01\t';
|
||||
|
|
|
@ -257,3 +257,6 @@ select
|
|||
interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second,
|
||||
interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second
|
||||
from interval_arithmetic;
|
||||
|
||||
-- control characters as white spaces
|
||||
select interval '\t interval 1 day';
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 130
|
||||
-- Number of queries: 131
|
||||
|
||||
|
||||
-- !query 0
|
||||
|
@ -1116,34 +1116,42 @@ struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10
|
|||
|
||||
|
||||
-- !query 118
|
||||
select 1 year 2 days
|
||||
select interval '\t interval 1 day'
|
||||
-- !query 118 schema
|
||||
struct<INTERVAL '1 years 2 days':interval>
|
||||
struct<INTERVAL '1 days':interval>
|
||||
-- !query 118 output
|
||||
1 years 2 days
|
||||
1 days
|
||||
|
||||
|
||||
-- !query 119
|
||||
select '10-9' year to month
|
||||
select 1 year 2 days
|
||||
-- !query 119 schema
|
||||
struct<INTERVAL '10 years 9 months':interval>
|
||||
struct<INTERVAL '1 years 2 days':interval>
|
||||
-- !query 119 output
|
||||
10 years 9 months
|
||||
1 years 2 days
|
||||
|
||||
|
||||
-- !query 120
|
||||
select '20 15:40:32.99899999' day to second
|
||||
select '10-9' year to month
|
||||
-- !query 120 schema
|
||||
struct<INTERVAL '20 days 15 hours 40 minutes 32.998999 seconds':interval>
|
||||
struct<INTERVAL '10 years 9 months':interval>
|
||||
-- !query 120 output
|
||||
20 days 15 hours 40 minutes 32.998999 seconds
|
||||
10 years 9 months
|
||||
|
||||
|
||||
-- !query 121
|
||||
select 30 day day
|
||||
select '20 15:40:32.99899999' day to second
|
||||
-- !query 121 schema
|
||||
struct<>
|
||||
struct<INTERVAL '20 days 15 hours 40 minutes 32.998999 seconds':interval>
|
||||
-- !query 121 output
|
||||
20 days 15 hours 40 minutes 32.998999 seconds
|
||||
|
||||
|
||||
-- !query 122
|
||||
select 30 day day
|
||||
-- !query 122 schema
|
||||
struct<>
|
||||
-- !query 122 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
no viable alternative at input 'day'(line 1, pos 14)
|
||||
|
@ -1153,27 +1161,27 @@ select 30 day day
|
|||
--------------^^^
|
||||
|
||||
|
||||
-- !query 122
|
||||
-- !query 123
|
||||
select date'2012-01-01' - '2-2' year to month
|
||||
-- !query 122 schema
|
||||
-- !query 123 schema
|
||||
struct<CAST(CAST(DATE '2012-01-01' AS TIMESTAMP) - INTERVAL '2 years 2 months' AS DATE):date>
|
||||
-- !query 122 output
|
||||
-- !query 123 output
|
||||
2009-11-01
|
||||
|
||||
|
||||
-- !query 123
|
||||
-- !query 124
|
||||
select 1 month - 1 day
|
||||
-- !query 123 schema
|
||||
-- !query 124 schema
|
||||
struct<INTERVAL '1 months -1 days':interval>
|
||||
-- !query 123 output
|
||||
-- !query 124 output
|
||||
1 months -1 days
|
||||
|
||||
|
||||
-- !query 124
|
||||
-- !query 125
|
||||
select 1 year to month
|
||||
-- !query 124 schema
|
||||
-- !query 125 schema
|
||||
struct<>
|
||||
-- !query 124 output
|
||||
-- !query 125 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
The value of from-to unit must be a string(line 1, pos 7)
|
||||
|
@ -1183,11 +1191,11 @@ select 1 year to month
|
|||
-------^^^
|
||||
|
||||
|
||||
-- !query 125
|
||||
-- !query 126
|
||||
select '1' year to second
|
||||
-- !query 125 schema
|
||||
-- !query 126 schema
|
||||
struct<>
|
||||
-- !query 125 output
|
||||
-- !query 126 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
Intervals FROM year TO second are not supported.(line 1, pos 7)
|
||||
|
@ -1197,11 +1205,11 @@ select '1' year to second
|
|||
-------^^^
|
||||
|
||||
|
||||
-- !query 126
|
||||
-- !query 127
|
||||
select 1 year '2-1' year to month
|
||||
-- !query 126 schema
|
||||
-- !query 127 schema
|
||||
struct<>
|
||||
-- !query 126 output
|
||||
-- !query 127 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
Can only have a single from-to unit in the interval literal syntax(line 1, pos 14)
|
||||
|
@ -1211,11 +1219,11 @@ select 1 year '2-1' year to month
|
|||
--------------^^^
|
||||
|
||||
|
||||
-- !query 127
|
||||
-- !query 128
|
||||
select (-30) day
|
||||
-- !query 127 schema
|
||||
-- !query 128 schema
|
||||
struct<>
|
||||
-- !query 127 output
|
||||
-- !query 128 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
no viable alternative at input 'day'(line 1, pos 13)
|
||||
|
@ -1225,11 +1233,11 @@ select (-30) day
|
|||
-------------^^^
|
||||
|
||||
|
||||
-- !query 128
|
||||
-- !query 129
|
||||
select (a + 1) day
|
||||
-- !query 128 schema
|
||||
-- !query 129 schema
|
||||
struct<>
|
||||
-- !query 128 output
|
||||
-- !query 129 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
no viable alternative at input 'day'(line 1, pos 15)
|
||||
|
@ -1239,11 +1247,11 @@ select (a + 1) day
|
|||
---------------^^^
|
||||
|
||||
|
||||
-- !query 129
|
||||
-- !query 130
|
||||
select 30 day day day
|
||||
-- !query 129 schema
|
||||
-- !query 130 schema
|
||||
struct<>
|
||||
-- !query 129 output
|
||||
-- !query 130 output
|
||||
org.apache.spark.sql.catalyst.parser.ParseException
|
||||
|
||||
no viable alternative at input 'day'(line 1, pos 14)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 17
|
||||
-- Number of queries: 19
|
||||
|
||||
|
||||
-- !query 0
|
||||
|
@ -145,3 +145,19 @@ select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01'
|
|||
struct<subtracttimestamps(TIMESTAMP '2019-10-06 10:11:12.345678', CAST(DATE '2020-01-01' AS TIMESTAMP)):interval>
|
||||
-- !query 16 output
|
||||
-2078 hours -48 minutes -47.654322 seconds
|
||||
|
||||
|
||||
-- !query 17
|
||||
select date '2019-01-01\t'
|
||||
-- !query 17 schema
|
||||
struct<DATE '2019-01-01':date>
|
||||
-- !query 17 output
|
||||
2019-01-01
|
||||
|
||||
|
||||
-- !query 18
|
||||
select timestamp '2019-01-01\t'
|
||||
-- !query 18 schema
|
||||
struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
|
||||
-- !query 18 output
|
||||
2019-01-01 00:00:00
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 118
|
||||
-- Number of queries: 119
|
||||
|
||||
|
||||
-- !query 0
|
||||
|
@ -1097,3 +1097,11 @@ from interval_arithmetic
|
|||
struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval,(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' - INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval>
|
||||
-- !query 117 output
|
||||
109 days 20 hours 30 minutes 40.246912 seconds 89 days 2 hours 14 minutes 26 seconds
|
||||
|
||||
|
||||
-- !query 118
|
||||
select interval '\t interval 1 day'
|
||||
-- !query 118 schema
|
||||
struct<INTERVAL '1 days':interval>
|
||||
-- !query 118 output
|
||||
1 days
|
||||
|
|
Loading…
Reference in a new issue