[SPARK-29986][SQL] casting string to date/timestamp/interval should trim all whitespaces

### What changes were proposed in this pull request?

A Java-like string trim method trims all whitespace characters that are less than or equal to 0x20. Currently, our UTF8String handles only the space character (0x20). This is not suitable for many cases in Spark, such as trimming interval strings, dates, timestamps, and PostgreSQL-like casting of strings to booleans.

### Why are the changes needed?

Improve the whitespace handling in UTF8String, along with some bug fixes.

### Does this PR introduce any user-facing change?

yes,
strings with a `control character` at either end can be converted to date/timestamp and interval now

### How was this patch tested?

add ut

Closes #26626 from yaooqinn/SPARK-29986.

Authored-by: Kent Yao <yaooqinn@hotmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
Kent Yao 2019-11-25 14:37:04 +08:00 committed by Wenchen Fan
parent 456cfe6e46
commit de21f28f8a
11 changed files with 114 additions and 45 deletions

View file

@ -538,14 +538,42 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
/**
 * Trims space characters (ASCII 0x20 only) from both ends of this string.
 *
 * Note: unlike {@link String#trim}, this removes only the space character,
 * not other whitespace/control characters below 0x20 — see trimAll() for that.
 *
 * @return this UTF8String with leading/trailing spaces removed; EMPTY_UTF8 if
 *         the string is all spaces, or this exact instance if nothing was trimmed.
 */
public UTF8String trim() {
  int s = 0;
  // Skip all of the spaces (0x20) on the left side. Comparing the raw byte to
  // ' ' is safe because 0x20 is a single-byte code point in UTF-8.
  while (s < this.numBytes && getByte(s) == ' ') s++;
  if (s == this.numBytes) {
    // Everything trimmed
    return EMPTY_UTF8;
  }
  // Skip all of the spaces (0x20) on the right side.
  int e = this.numBytes - 1;
  while (e > s && getByte(e) == ' ') e--;
  if (s == 0 && e == numBytes - 1) {
    // Nothing trimmed; avoid an allocation by returning this instance.
    return this;
  }
  return copyUTF8String(s, e);
}
/**
* Trims whitespaces (<= ASCII 32) from both ends of this string.
*
* Note that, this method is the same as java's {@link String#trim}, and different from
* {@link UTF8String#trim()} which removes only spaces (= ASCII 32) from both ends.
*
* @return A UTF8String whose value is this UTF8String, with any leading and trailing white
* space removed, or this UTF8String if it has no leading or trailing whitespace.
*
*/
public UTF8String trimAll() {
int s = 0;
// skip all of the whitespaces (<=0x20) in the left side
while (s < this.numBytes && getByte(s) <= ' ') s++;
if (s == this.numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
// skip all of the whitespaces (<=0x20) in the right side
int e = this.numBytes - 1;
while (e > s && getByte(e) <= ' ') e--;
if (s == 0 && e == numBytes - 1) {
// Nothing trimmed
return this;

View file

@ -227,6 +227,7 @@ public class UTF8StringSuite {
@Test
public void trims() {
assertEquals(fromString("1"), fromString("1").trim());
assertEquals(fromString("1"), fromString("1\t").trimAll());
assertEquals(fromString("hello"), fromString(" hello ").trim());
assertEquals(fromString("hello "), fromString(" hello ").trimLeft());

View file

@ -224,6 +224,8 @@ license: |
- Since Spark 3.0, when casting a string value to integral types, including tinyint, smallint, int and bigint type, the leading and trailing white spaces (<= ASCII 32) will be trimmed before being converted to integral values, e.g. `cast(' 1 ' as int)` results in `1`. In Spark version 2.4 and earlier, the result will be `null`.
- Since Spark 3.0, when casting a string value to date, timestamp and interval values, the leading and trailing white spaces (<= ASCII 32) will be trimmed before casting, e.g. `cast('2019-10-10\t' as date)` results in the date value `2019-10-10`. In Spark version 2.4 and earlier, only the trailing space will be removed, thus, the result is `null`.
## Upgrading from Spark SQL 2.4 to 2.4.1
- The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was

View file

@ -42,7 +42,7 @@ case class PostgreCastToBoolean(child: Expression, timeZoneId: Option[String])
override def castToBoolean(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, str => {
val s = str.trim().toLowerCase()
val s = str.trimAll().toLowerCase()
if (StringUtils.isTrueString(s)) {
true
} else if (StringUtils.isFalseString(s)) {

View file

@ -205,7 +205,7 @@ object DateTimeUtils {
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
var i = 0
var currentSegmentValue = 0
val bytes = s.trim.getBytes
val bytes = s.trimAll().getBytes
val specialTimestamp = convertSpecialTimestamp(bytes, timeZoneId)
if (specialTimestamp.isDefined) return specialTimestamp
var j = 0
@ -372,7 +372,7 @@ object DateTimeUtils {
val segments: Array[Int] = Array[Int](1, 1, 1)
var i = 0
var currentSegmentValue = 0
val bytes = s.trim.getBytes
val bytes = s.trimAll().getBytes
val specialDate = convertSpecialDate(bytes, zoneId)
if (specialDate.isDefined) return specialDate
var j = 0

View file

@ -465,7 +465,7 @@ object IntervalUtils {
throwIAE("interval string cannot be null")
}
// scalastyle:off caselocale .toLowerCase
val s = input.trim.toLowerCase
val s = input.trimAll().toLowerCase
// scalastyle:on
val bytes = s.getBytes
if (bytes.isEmpty) {

View file

@ -36,3 +36,6 @@ select date '2001-10-01' - 7;
select date '2001-10-01' - date '2001-09-28';
select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678';
select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01';
select date '2019-01-01\t';
select timestamp '2019-01-01\t';

View file

@ -257,3 +257,6 @@ select
interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second,
interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second
from interval_arithmetic;
-- control characters as white spaces
select interval '\t interval 1 day';

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 130
-- Number of queries: 131
-- !query 0
@ -1116,34 +1116,42 @@ struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10
-- !query 118
select 1 year 2 days
select interval '\t interval 1 day'
-- !query 118 schema
struct<INTERVAL '1 years 2 days':interval>
struct<INTERVAL '1 days':interval>
-- !query 118 output
1 years 2 days
1 days
-- !query 119
select '10-9' year to month
select 1 year 2 days
-- !query 119 schema
struct<INTERVAL '10 years 9 months':interval>
struct<INTERVAL '1 years 2 days':interval>
-- !query 119 output
10 years 9 months
1 years 2 days
-- !query 120
select '20 15:40:32.99899999' day to second
select '10-9' year to month
-- !query 120 schema
struct<INTERVAL '20 days 15 hours 40 minutes 32.998999 seconds':interval>
struct<INTERVAL '10 years 9 months':interval>
-- !query 120 output
20 days 15 hours 40 minutes 32.998999 seconds
10 years 9 months
-- !query 121
select 30 day day
select '20 15:40:32.99899999' day to second
-- !query 121 schema
struct<>
struct<INTERVAL '20 days 15 hours 40 minutes 32.998999 seconds':interval>
-- !query 121 output
20 days 15 hours 40 minutes 32.998999 seconds
-- !query 122
select 30 day day
-- !query 122 schema
struct<>
-- !query 122 output
org.apache.spark.sql.catalyst.parser.ParseException
no viable alternative at input 'day'(line 1, pos 14)
@ -1153,27 +1161,27 @@ select 30 day day
--------------^^^
-- !query 122
-- !query 123
select date'2012-01-01' - '2-2' year to month
-- !query 122 schema
-- !query 123 schema
struct<CAST(CAST(DATE '2012-01-01' AS TIMESTAMP) - INTERVAL '2 years 2 months' AS DATE):date>
-- !query 122 output
-- !query 123 output
2009-11-01
-- !query 123
-- !query 124
select 1 month - 1 day
-- !query 123 schema
-- !query 124 schema
struct<INTERVAL '1 months -1 days':interval>
-- !query 123 output
-- !query 124 output
1 months -1 days
-- !query 124
-- !query 125
select 1 year to month
-- !query 124 schema
-- !query 125 schema
struct<>
-- !query 124 output
-- !query 125 output
org.apache.spark.sql.catalyst.parser.ParseException
The value of from-to unit must be a string(line 1, pos 7)
@ -1183,11 +1191,11 @@ select 1 year to month
-------^^^
-- !query 125
-- !query 126
select '1' year to second
-- !query 125 schema
-- !query 126 schema
struct<>
-- !query 125 output
-- !query 126 output
org.apache.spark.sql.catalyst.parser.ParseException
Intervals FROM year TO second are not supported.(line 1, pos 7)
@ -1197,11 +1205,11 @@ select '1' year to second
-------^^^
-- !query 126
-- !query 127
select 1 year '2-1' year to month
-- !query 126 schema
-- !query 127 schema
struct<>
-- !query 126 output
-- !query 127 output
org.apache.spark.sql.catalyst.parser.ParseException
Can only have a single from-to unit in the interval literal syntax(line 1, pos 14)
@ -1211,11 +1219,11 @@ select 1 year '2-1' year to month
--------------^^^
-- !query 127
-- !query 128
select (-30) day
-- !query 127 schema
-- !query 128 schema
struct<>
-- !query 127 output
-- !query 128 output
org.apache.spark.sql.catalyst.parser.ParseException
no viable alternative at input 'day'(line 1, pos 13)
@ -1225,11 +1233,11 @@ select (-30) day
-------------^^^
-- !query 128
-- !query 129
select (a + 1) day
-- !query 128 schema
-- !query 129 schema
struct<>
-- !query 128 output
-- !query 129 output
org.apache.spark.sql.catalyst.parser.ParseException
no viable alternative at input 'day'(line 1, pos 15)
@ -1239,11 +1247,11 @@ select (a + 1) day
---------------^^^
-- !query 129
-- !query 130
select 30 day day day
-- !query 129 schema
-- !query 130 schema
struct<>
-- !query 129 output
-- !query 130 output
org.apache.spark.sql.catalyst.parser.ParseException
no viable alternative at input 'day'(line 1, pos 14)

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 17
-- Number of queries: 19
-- !query 0
@ -145,3 +145,19 @@ select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01'
struct<subtracttimestamps(TIMESTAMP '2019-10-06 10:11:12.345678', CAST(DATE '2020-01-01' AS TIMESTAMP)):interval>
-- !query 16 output
-2078 hours -48 minutes -47.654322 seconds
-- !query 17
select date '2019-01-01\t'
-- !query 17 schema
struct<DATE '2019-01-01':date>
-- !query 17 output
2019-01-01
-- !query 18
select timestamp '2019-01-01\t'
-- !query 18 schema
struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
-- !query 18 output
2019-01-01 00:00:00

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 118
-- Number of queries: 119
-- !query 0
@ -1097,3 +1097,11 @@ from interval_arithmetic
struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval,(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' - INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval>
-- !query 117 output
109 days 20 hours 30 minutes 40.246912 seconds 89 days 2 hours 14 minutes 26 seconds
-- !query 118
select interval '\t interval 1 day'
-- !query 118 schema
struct<INTERVAL '1 days':interval>
-- !query 118 output
1 days