[SPARK-28123][SQL] String Functions: support btrim

### What changes were proposed in this pull request?
Spark currently supports `trim`/`ltrim`/`rtrim`. The function `btrim` is an alternate form of `TRIM(BOTH <chars> FROM <expr>)`.
`btrim` removes the longest string consisting only of specified characters from the start and end of a string.

The mainstream databases that support this feature are listed below:

**PostgreSQL**
https://www.postgresql.org/docs/11/functions-binarystring.html

**Vertica**
https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/SQLReferenceManual/Functions/String/BTRIM.htm?tocpath=SQL%20Reference%20Manual%7CSQL%20Functions%7CString%20Functions%7C_____5

**Redshift**
https://docs.aws.amazon.com/redshift/latest/dg/r_BTRIM.html

**Druid**
https://druid.apache.org/docs/latest/querying/sql.html#string-functions

**Greenplum**
http://docs.greenplum.org/6-8/ref_guide/function-summary.html

### Why are the changes needed?
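`btrim` is very useful: the mainstream databases listed above all provide it, so supporting it in Spark makes it easier to port existing queries.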

### Does this PR introduce _any_ user-facing change?
Yes. `btrim` is a new function.

### How was this patch tested?
Jenkins test.

Closes #31390 from beliefer/SPARK-28123-support-btrim.

Authored-by: gengjiaan <gengjiaan@360.cn>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
gengjiaan 2021-02-19 13:28:49 +00:00 committed by Wenchen Fan
parent 27abb6ab56
commit 06df1210d4
8 changed files with 151 additions and 8 deletions

View file

@ -375,6 +375,7 @@ object FunctionRegistry {
expression[SubstringIndex]("substring_index"),
expression[StringTranslate]("translate"),
expression[StringTrim]("trim"),
expression[StringTrimBoth]("btrim"),
expression[Upper]("ucase", true),
expression[UnBase64]("unbase64"),
expression[Unhex]("unhex"),

View file

@ -920,6 +920,54 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None)
override val trimMethod: String = "trim"
}
/**
 * A function that takes a character string, removes the leading and trailing characters that
 * match any character in the trim string, and returns the new string. It is exposed in SQL under
 * the name `btrim` (see `prettyName`).
 *
 * srcStr: The source string to be trimmed.
 * trimStr: A character string to be trimmed from the source string. If it has multiple
 * characters, the function searches for each character in the source string and removes matching
 * characters from both ends until it encounters the first non-matching character on that end.
 * When it is not supplied, the documented default is a single space.
 * child: The expression this function stands for. Both auxiliary constructors set it to a
 * `StringTrim` built from the same arguments.
 * NOTE(review): presumably `RuntimeReplaceable` causes `child` to be evaluated in place of this
 * node -- confirm against the `RuntimeReplaceable` definition.
 */
@ExpressionDescription(
usage = """
_FUNC_(str) - Removes the leading and trailing space characters from `str`.
_FUNC_(str, trimStr) - Remove the leading and trailing `trimStr` characters from `str`.
""",
arguments = """
Arguments:
* str - a string expression
* trimStr - the trim string characters to trim, the default value is a single space
""",
examples = """
Examples:
> SELECT _FUNC_(' SparkSQL ');
SparkSQL
> SELECT _FUNC_(encode(' SparkSQL ', 'utf-8'));
SparkSQL
> SELECT _FUNC_('SSparkSQLS', 'SL');
parkSQ
> SELECT _FUNC_(encode('SSparkSQLS', 'utf-8'), encode('SL', 'utf-8'));
parkSQ
""",
since = "3.2.0",
group = "string_funcs")
case class StringTrimBoth(srcStr: Expression, trimStr: Option[Expression], child: Expression)
extends RuntimeReplaceable {
// Two-argument form, `btrim(str, trimStr)`: the child is `StringTrim(srcStr, trimStr)`.
// `Option(...)` (rather than `Some(...)`) maps a null `trimStr` reference to `None`.
def this(srcStr: Expression, trimStr: Expression) = {
this(srcStr, Option(trimStr), StringTrim(srcStr, trimStr))
}
// One-argument form, `btrim(str)`: no trim string is recorded and the child is
// `StringTrim(srcStr)`.
def this(srcStr: Expression) = {
this(srcStr, None, StringTrim(srcStr))
}
// The original arguments of this function: `srcStr`, followed by `trimStr` when it is defined.
// `child` is deliberately not part of this list.
override def exprsReplaced: Seq[Expression] = srcStr +: trimStr.toSeq
// Yields `srcStr` and the `trimStr` Option itself (not unwrapped), omitting `child`.
// NOTE(review): presumably this keeps the derived `child` out of the node's string
// representation -- confirm against how `flatArguments` is consumed.
override def flatArguments: Iterator[Any] = Iterator(srcStr, trimStr)
// The SQL-facing name of this function.
override def prettyName: String = "btrim"
}
object StringTrimLeft {
def apply(str: Expression, trimStr: Expression): StringTrimLeft =
StringTrimLeft(str, Some(trimStr))

View file

@ -1,6 +1,6 @@
<!-- Automatically generated by ExpressionsSchemaSuite -->
## Summary
- Number of queries: 350
- Number of queries: 351
- Number of expressions that missing example: 13
- Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window
## Schema of Built-in Functions
@ -269,6 +269,7 @@
| org.apache.spark.sql.catalyst.expressions.StringToMap | str_to_map | SELECT str_to_map('a:1,b:2,c:3', ',', ':') | struct<str_to_map(a:1,b:2,c:3, ,, :):map<string,string>> |
| org.apache.spark.sql.catalyst.expressions.StringTranslate | translate | SELECT translate('AaBbCc', 'abc', '123') | struct<translate(AaBbCc, abc, 123):string> |
| org.apache.spark.sql.catalyst.expressions.StringTrim | trim | SELECT trim(' SparkSQL ') | struct<trim( SparkSQL ):string> |
| org.apache.spark.sql.catalyst.expressions.StringTrimBoth | btrim | SELECT btrim(' SparkSQL ') | struct<btrim( SparkSQL ):string> |
| org.apache.spark.sql.catalyst.expressions.StringTrimLeft | ltrim | SELECT ltrim(' SparkSQL ') | struct<ltrim( SparkSQL ):string> |
| org.apache.spark.sql.catalyst.expressions.StringTrimRight | rtrim | SELECT rtrim(' SparkSQL ') | struct<rtrim( SparkSQL ):string> |
| org.apache.spark.sql.catalyst.expressions.StructsToCsv | to_csv | SELECT to_csv(named_struct('a', 1, 'b', 2)) | struct<to_csv(named_struct(a, 1, b, 2)):string> |

View file

@ -647,10 +647,9 @@ SELECT repeat('Pg', 4);
SELECT repeat('Pg', -4);
SELECT trim(binary('\\000') from binary('\\000Tom\\000'));
-- [SPARK-28123] Add support btrim
-- SELECT btrim(E'\\000trim\\000'::bytea, E'\\000'::bytea);
-- SELECT btrim(''::bytea, E'\\000'::bytea);
-- SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT btrim(binary('\\000trim\\000'), binary('\\000'));
SELECT btrim(binary(''), binary('\\000'));
SELECT btrim(binary('\\000trim\\000'), binary(''));
-- [SPARK-28121] decode can not accept 'escape' as charset
-- [SPARK-28412][SQL] ANSI SQL: OVERLAY function support byte array
-- SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');

View file

@ -50,6 +50,12 @@ SELECT trim(TRAILING 'xyz' FROM 'testxxzx');
SELECT trim(TRAILING 'xyz' FROM 'xyztestxxzx');
SELECT trim(TRAILING 'xy' FROM 'TURNERyxXxy');
-- btrim
SELECT btrim('xyxtrimyyx', 'xy');
SELECT btrim(encode(" xyz ", 'utf-8'));
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'));
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'));
-- Check lpad/rpad with invalid length parameter
SELECT lpad('hi', 'invalid_length');
SELECT rpad('hi', 'invalid_length');

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 44
-- Number of queries: 48
-- !query
@ -278,6 +278,38 @@ struct<TRIM(TRAILING xy FROM TURNERyxXxy):string>
TURNERyxX
-- !query
SELECT btrim('xyxtrimyyx', 'xy')
-- !query schema
struct<btrim(xyxtrimyyx, xy):string>
-- !query output
trim
-- !query
SELECT btrim(encode(" xyz ", 'utf-8'))
-- !query schema
struct<btrim(encode( xyz , utf-8)):string>
-- !query output
xyz
-- !query
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
-- !query schema
struct<btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)):string>
-- !query output
Tom
-- !query
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
-- !query schema
struct<btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)):string>
-- !query output
bar
-- !query
SELECT lpad('hi', 'invalid_length')
-- !query schema

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 121
-- Number of queries: 124
-- !query
@ -982,6 +982,30 @@ struct<TRIM(BOTH \000 FROM \000Tom\000):string>
Tom
-- !query
SELECT btrim(binary('\\000trim\\000'), binary('\\000'))
-- !query schema
struct<btrim(\000trim\000, \000):string>
-- !query output
trim
-- !query
SELECT btrim(binary(''), binary('\\000'))
-- !query schema
struct<btrim(, \000):string>
-- !query output
-- !query
SELECT btrim(binary('\\000trim\\000'), binary(''))
-- !query schema
struct<btrim(\000trim\000, ):string>
-- !query output
\000trim\000
-- !query
DROP TABLE toasttest
-- !query schema

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 44
-- Number of queries: 48
-- !query
@ -276,6 +276,38 @@ struct<TRIM(TRAILING xy FROM TURNERyxXxy):string>
TURNERyxX
-- !query
SELECT btrim('xyxtrimyyx', 'xy')
-- !query schema
struct<btrim(xyxtrimyyx, xy):string>
-- !query output
trim
-- !query
SELECT btrim(encode(" xyz ", 'utf-8'))
-- !query schema
struct<btrim(encode( xyz , utf-8)):string>
-- !query output
xyz
-- !query
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
-- !query schema
struct<btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)):string>
-- !query output
Tom
-- !query
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
-- !query schema
struct<btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)):string>
-- !query output
bar
-- !query
SELECT lpad('hi', 'invalid_length')
-- !query schema