[SPARK-28123][SQL] String Functions: support btrim
### What changes were proposed in this pull request? Spark currently supports `trim`/`ltrim`/`rtrim`. The function `btrim` is an alternate form of `TRIM(BOTH <chars> FROM <expr>)`. `btrim` removes the longest string consisting only of the specified characters from the start and end of a string. Mainstream databases that support this function are listed below: **PostgreSQL** https://www.postgresql.org/docs/11/functions-binarystring.html **Vertica** https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/SQLReferenceManual/Functions/String/BTRIM.htm?tocpath=SQL%20Reference%20Manual%7CSQL%20Functions%7CString%20Functions%7C_____5 **Redshift** https://docs.aws.amazon.com/redshift/latest/dg/r_BTRIM.html **Druid** https://druid.apache.org/docs/latest/querying/sql.html#string-functions **Greenplum** http://docs.greenplum.org/6-8/ref_guide/function-summary.html ### Why are the changes needed? `btrim` is very useful. ### Does this PR introduce _any_ user-facing change? Yes. `btrim` is a new function. ### How was this patch tested? Jenkins test. Closes #31390 from beliefer/SPARK-28123-support-btrim. Authored-by: gengjiaan <gengjiaan@360.cn> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
27abb6ab56
commit
06df1210d4
|
@ -375,6 +375,7 @@ object FunctionRegistry {
|
|||
expression[SubstringIndex]("substring_index"),
|
||||
expression[StringTranslate]("translate"),
|
||||
expression[StringTrim]("trim"),
|
||||
expression[StringTrimBoth]("btrim"),
|
||||
expression[Upper]("ucase", true),
|
||||
expression[UnBase64]("unbase64"),
|
||||
expression[Unhex]("unhex"),
|
||||
|
|
|
@ -920,6 +920,54 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None)
|
|||
override val trimMethod: String = "trim"
|
||||
}
|
||||
|
||||
/**
|
||||
* A function that takes a character string, removes the leading and trailing characters matching
|
||||
* with any character in the trim string, returns the new string.
|
||||
* trimStr: A character string to be trimmed from the source string, if it has multiple characters,
|
||||
* the function searches for each character in the source string, removes the characters from the
|
||||
* source string until it encounters the first non-match character.
|
||||
*/
|
||||
@ExpressionDescription(
|
||||
usage = """
|
||||
_FUNC_(str) - Removes the leading and trailing space characters from `str`.
|
||||
|
||||
_FUNC_(str, trimStr) - Remove the leading and trailing `trimStr` characters from `str`.
|
||||
""",
|
||||
arguments = """
|
||||
Arguments:
|
||||
* str - a string expression
|
||||
* trimStr - the trim string characters to trim, the default value is a single space
|
||||
""",
|
||||
examples = """
|
||||
Examples:
|
||||
> SELECT _FUNC_(' SparkSQL ');
|
||||
SparkSQL
|
||||
> SELECT _FUNC_(encode(' SparkSQL ', 'utf-8'));
|
||||
SparkSQL
|
||||
> SELECT _FUNC_('SSparkSQLS', 'SL');
|
||||
parkSQ
|
||||
> SELECT _FUNC_(encode('SSparkSQLS', 'utf-8'), encode('SL', 'utf-8'));
|
||||
parkSQ
|
||||
""",
|
||||
since = "3.2.0",
|
||||
group = "string_funcs")
|
||||
case class StringTrimBoth(srcStr: Expression, trimStr: Option[Expression], child: Expression)
|
||||
extends RuntimeReplaceable { // `child` is the StringTrim expression built by the auxiliary constructors below; NOTE(review): presumably RuntimeReplaceable evaluates `child` in place of this node - confirm
|
||||
|
||||
def this(srcStr: Expression, trimStr: Expression) = { // two-argument form: btrim(str, trimStr)
|
||||
this(srcStr, Option(trimStr), StringTrim(srcStr, trimStr)) // keep the original arguments and delegate the trimming to StringTrim(srcStr, trimStr)
|
||||
}
|
||||
|
||||
def this(srcStr: Expression) = { // one-argument form: btrim(str)
|
||||
this(srcStr, None, StringTrim(srcStr)) // no trim string supplied; delegate to the one-argument StringTrim
|
||||
}
|
||||
|
||||
override def exprsReplaced: Seq[Expression] = srcStr +: trimStr.toSeq // srcStr, followed by trimStr only when it is defined
|
||||
override def flatArguments: Iterator[Any] = Iterator(srcStr, trimStr) // exposes the original arguments (srcStr and the trimStr Option), not `child`
|
||||
|
||||
override def prettyName: String = "btrim" // same name the function is registered under in FunctionRegistry
|
||||
}
|
||||
|
||||
object StringTrimLeft {
|
||||
def apply(str: Expression, trimStr: Expression): StringTrimLeft =
|
||||
StringTrimLeft(str, Some(trimStr))
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<!-- Automatically generated by ExpressionsSchemaSuite -->
|
||||
## Summary
|
||||
- Number of queries: 350
|
||||
- Number of queries: 351
|
||||
- Number of expressions that missing example: 13
|
||||
- Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window
|
||||
## Schema of Built-in Functions
|
||||
|
@ -269,6 +269,7 @@
|
|||
| org.apache.spark.sql.catalyst.expressions.StringToMap | str_to_map | SELECT str_to_map('a:1,b:2,c:3', ',', ':') | struct<str_to_map(a:1,b:2,c:3, ,, :):map<string,string>> |
|
||||
| org.apache.spark.sql.catalyst.expressions.StringTranslate | translate | SELECT translate('AaBbCc', 'abc', '123') | struct<translate(AaBbCc, abc, 123):string> |
|
||||
| org.apache.spark.sql.catalyst.expressions.StringTrim | trim | SELECT trim(' SparkSQL ') | struct<trim( SparkSQL ):string> |
|
||||
| org.apache.spark.sql.catalyst.expressions.StringTrimBoth | btrim | SELECT btrim(' SparkSQL ') | struct<btrim( SparkSQL ):string> |
|
||||
| org.apache.spark.sql.catalyst.expressions.StringTrimLeft | ltrim | SELECT ltrim(' SparkSQL ') | struct<ltrim( SparkSQL ):string> |
|
||||
| org.apache.spark.sql.catalyst.expressions.StringTrimRight | rtrim | SELECT rtrim(' SparkSQL ') | struct<rtrim( SparkSQL ):string> |
|
||||
| org.apache.spark.sql.catalyst.expressions.StructsToCsv | to_csv | SELECT to_csv(named_struct('a', 1, 'b', 2)) | struct<to_csv(named_struct(a, 1, b, 2)):string> |
|
||||
|
|
|
@ -647,10 +647,9 @@ SELECT repeat('Pg', 4);
|
|||
SELECT repeat('Pg', -4);
|
||||
|
||||
SELECT trim(binary('\\000') from binary('\\000Tom\\000'));
|
||||
-- [SPARK-28123] Add support btrim
|
||||
-- SELECT btrim(E'\\000trim\\000'::bytea, E'\\000'::bytea);
|
||||
-- SELECT btrim(''::bytea, E'\\000'::bytea);
|
||||
-- SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
|
||||
SELECT btrim(binary('\\000trim\\000'), binary('\\000'));
|
||||
SELECT btrim(binary(''), binary('\\000'));
|
||||
SELECT btrim(binary('\\000trim\\000'), binary(''));
|
||||
-- [SPARK-28121] decode can not accept 'escape' as charset
|
||||
-- [SPARK-28412][SQL] ANSI SQL: OVERLAY function support byte array
|
||||
-- SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
|
||||
|
|
|
@ -50,6 +50,12 @@ SELECT trim(TRAILING 'xyz' FROM 'testxxzx');
|
|||
SELECT trim(TRAILING 'xyz' FROM 'xyztestxxzx');
|
||||
SELECT trim(TRAILING 'xy' FROM 'TURNERyxXxy');
|
||||
|
||||
-- btrim
|
||||
SELECT btrim('xyxtrimyyx', 'xy');
|
||||
SELECT btrim(encode(" xyz ", 'utf-8'));
|
||||
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'));
|
||||
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'));
|
||||
|
||||
-- Check lpad/rpad with invalid length parameter
|
||||
SELECT lpad('hi', 'invalid_length');
|
||||
SELECT rpad('hi', 'invalid_length');
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 44
|
||||
-- Number of queries: 48
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -278,6 +278,38 @@ struct<TRIM(TRAILING xy FROM TURNERyxXxy):string>
|
|||
TURNERyxX
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim('xyxtrimyyx', 'xy')
|
||||
-- !query schema
|
||||
struct<btrim(xyxtrimyyx, xy):string>
|
||||
-- !query output
|
||||
trim
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(encode(" xyz ", 'utf-8'))
|
||||
-- !query schema
|
||||
struct<btrim(encode( xyz , utf-8)):string>
|
||||
-- !query output
|
||||
xyz
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
|
||||
-- !query schema
|
||||
struct<btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)):string>
|
||||
-- !query output
|
||||
Tom
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
|
||||
-- !query schema
|
||||
struct<btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)):string>
|
||||
-- !query output
|
||||
bar
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT lpad('hi', 'invalid_length')
|
||||
-- !query schema
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 121
|
||||
-- Number of queries: 124
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -982,6 +982,30 @@ struct<TRIM(BOTH \000 FROM \000Tom\000):string>
|
|||
Tom
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(binary('\\000trim\\000'), binary('\\000'))
|
||||
-- !query schema
|
||||
struct<btrim(\000trim\000, \000):string>
|
||||
-- !query output
|
||||
trim
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(binary(''), binary('\\000'))
|
||||
-- !query schema
|
||||
struct<btrim(, \000):string>
|
||||
-- !query output
|
||||
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(binary('\\000trim\\000'), binary(''))
|
||||
-- !query schema
|
||||
struct<btrim(\000trim\000, ):string>
|
||||
-- !query output
|
||||
\000trim\000
|
||||
|
||||
|
||||
-- !query
|
||||
DROP TABLE toasttest
|
||||
-- !query schema
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 44
|
||||
-- Number of queries: 48
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -276,6 +276,38 @@ struct<TRIM(TRAILING xy FROM TURNERyxXxy):string>
|
|||
TURNERyxX
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim('xyxtrimyyx', 'xy')
|
||||
-- !query schema
|
||||
struct<btrim(xyxtrimyyx, xy):string>
|
||||
-- !query output
|
||||
trim
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(encode(" xyz ", 'utf-8'))
|
||||
-- !query schema
|
||||
struct<btrim(encode( xyz , utf-8)):string>
|
||||
-- !query output
|
||||
xyz
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
|
||||
-- !query schema
|
||||
struct<btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)):string>
|
||||
-- !query output
|
||||
Tom
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
|
||||
-- !query schema
|
||||
struct<btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)):string>
|
||||
-- !query output
|
||||
bar
|
||||
|
||||
|
||||
-- !query
|
||||
SELECT lpad('hi', 'invalid_length')
|
||||
-- !query schema
|
||||
|
|
Loading…
Reference in a new issue