[SPARK-28123][SQL] String Functions: support btrim

### What changes were proposed in this pull request? Spark currently supports `trim`/`ltrim`/`rtrim`. The function `btrim` is an alternate form of `TRIM(BOTH <chars> FROM <expr>)`. `btrim` removes the longest string consisting only of the specified characters from the start and end of a string. The mainstream databases that support this feature are shown below: **PostgreSQL** https://www.postgresql.org/docs/11/functions-binarystring.html **Vertica** https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/SQLReferenceManual/Functions/String/BTRIM.htm?tocpath=SQL%20Reference%20Manual%7CSQL%20Functions%7CString%20Functions%7C_____5 **Redshift** https://docs.aws.amazon.com/redshift/latest/dg/r_BTRIM.html **Druid** https://druid.apache.org/docs/latest/querying/sql.html#string-functions **Greenplum** http://docs.greenplum.org/6-8/ref_guide/function-summary.html ### Why are the changes needed? btrim is very useful. ### Does this PR introduce _any_ user-facing change? Yes. btrim is a new function. ### How was this patch tested? Jenkins test. Closes #31390 from beliefer/SPARK-28123-support-btrim. Authored-by: gengjiaan <gengjiaan@360.cn> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-02-19 13:28:49 +00:00 · 2021-02-19 13:28:49 +00:00 · 06df1210d4
parent 27abb6ab56
commit 06df1210d4
8 changed files with 151 additions and 8 deletions
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@ -375,6 +375,7 @@ object FunctionRegistry {
    expression[SubstringIndex]("substring_index"),
    expression[StringTranslate]("translate"),
    expression[StringTrim]("trim"),
+    expression[StringTrimBoth]("btrim"),
    expression[Upper]("ucase", true),
    expression[UnBase64]("unbase64"),
    expression[Unhex]("unhex"),
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@ -920,6 +920,54 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None)
  override val trimMethod: String = "trim"
 }

+/**
+ * Implements the SQL function `btrim`: strips from both ends of the source string every leading
+ * and trailing character that occurs in the trim string, and returns the resulting string.
+ * trimStr: the set of characters to strip; when it holds several characters, each one is matched
+ * individually, and stripping stops at the first character that is not in the set.
+ * Both auxiliary constructors build the matching [[StringTrim]] and pass it as `child`.
+ */
+@ExpressionDescription(
+  usage = """
+    _FUNC_(str) - Removes the leading and trailing space characters from `str`.
+
+    _FUNC_(str, trimStr) - Remove the leading and trailing `trimStr` characters from `str`.
+  """,
+  arguments = """
+    Arguments:
+      * str - a string expression
+      * trimStr - the trim string characters to trim, the default value is a single space
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('    SparkSQL   ');
+       SparkSQL
+      > SELECT _FUNC_(encode('    SparkSQL   ', 'utf-8'));
+       SparkSQL
+      > SELECT _FUNC_('SSparkSQLS', 'SL');
+       parkSQ
+      > SELECT _FUNC_(encode('SSparkSQLS', 'utf-8'), encode('SL', 'utf-8'));
+       parkSQ
+  """,
+  since = "3.2.0",
+  group = "string_funcs")
+case class StringTrimBoth(srcStr: Expression, trimStr: Option[Expression], child: Expression)
+  extends RuntimeReplaceable {
+
+  def this(srcStr: Expression, trimStr: Expression) = {
+    this(srcStr, Option(trimStr), StringTrim(srcStr, trimStr))
+  }
+
+  def this(srcStr: Expression) = {
+    this(srcStr, None, StringTrim(srcStr))
+  }
+
+  override def exprsReplaced: Seq[Expression] = srcStr +: trimStr.toSeq
+  override def flatArguments: Iterator[Any] = Iterator(srcStr, trimStr)
+
+  override def prettyName: String = "btrim"
+}
+
 object StringTrimLeft {
  def apply(str: Expression, trimStr: Expression): StringTrimLeft =
    StringTrimLeft(str, Some(trimStr))
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@ -1,6 +1,6 @@
 <!-- Automatically generated by ExpressionsSchemaSuite -->
 ## Summary
-  - Number of queries: 350
+  - Number of queries: 351
  - Number of expressions that missing example: 13
  - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window
 ## Schema of Built-in Functions
@ -269,6 +269,7 @@
 | org.apache.spark.sql.catalyst.expressions.StringToMap | str_to_map | SELECT str_to_map('a:1,b:2,c:3', ',', ':') | struct<str_to_map(a:1,b:2,c:3, ,, :):map<string,string>> |
 | org.apache.spark.sql.catalyst.expressions.StringTranslate | translate | SELECT translate('AaBbCc', 'abc', '123') | struct<translate(AaBbCc, abc, 123):string> |
 | org.apache.spark.sql.catalyst.expressions.StringTrim | trim | SELECT trim('    SparkSQL   ') | struct<trim(    SparkSQL   ):string> |
+| org.apache.spark.sql.catalyst.expressions.StringTrimBoth | btrim | SELECT btrim('    SparkSQL   ') | struct<btrim(    SparkSQL   ):string> |
 | org.apache.spark.sql.catalyst.expressions.StringTrimLeft | ltrim | SELECT ltrim('    SparkSQL   ') | struct<ltrim(    SparkSQL   ):string> |
 | org.apache.spark.sql.catalyst.expressions.StringTrimRight | rtrim | SELECT rtrim('    SparkSQL   ') | struct<rtrim(    SparkSQL   ):string> |
 | org.apache.spark.sql.catalyst.expressions.StructsToCsv | to_csv | SELECT to_csv(named_struct('a', 1, 'b', 2)) | struct<to_csv(named_struct(a, 1, b, 2)):string> |
--- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql
@ -647,10 +647,9 @@ SELECT repeat('Pg', 4);
 SELECT repeat('Pg', -4);

 SELECT trim(binary('\\000') from binary('\\000Tom\\000'));
-- [SPARK-28123] Add support btrim
-- SELECT btrim(E'\\000trim\\000'::bytea, E'\\000'::bytea);
-- SELECT btrim(''::bytea, E'\\000'::bytea);
-- SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
+SELECT btrim(binary('\\000trim\\000'), binary('\\000'));
+SELECT btrim(binary(''), binary('\\000'));
+SELECT btrim(binary('\\000trim\\000'), binary(''));
 -- [SPARK-28121] decode can not accept 'escape' as charset
 -- [SPARK-28412][SQL] ANSI SQL: OVERLAY function support byte array
 -- SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@ -50,6 +50,12 @@ SELECT trim(TRAILING 'xyz' FROM 'testxxzx');
 SELECT trim(TRAILING 'xyz' FROM 'xyztestxxzx');
 SELECT trim(TRAILING 'xy' FROM 'TURNERyxXxy');

+-- btrim
+SELECT btrim('xyxtrimyyx', 'xy');
+SELECT btrim(encode(" xyz ", 'utf-8'));
+SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'));
+SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'));
+
 -- Check lpad/rpad with invalid length parameter
 SELECT lpad('hi', 'invalid_length');
 SELECT rpad('hi', 'invalid_length');
--- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
-- Number of queries: 44
+-- Number of queries: 48


 -- !query
@ -278,6 +278,38 @@ struct<TRIM(TRAILING xy FROM TURNERyxXxy):string>
 TURNERyxX


+-- !query
+SELECT btrim('xyxtrimyyx', 'xy')
+-- !query schema
+struct<btrim(xyxtrimyyx, xy):string>
+-- !query output
+trim
+
+
+-- !query
+SELECT btrim(encode(" xyz ", 'utf-8'))
+-- !query schema
+struct<btrim(encode( xyz , utf-8)):string>
+-- !query output
+xyz
+
+
+-- !query
+SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
+-- !query schema
+struct<btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)):string>
+-- !query output
+Tom
+
+
+-- !query
+SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
+-- !query schema
+struct<btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)):string>
+-- !query output
+bar
+
+
 -- !query
 SELECT lpad('hi', 'invalid_length')
 -- !query schema
--- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out
@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
-- Number of queries: 121
+-- Number of queries: 124


 -- !query
@ -982,6 +982,30 @@ struct<TRIM(BOTH \000 FROM \000Tom\000):string>
 Tom


+-- !query
+SELECT btrim(binary('\\000trim\\000'), binary('\\000'))
+-- !query schema
+struct<btrim(\000trim\000, \000):string>
+-- !query output
+trim
+
+
+-- !query
+SELECT btrim(binary(''), binary('\\000'))
+-- !query schema
+struct<btrim(, \000):string>
+-- !query output
+
+
+
+-- !query
+SELECT btrim(binary('\\000trim\\000'), binary(''))
+-- !query schema
+struct<btrim(\000trim\000, ):string>
+-- !query output
+\000trim\000
+
+
 -- !query
 DROP TABLE toasttest
 -- !query schema
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
-- Number of queries: 44
+-- Number of queries: 48


 -- !query
@ -276,6 +276,38 @@ struct<TRIM(TRAILING xy FROM TURNERyxXxy):string>
 TURNERyxX


+-- !query
+SELECT btrim('xyxtrimyyx', 'xy')
+-- !query schema
+struct<btrim(xyxtrimyyx, xy):string>
+-- !query output
+trim
+
+
+-- !query
+SELECT btrim(encode(" xyz ", 'utf-8'))
+-- !query schema
+struct<btrim(encode( xyz , utf-8)):string>
+-- !query output
+xyz
+
+
+-- !query
+SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
+-- !query schema
+struct<btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)):string>
+-- !query output
+Tom
+
+
+-- !query
+SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
+-- !query schema
+struct<btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)):string>
+-- !query output
+bar
+
+
 -- !query
 SELECT lpad('hi', 'invalid_length')
 -- !query schema