[SPARK-20750][SQL] Built-in SQL Function Support - REPLACE

## What changes were proposed in this pull request?

This PR adds the built-in SQL function `REPLACE(<string_expression>, <search_string> [, <replacement_string>])`.

`REPLACE()` returns the input string with all occurrences of the search string replaced by the given replacement string.

## How was this patch tested?

added new test suites

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #18047 from kiszk/SPARK-20750.
This commit is contained in:
Kazuaki Ishizaki 2017-05-29 11:47:31 -07:00 committed by Xiao Li
parent f9b59abeae
commit ef9fd920c3
6 changed files with 93 additions and 1 deletions

View file

@ -835,6 +835,15 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
return res; return res;
} }
/**
 * Returns a string in which every occurrence of {@code search} has been substituted
 * with {@code replace}.
 *
 * When {@code search} is the empty string, nothing is replaced and this instance is
 * returned as-is.
 *
 * @param search the substring to look for
 * @param replace the text substituted for each occurrence of {@code search}
 * @return the resulting string, or {@code this} when {@code search} is empty
 */
public UTF8String replace(UTF8String search, UTF8String replace) {
  if (!EMPTY_UTF8.equals(search)) {
    // Do the substitution on java.lang.String values, then convert the result back.
    final String source = toString();
    final String needle = search.toString();
    final String substitute = replace.toString();
    return fromString(source.replace(needle, substitute));
  }
  // Empty search pattern: String.replace would otherwise insert the replacement
  // between every character, so hand back this string untouched.
  return this;
}
// TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes // TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes
public UTF8String translate(Map<Character, Character> dict) { public UTF8String translate(Map<Character, Character> dict) {
String srcStr = this.toString(); String srcStr = this.toString();

View file

@ -304,6 +304,7 @@ object FunctionRegistry {
expression[RegExpExtract]("regexp_extract"), expression[RegExpExtract]("regexp_extract"),
expression[RegExpReplace]("regexp_replace"), expression[RegExpReplace]("regexp_replace"),
expression[StringRepeat]("repeat"), expression[StringRepeat]("repeat"),
expression[StringReplace]("replace"),
expression[StringReverse]("reverse"), expression[StringReverse]("reverse"),
expression[RLike]("rlike"), expression[RLike]("rlike"),
expression[StringRPad]("rpad"), expression[StringRPad]("rpad"),

View file

@ -340,6 +340,48 @@ case class EndsWith(left: Expression, right: Expression) extends StringPredicate
} }
} }
/**
 * Expression registered under the name `replace`: substitutes every occurrence of the
 * search string inside the source string with the replacement string.
 *
 * The two-argument constructor supplies an empty string literal as the replacement, so
 * matched occurrences are simply removed.
 */
// scalastyle:off line.size.limit
@ExpressionDescription(
usage = "_FUNC_(str, search[, replace]) - Replaces all occurrences of `search` with `replace`.",
extended = """
Arguments:
str - a string expression
search - a string expression. If `search` is not found in `str`, `str` is returned unchanged.
replace - a string expression. If `replace` is not specified or is an empty string, nothing replaces
the string that is removed from `str`.
Examples:
> SELECT _FUNC_('ABCabc', 'abc', 'DEF');
ABCDEF
""")
// scalastyle:on line.size.limit
case class StringReplace(srcExpr: Expression, searchExpr: Expression, replaceExpr: Expression)
  extends TernaryExpression with ImplicitCastInputTypes {

  // Two-argument form: the replacement defaults to the empty string literal.
  def this(srcExpr: Expression, searchExpr: Expression) =
    this(srcExpr, searchExpr, Literal(""))

  override def prettyName: String = "replace"

  override def dataType: DataType = StringType

  // All three arguments are declared as strings.
  override def inputTypes: Seq[DataType] = StringType :: StringType :: StringType :: Nil

  override def children: Seq[Expression] = Seq(srcExpr, searchExpr, replaceExpr)

  // Casts the three evaluated arguments to UTF8String and delegates to UTF8String.replace.
  override def nullSafeEval(srcEval: Any, searchEval: Any, replaceEval: Any): Any = {
    val source = srcEval.asInstanceOf[UTF8String]
    val needle = searchEval.asInstanceOf[UTF8String]
    val substitute = replaceEval.asInstanceOf[UTF8String]
    source.replace(needle, substitute)
  }

  // Emits a statement that assigns the result of calling `replace` on the source value
  // to the output variable of this expression.
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    nullSafeCodeGen(ctx, ev, (srcCode, searchCode, replaceCode) =>
      s"${ev.value} = $srcCode.replace($searchCode, $replaceCode);")
  }
}
object StringTranslate { object StringTranslate {
def buildDict(matchingString: UTF8String, replaceString: UTF8String) def buildDict(matchingString: UTF8String, replaceString: UTF8String)

View file

@ -372,6 +372,26 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(SoundEx(Literal("!!")), "!!") checkEvaluation(SoundEx(Literal("!!")), "!!")
} }
test("replace") {
  // Literals shared by several of the cases below.
  val word = Literal("replace")
  val pl = Literal("pl")
  val digits = Literal("123")
  val emptyStr = Literal("")
  val nullStr = Literal.create(null, StringType)

  // a single occurrence is replaced
  checkEvaluation(StringReplace(word, pl, digits), "re123ace")
  // an empty replacement removes the matched text
  checkEvaluation(StringReplace(word, pl, emptyStr), "reace")
  // an empty search string leaves the input unchanged
  checkEvaluation(StringReplace(word, emptyStr, digits), "replace")

  // a null in any argument position yields null
  checkEvaluation(StringReplace(nullStr, pl, digits), null)
  checkEvaluation(StringReplace(word, nullStr, digits), null)
  checkEvaluation(StringReplace(word, pl, nullStr), null)

  // test for multiple replace
  checkEvaluation(StringReplace(Literal("abcabc"), Literal("b"), Literal("12")), "a12ca12c")
  checkEvaluation(StringReplace(Literal("abcdabcd"), Literal("bc"), emptyStr), "adad")

  // scalastyle:off
  // non ascii characters are not allowed in the source code, so we disable the scalastyle.
  checkEvaluation(StringReplace(Literal("花花世界"), Literal("花世"), Literal("ab")), "花ab界")
  // scalastyle:on
}
test("translate") { test("translate") {
checkEvaluation( checkEvaluation(
StringTranslate(Literal("translate"), Literal("rnlt"), Literal("123")), "1a2s3ae") StringTranslate(Literal("translate"), Literal("rnlt"), Literal("123")), "1a2s3ae")

View file

@ -8,3 +8,7 @@ select 'a' || 'b' || 'c';
-- Check if catalyst combine nested `Concat`s -- Check if catalyst combine nested `Concat`s
EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col
FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)) t; FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)) t;
-- replace function: first the three-argument form with an explicit replacement string,
-- then the two-argument form with the replacement argument omitted
select replace('abc', 'b', '123');
select replace('abc', 'b');

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite -- Automatically generated by SQLQueryTestSuite
-- Number of queries: 4 -- Number of queries: 6
-- !query 0 -- !query 0
@ -54,3 +54,19 @@ Project [concat(cast(id#xL as string), cast(id#xL as string), cast(id#xL as stri
== Physical Plan == == Physical Plan ==
*Project [concat(cast(id#xL as string), cast(id#xL as string), cast(id#xL as string), cast(id#xL as string)) AS col#x] *Project [concat(cast(id#xL as string), cast(id#xL as string), cast(id#xL as string), cast(id#xL as string)) AS col#x]
+- *Range (0, 10, step=1, splits=2) +- *Range (0, 10, step=1, splits=2)
-- !query 4
select replace('abc', 'b', '123')
-- !query 4 schema
struct<replace(abc, b, 123):string>
-- !query 4 output
a123c
-- !query 5
select replace('abc', 'b')
-- !query 5 schema
struct<replace(abc, b, ):string>
-- !query 5 output
ac