[SPARK-30724][SQL] Support 'LIKE ANY' and 'LIKE ALL' operators

### What changes were proposed in this pull request?

`LIKE ANY/SOME` and `LIKE ALL` operators are mostly used when we are matching a text field with numbers of patterns. For example:

Teradata / Hive 3.0 / Snowflake:
```sql
--like any
select 'foo' LIKE ANY ('%foo%','%bar%');

--like all
select 'foo' LIKE ALL ('%foo%','%bar%');
```
PostgreSQL:
```sql
-- like any
select 'foo' LIKE ANY (array['%foo%','%bar%']);

-- like all
select 'foo' LIKE ALL (array['%foo%','%bar%']);
```

This PR add support these two operators.

More details:
https://docs.teradata.com/reader/756LNiPSFdY~4JcCCcR5Cw/4~AyrPNmDN0Xk4SALLo6aQ
https://issues.apache.org/jira/browse/HIVE-15229
https://docs.snowflake.net/manuals/sql-reference/functions/like_any.html

### Why are the changes needed?

To smoothly migrate SQLs to Spark SQL.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Unit test.

Closes #27477 from wangyum/SPARK-30724.

Authored-by: Yuming Wang <yumwang@ebay.com>
Signed-off-by: Takeshi Yamamuro <yamamuro@apache.org>
This commit is contained in:
Yuming Wang 2020-04-24 22:20:32 +09:00 committed by Takeshi Yamamuro
parent 463c54419b
commit b10263b8e5
7 changed files with 405 additions and 10 deletions

View file

@ -766,6 +766,7 @@ predicate
| NOT? kind=IN '(' expression (',' expression)* ')'
| NOT? kind=IN '(' query ')'
| NOT? kind=RLIKE pattern=valueExpression
| NOT? kind=LIKE quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')')
| NOT? kind=LIKE pattern=valueExpression (ESCAPE escapeChar=STRING)?
| IS NOT? kind=NULL
| IS NOT? kind=(TRUE | FALSE | UNKNOWN)

View file

@ -1373,7 +1373,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* Add a predicate to the given expression. Supported expressions are:
* - (NOT) BETWEEN
* - (NOT) IN
* - (NOT) LIKE
* - (NOT) LIKE (ANY | SOME | ALL)
* - (NOT) RLIKE
* - IS (NOT) NULL.
* - IS (NOT) (TRUE | FALSE | UNKNOWN)
@ -1391,6 +1391,14 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
case other => Seq(other)
}
def getLikeQuantifierExprs(expressions: java.util.List[ExpressionContext]): Seq[Expression] = {
if (expressions.isEmpty) {
throw new ParseException("Expected something between '(' and ')'.", ctx)
} else {
expressions.asScala.map(expression).map(p => invertIfNotDefined(new Like(e, p)))
}
}
// Create the predicate.
ctx.kind.getType match {
case SqlBaseParser.BETWEEN =>
@ -1403,14 +1411,21 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
case SqlBaseParser.IN =>
invertIfNotDefined(In(e, ctx.expression.asScala.map(expression)))
case SqlBaseParser.LIKE =>
val escapeChar = Option(ctx.escapeChar).map(string).map { str =>
if (str.length != 1) {
throw new ParseException("Invalid escape string." +
"Escape string must contains only one character.", ctx)
}
str.charAt(0)
}.getOrElse('\\')
invertIfNotDefined(Like(e, expression(ctx.pattern), escapeChar))
Option(ctx.quantifier).map(_.getType) match {
case Some(SqlBaseParser.ANY) | Some(SqlBaseParser.SOME) =>
getLikeQuantifierExprs(ctx.expression).reduceLeft(Or)
case Some(SqlBaseParser.ALL) =>
getLikeQuantifierExprs(ctx.expression).reduceLeft(And)
case _ =>
val escapeChar = Option(ctx.escapeChar).map(string).map { str =>
if (str.length != 1) {
throw new ParseException("Invalid escape string." +
"Escape string must contain only one character.", ctx)
}
str.charAt(0)
}.getOrElse('\\')
invertIfNotDefined(Like(e, expression(ctx.pattern), escapeChar))
}
case SqlBaseParser.RLIKE =>
invertIfNotDefined(RLike(e, expression(ctx.pattern)))
case SqlBaseParser.NULL if ctx.NOT != null =>

View file

@ -188,7 +188,7 @@ class ExpressionParserSuite extends AnalysisTest {
}
test("like escape expressions") {
val message = "Escape string must contains only one character."
val message = "Escape string must contain only one character."
assertEqual("a like 'pattern%' escape '#'", 'a.like("pattern%", '#'))
assertEqual("a like 'pattern%' escape '\"'", 'a.like("pattern%", '\"'))
intercept("a like 'pattern%' escape '##'", message)
@ -208,6 +208,21 @@ class ExpressionParserSuite extends AnalysisTest {
assertEqual("a rlike 'pattern\\t\\n'", 'a rlike "pattern\\t\\n", parser)
}
test("(NOT) LIKE (ANY | SOME | ALL) expressions") {
Seq("any", "some").foreach { quantifier =>
assertEqual(s"a like $quantifier ('foo%', 'b%')", ('a like "foo%") || ('a like "b%"))
assertEqual(s"a not like $quantifier ('foo%', 'b%')", !('a like "foo%") || !('a like "b%"))
assertEqual(s"not (a like $quantifier ('foo%', 'b%'))", !(('a like "foo%") || ('a like "b%")))
}
assertEqual("a like all ('foo%', 'b%')", ('a like "foo%") && ('a like "b%"))
assertEqual("a not like all ('foo%', 'b%')", !('a like "foo%") && !('a like "b%"))
assertEqual("not (a like all ('foo%', 'b%'))", !(('a like "foo%") && ('a like "b%")))
Seq("any", "some", "all").foreach { quantifier =>
intercept(s"a like $quantifier()", "Expected something between '(' and ')'")
}
}
test("is null expressions") {
assertEqual("a is null", 'a.isNull)
assertEqual("a is not null", 'a.isNotNull)

View file

@ -0,0 +1,39 @@
CREATE OR REPLACE TEMPORARY VIEW like_all_table AS SELECT * FROM (VALUES
('google', '%oo%'),
('facebook', '%oo%'),
('linkedin', '%in'))
as t1(company, pat);
SELECT company FROM like_all_table WHERE company LIKE ALL ('%oo%', '%go%');
SELECT company FROM like_all_table WHERE company LIKE ALL ('microsoft', '%yoo%');
SELECT
company,
CASE
WHEN company LIKE ALL ('%oo%', '%go%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company LIKE ALL ('%oo%', 'go%') OR company LIKE ALL ('%in', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM like_all_table ;
-- Mix test with constant pattern and column value
SELECT company FROM like_all_table WHERE company LIKE ALL ('%oo%', pat);
-- not like all test
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('%oo%', '%in', 'fa%');
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('microsoft', '%yoo%');
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('%oo%', 'fa%');
SELECT company FROM like_all_table WHERE NOT company LIKE ALL ('%oo%', 'fa%');
-- null test
SELECT company FROM like_all_table WHERE company LIKE ALL ('%oo%', NULL);
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('%oo%', NULL);
SELECT company FROM like_all_table WHERE company LIKE ALL (NULL, NULL);
SELECT company FROM like_all_table WHERE company NOT LIKE ALL (NULL, NULL);
-- negative case
SELECT company FROM like_any_table WHERE company LIKE ALL ();

View file

@ -0,0 +1,39 @@
CREATE OR REPLACE TEMPORARY VIEW like_any_table AS SELECT * FROM (VALUES
('google', '%oo%'),
('facebook', '%oo%'),
('linkedin', '%in'))
as t1(company, pat);
SELECT company FROM like_any_table WHERE company LIKE ANY ('%oo%', '%in', 'fa%');
SELECT company FROM like_any_table WHERE company LIKE ANY ('microsoft', '%yoo%');
select
company,
CASE
WHEN company LIKE ANY ('%oo%', '%in', 'fa%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company LIKE ANY ('%oo%', 'fa%') OR company LIKE ANY ('%in', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM like_any_table;
-- Mix test with constant pattern and column value
SELECT company FROM like_any_table WHERE company LIKE ANY ('%zz%', pat);
-- not like any test
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('%oo%', '%in', 'fa%');
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('microsoft', '%yoo%');
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('%oo%', 'fa%');
SELECT company FROM like_any_table WHERE NOT company LIKE ANY ('%oo%', 'fa%');
-- null test
SELECT company FROM like_any_table WHERE company LIKE ANY ('%oo%', NULL);
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('%oo%', NULL);
SELECT company FROM like_any_table WHERE company LIKE ANY (NULL, NULL);
SELECT company FROM like_any_table WHERE company NOT LIKE ANY (NULL, NULL);
-- negative case
SELECT company FROM like_any_table WHERE company LIKE ANY ();

View file

@ -0,0 +1,140 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 14
-- !query
CREATE OR REPLACE TEMPORARY VIEW like_all_table AS SELECT * FROM (VALUES
('google', '%oo%'),
('facebook', '%oo%'),
('linkedin', '%in'))
as t1(company, pat)
-- !query schema
struct<>
-- !query output
-- !query
SELECT company FROM like_all_table WHERE company LIKE ALL ('%oo%', '%go%')
-- !query schema
struct<company:string>
-- !query output
google
-- !query
SELECT company FROM like_all_table WHERE company LIKE ALL ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT
company,
CASE
WHEN company LIKE ALL ('%oo%', '%go%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company LIKE ALL ('%oo%', 'go%') OR company LIKE ALL ('%in', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM like_all_table
-- !query schema
struct<company:string,is_available:string,mix:string>
-- !query output
facebook N N
google Y Y
linkedin N N
-- !query
SELECT company FROM like_all_table WHERE company LIKE ALL ('%oo%', pat)
-- !query schema
struct<company:string>
-- !query output
facebook
google
-- !query
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('%oo%', '%in', 'fa%')
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
facebook
google
linkedin
-- !query
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('%oo%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
linkedin
-- !query
SELECT company FROM like_all_table WHERE NOT company LIKE ALL ('%oo%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
google
linkedin
-- !query
SELECT company FROM like_all_table WHERE company LIKE ALL ('%oo%', NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_all_table WHERE company NOT LIKE ALL ('%oo%', NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_all_table WHERE company LIKE ALL (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_all_table WHERE company NOT LIKE ALL (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_any_table WHERE company LIKE ALL ()
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.parser.ParseException
Expected something between '(' and ')'.(line 1, pos 49)
== SQL ==
SELECT company FROM like_any_table WHERE company LIKE ALL ()
-------------------------------------------------^^^

View file

@ -0,0 +1,146 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 14
-- !query
CREATE OR REPLACE TEMPORARY VIEW like_any_table AS SELECT * FROM (VALUES
('google', '%oo%'),
('facebook', '%oo%'),
('linkedin', '%in'))
as t1(company, pat)
-- !query schema
struct<>
-- !query output
-- !query
SELECT company FROM like_any_table WHERE company LIKE ANY ('%oo%', '%in', 'fa%')
-- !query schema
struct<company:string>
-- !query output
facebook
google
linkedin
-- !query
SELECT company FROM like_any_table WHERE company LIKE ANY ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
-- !query
select
company,
CASE
WHEN company LIKE ANY ('%oo%', '%in', 'fa%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company LIKE ANY ('%oo%', 'fa%') OR company LIKE ANY ('%in', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM like_any_table
-- !query schema
struct<company:string,is_available:string,mix:string>
-- !query output
facebook Y Y
google Y Y
linkedin Y Y
-- !query
SELECT company FROM like_any_table WHERE company LIKE ANY ('%zz%', pat)
-- !query schema
struct<company:string>
-- !query output
facebook
google
linkedin
-- !query
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('%oo%', '%in', 'fa%')
-- !query schema
struct<company:string>
-- !query output
facebook
google
linkedin
-- !query
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
facebook
google
linkedin
-- !query
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('%oo%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
google
linkedin
-- !query
SELECT company FROM like_any_table WHERE NOT company LIKE ANY ('%oo%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
linkedin
-- !query
SELECT company FROM like_any_table WHERE company LIKE ANY ('%oo%', NULL)
-- !query schema
struct<company:string>
-- !query output
facebook
google
-- !query
SELECT company FROM like_any_table WHERE company NOT LIKE ANY ('%oo%', NULL)
-- !query schema
struct<company:string>
-- !query output
linkedin
-- !query
SELECT company FROM like_any_table WHERE company LIKE ANY (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_any_table WHERE company NOT LIKE ANY (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM like_any_table WHERE company LIKE ANY ()
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.parser.ParseException
Expected something between '(' and ')'.(line 1, pos 49)
== SQL ==
SELECT company FROM like_any_table WHERE company LIKE ANY ()
-------------------------------------------------^^^