[SPARK-36736][SQL] Support ILIKE (ALL | ANY | SOME) - case insensitive LIKE

### What changes were proposed in this pull request?
In the PR, I propose to support a case-insensitive variant of the `LIKE (ALL | ANY | SOME)` expression - `ILIKE`. In this way, Spark's users can match strings against multiple patterns in a case-insensitive manner. For example:
```sql
spark-sql> create table ilike_example(subject varchar(20));
spark-sql> insert into ilike_example values
         > ('jane doe'),
         > ('Jane Doe'),
         > ('JANE DOE'),
         > ('John Doe'),
         > ('John Smith');
spark-sql> select *
         > from ilike_example
         > where subject ilike any ('jane%', '%SMITH')
         > order by subject;
JANE DOE
Jane Doe
John Smith
jane doe
```

The syntax of `ILIKE` is similar to `LIKE`:
```
str NOT? ILIKE (ANY | SOME | ALL) (pattern+)
```

### Why are the changes needed?
1. To improve user experience with Spark SQL. No need to use `lower(col_name)` in where clauses.
2. To make migration from other popular DBMSs to Spark SQL easier. The DBMSs below support `ilike` in SQL:
    - [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/ilike.html#ilike)
    - [PostgreSQL](https://www.postgresql.org/docs/12/functions-matching.html)
    - [CockroachDB](https://www.cockroachlabs.com/docs/stable/functions-and-operators.html)

### Does this PR introduce _any_ user-facing change?
No, it doesn't. The PR **extends** existing APIs.

### How was this patch tested?
1. By running the expression examples via:
```
$ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
2. By adding a new test that checks parsing of `ILIKE`:
```
$ build/sbt "test:testOnly *.ExpressionParserSuite"
```
3. Via existing test suites:
```
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z ilike-any.sql"
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z ilike-all.sql"
```

Closes #33966 from MaxGekk/ilike-any.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
Max Gekk 2021-09-13 22:51:49 +08:00 committed by Wenchen Fan
parent e858cd568a
commit bd62ad9982
9 changed files with 414 additions and 14 deletions

View file

@ -797,7 +797,7 @@ predicate
| NOT? kind=IN '(' expression (',' expression)* ')' | NOT? kind=IN '(' expression (',' expression)* ')'
| NOT? kind=IN '(' query ')' | NOT? kind=IN '(' query ')'
| NOT? kind=RLIKE pattern=valueExpression | NOT? kind=RLIKE pattern=valueExpression
| NOT? kind=LIKE quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')') | NOT? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) ('('')' | '(' expression (',' expression)* ')')
| NOT? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=STRING)? | NOT? kind=(LIKE | ILIKE) pattern=valueExpression (ESCAPE escapeChar=STRING)?
| IS NOT? kind=NULL | IS NOT? kind=NULL
| IS NOT? kind=(TRUE | FALSE | UNKNOWN) | IS NOT? kind=(TRUE | FALSE | UNKNOWN)

View file

@ -1557,7 +1557,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
* Add a predicate to the given expression. Supported expressions are: * Add a predicate to the given expression. Supported expressions are:
* - (NOT) BETWEEN * - (NOT) BETWEEN
* - (NOT) IN * - (NOT) IN
* - (NOT) LIKE (ANY | SOME | ALL) * - (NOT) (LIKE | ILIKE) (ANY | SOME | ALL)
* - (NOT) RLIKE * - (NOT) RLIKE
* - IS (NOT) NULL. * - IS (NOT) NULL.
* - IS (NOT) (TRUE | FALSE | UNKNOWN) * - IS (NOT) (TRUE | FALSE | UNKNOWN)
@ -1575,6 +1575,20 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
case other => Seq(other) case other => Seq(other)
} }
/**
 * Prepares the arguments of a multi-pattern `LIKE`/`ILIKE` predicate.
 *
 * For `ILIKE` the matched expression is wrapped in `Lower` and every constant pattern is
 * lower-cased, so that the case-sensitive multi-pattern expressions can be reused for
 * case-insensitive matching. For any other predicate kind the arguments are returned as is.
 *
 * @param expr the expression being matched against the patterns
 * @param patterns the already evaluated constant patterns; a `null` element is kept as `null`
 * @return the expression and patterns to pass to the multi-pattern expression
 */
def lowerLikeArgsIfNeeded(
    expr: Expression,
    patterns: Seq[UTF8String]): (Expression, Seq[UTF8String]) = ctx.kind.getType match {
  case SqlBaseParser.ILIKE =>
    // The patterns are produced by evaluating constant expressions, so an element may be
    // null when a pattern evaluates to NULL. Keep such elements untouched instead of
    // failing with a NullPointerException while lower-casing.
    // scalastyle:off caselocale
    val loweredPatterns = patterns.map(p => if (p == null) null else p.toLowerCase)
    // scalastyle:on caselocale
    (Lower(expr), loweredPatterns)
  case _ => (expr, patterns)
}
// Builds the single-pattern predicate matching the parsed kind:
// `ILike` for ILIKE and `Like` for everything else.
def getLike(expr: Expression, pattern: Expression): Expression = {
  val caseInsensitive = ctx.kind.getType == SqlBaseParser.ILIKE
  if (caseInsensitive) new ILike(expr, pattern) else new Like(expr, pattern)
}
// Create the predicate. // Create the predicate.
ctx.kind.getType match { ctx.kind.getType match {
case SqlBaseParser.BETWEEN => case SqlBaseParser.BETWEEN =>
@ -1595,13 +1609,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
// If there are many pattern expressions, will throw StackOverflowError. // If there are many pattern expressions, will throw StackOverflowError.
// So we use LikeAny or NotLikeAny instead. // So we use LikeAny or NotLikeAny instead.
val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String])
val (expr, pat) = lowerLikeArgsIfNeeded(e, patterns)
ctx.NOT match { ctx.NOT match {
case null => LikeAny(e, patterns) case null => LikeAny(expr, pat)
case _ => NotLikeAny(e, patterns) case _ => NotLikeAny(expr, pat)
} }
} else { } else {
ctx.expression.asScala.map(expression) ctx.expression.asScala.map(expression)
.map(p => invertIfNotDefined(new Like(e, p))).toSeq.reduceLeft(Or) .map(p => invertIfNotDefined(getLike(e, p))).toSeq.reduceLeft(Or)
} }
case Some(SqlBaseParser.ALL) => case Some(SqlBaseParser.ALL) =>
validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx)
@ -1610,13 +1625,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
// If there are many pattern expressions, will throw StackOverflowError. // If there are many pattern expressions, will throw StackOverflowError.
// So we use LikeAll or NotLikeAll instead. // So we use LikeAll or NotLikeAll instead.
val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String])
val (expr, pat) = lowerLikeArgsIfNeeded(e, patterns)
ctx.NOT match { ctx.NOT match {
case null => LikeAll(e, patterns) case null => LikeAll(expr, pat)
case _ => NotLikeAll(e, patterns) case _ => NotLikeAll(expr, pat)
} }
} else { } else {
ctx.expression.asScala.map(expression) ctx.expression.asScala.map(expression)
.map(p => invertIfNotDefined(new Like(e, p))).toSeq.reduceLeft(And) .map(p => invertIfNotDefined(getLike(e, p))).toSeq.reduceLeft(And)
} }
case _ => case _ =>
val escapeChar = Option(ctx.escapeChar).map(string).map { str => val escapeChar = Option(ctx.escapeChar).map(string).map { str =>
@ -1625,9 +1641,10 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
} }
str.charAt(0) str.charAt(0)
}.getOrElse('\\') }.getOrElse('\\')
val likeExpr = if (ctx.kind.getType == SqlBaseParser.ILIKE) { val likeExpr = ctx.kind.getType match {
new ILike(e, expression(ctx.pattern), escapeChar) case SqlBaseParser.ILIKE => new ILike(e, expression(ctx.pattern), escapeChar)
} else Like(e, expression(ctx.pattern), escapeChar) case _ => Like(e, expression(ctx.pattern), escapeChar)
}
invertIfNotDefined(likeExpr) invertIfNotDefined(likeExpr)
} }
case SqlBaseParser.RLIKE => case SqlBaseParser.RLIKE =>

View file

@ -938,4 +938,19 @@ class ExpressionParserSuite extends AnalysisTest {
assertEqual("current_timestamp", UnresolvedAttribute.quoted("current_timestamp")) assertEqual("current_timestamp", UnresolvedAttribute.quoted("current_timestamp"))
} }
} }
test("SPARK-36736: (NOT) ILIKE (ANY | SOME | ALL) expressions") {
  // A quantified ILIKE over constant patterns is expected to parse into the multi-pattern
  // LIKE expressions applied to lower(a), with every pattern lower-cased.
  val emptyListError = "Expected something between '(' and ')'"

  // ANY and SOME are synonyms and must produce the same expressions.
  for (quantifier <- Seq("any", "some")) {
    assertEqual(
      s"a ilike $quantifier ('FOO%', 'b%')",
      lower($"a").likeAny("foo%", "b%"))
    assertEqual(
      s"a not ilike $quantifier ('foo%', 'B%')",
      lower($"a").notLikeAny("foo%", "b%"))
    assertEqual(
      s"not (a ilike $quantifier ('FOO%', 'B%'))",
      !(lower($"a").likeAny("foo%", "b%")))
  }

  assertEqual("a ilike all ('Foo%', 'b%')", lower($"a").likeAll("foo%", "b%"))
  assertEqual("a not ilike all ('foo%', 'B%')", lower($"a").notLikeAll("foo%", "b%"))
  assertEqual("not (a ilike all ('foO%', 'b%'))", !(lower($"a").likeAll("foo%", "b%")))

  // An empty pattern list is rejected for every quantifier.
  for (quantifier <- Seq("any", "some", "all")) {
    intercept(s"a ilike $quantifier()", emptyListError)
  }
}
} }

View file

@ -0,0 +1,41 @@
-- Test cases for ILIKE ALL: case-insensitive LIKE that must match every pattern in the list.
-- The company names and the patterns deliberately mix upper and lower case.
CREATE OR REPLACE TEMPORARY VIEW ilike_all_table AS SELECT * FROM (VALUES
('gOOgle', '%oo%'),
('facebook', '%OO%'),
('liNkedin', '%In'))
as t1(company, pat);
-- Constant patterns only, used as a filter
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oO%', '%Go%');
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('microsoft', '%yoo%');
-- ILIKE ALL inside CASE WHEN, alone and combined with OR
SELECT
company,
CASE
WHEN company ILIKE ALL ('%oo%', '%GO%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ALL ('%OO%', 'go%') OR company ILIKE ALL ('%IN', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_all_table ;
-- Mix test with constant pattern and column value
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oo%', pat);
-- NOT ILIKE ALL test, including the prefix form NOT (company ILIKE ALL ...)
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', '%In', 'Fa%');
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('microsoft', '%yoo%');
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', 'fA%');
SELECT company FROM ilike_all_table WHERE NOT company ILIKE ALL ('%oO%', 'fa%');
-- NULL test: NULL mixed with a constant pattern, and NULL-only pattern lists
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%OO%', NULL);
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%Oo%', NULL);
SELECT company FROM ilike_all_table WHERE company ILIKE ALL (NULL, NULL);
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL (NULL, NULL);
-- Negative case: an empty pattern list is expected to fail at parse time.
-- NOTE(review): the query names ilike_any_table rather than ilike_all_table. The parse error
-- is raised before the table is looked up, so the result is unaffected, but renaming it
-- requires regenerating the expected output that records the query text.
SELECT company FROM ilike_any_table WHERE company ILIKE ALL ();

View file

@ -0,0 +1,41 @@
-- Test cases for ILIKE ANY: case-insensitive LIKE that must match at least one pattern in the list.
-- The company names and the patterns deliberately mix upper and lower case.
CREATE OR REPLACE TEMPORARY VIEW ilike_any_table AS SELECT * FROM (VALUES
('Google', '%Oo%'),
('FaceBook', '%oO%'),
('linkedIn', '%IN'))
as t1(company, pat);
-- Constant patterns only, used as a filter
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%oo%', '%IN', 'fA%');
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('microsoft', '%yoo%');
-- ILIKE ANY inside CASE WHEN, alone and combined with OR
select
company,
CASE
WHEN company ILIKE ANY ('%oO%', '%IN', 'Fa%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ANY ('%OO%', 'fa%') OR company ILIKE ANY ('%in', 'MS%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_any_table;
-- Mix test with constant pattern and column value
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%zZ%', pat);
-- NOT ILIKE ANY test, including the prefix form NOT (company ILIKE ANY ...)
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oO%', '%iN', 'fa%');
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('microsoft', '%yOo%');
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oo%', 'Fa%');
SELECT company FROM ilike_any_table WHERE NOT company ILIKE ANY ('%OO%', 'fa%');
-- NULL test: NULL mixed with a constant pattern, and NULL-only pattern lists
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%oO%', NULL);
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oo%', NULL);
SELECT company FROM ilike_any_table WHERE company ILIKE ANY (NULL, NULL);
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY (NULL, NULL);
-- Negative case: an empty pattern list is expected to fail at parse time
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ();

View file

@ -38,4 +38,4 @@ SELECT company FROM like_all_table WHERE company LIKE ALL (NULL, NULL);
SELECT company FROM like_all_table WHERE company NOT LIKE ALL (NULL, NULL); SELECT company FROM like_all_table WHERE company NOT LIKE ALL (NULL, NULL);
-- negative case -- negative case
SELECT company FROM like_any_table WHERE company LIKE ALL (); SELECT company FROM like_all_table WHERE company LIKE ALL ();

View file

@ -0,0 +1,140 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 14
-- !query
CREATE OR REPLACE TEMPORARY VIEW ilike_all_table AS SELECT * FROM (VALUES
('gOOgle', '%oo%'),
('facebook', '%OO%'),
('liNkedin', '%In'))
as t1(company, pat)
-- !query schema
struct<>
-- !query output
-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oO%', '%Go%')
-- !query schema
struct<company:string>
-- !query output
gOOgle
-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT
company,
CASE
WHEN company ILIKE ALL ('%oo%', '%GO%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ALL ('%OO%', 'go%') OR company ILIKE ALL ('%IN', 'ms%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_all_table
-- !query schema
struct<company:string,is_available:string,mix:string>
-- !query output
facebook N N
gOOgle Y Y
liNkedin N N
-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%oo%', pat)
-- !query schema
struct<company:string>
-- !query output
facebook
gOOgle
-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', '%In', 'Fa%')
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
facebook
gOOgle
liNkedin
-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%oo%', 'fA%')
-- !query schema
struct<company:string>
-- !query output
liNkedin
-- !query
SELECT company FROM ilike_all_table WHERE NOT company ILIKE ALL ('%oO%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
gOOgle
liNkedin
-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL ('%OO%', NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL ('%Oo%', NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_all_table WHERE company ILIKE ALL (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_all_table WHERE company NOT ILIKE ALL (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ALL ()
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.parser.ParseException
Expected something between '(' and ')'.(line 1, pos 50)
== SQL ==
SELECT company FROM ilike_any_table WHERE company ILIKE ALL ()
--------------------------------------------------^^^

View file

@ -0,0 +1,146 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 14
-- !query
CREATE OR REPLACE TEMPORARY VIEW ilike_any_table AS SELECT * FROM (VALUES
('Google', '%Oo%'),
('FaceBook', '%oO%'),
('linkedIn', '%IN'))
as t1(company, pat)
-- !query schema
struct<>
-- !query output
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%oo%', '%IN', 'fA%')
-- !query schema
struct<company:string>
-- !query output
FaceBook
Google
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('microsoft', '%yoo%')
-- !query schema
struct<company:string>
-- !query output
-- !query
select
company,
CASE
WHEN company ILIKE ANY ('%oO%', '%IN', 'Fa%') THEN 'Y'
ELSE 'N'
END AS is_available,
CASE
WHEN company ILIKE ANY ('%OO%', 'fa%') OR company ILIKE ANY ('%in', 'MS%') THEN 'Y'
ELSE 'N'
END AS mix
FROM ilike_any_table
-- !query schema
struct<company:string,is_available:string,mix:string>
-- !query output
FaceBook Y Y
Google Y Y
linkedIn Y Y
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%zZ%', pat)
-- !query schema
struct<company:string>
-- !query output
FaceBook
Google
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oO%', '%iN', 'fa%')
-- !query schema
struct<company:string>
-- !query output
FaceBook
Google
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('microsoft', '%yOo%')
-- !query schema
struct<company:string>
-- !query output
FaceBook
Google
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oo%', 'Fa%')
-- !query schema
struct<company:string>
-- !query output
Google
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE NOT company ILIKE ANY ('%OO%', 'fa%')
-- !query schema
struct<company:string>
-- !query output
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ('%oO%', NULL)
-- !query schema
struct<company:string>
-- !query output
FaceBook
Google
-- !query
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY ('%oo%', NULL)
-- !query schema
struct<company:string>
-- !query output
linkedIn
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ANY (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_any_table WHERE company NOT ILIKE ANY (NULL, NULL)
-- !query schema
struct<company:string>
-- !query output
-- !query
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ()
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.parser.ParseException
Expected something between '(' and ')'.(line 1, pos 50)
== SQL ==
SELECT company FROM ilike_any_table WHERE company ILIKE ANY ()
--------------------------------------------------^^^

View file

@ -127,7 +127,7 @@ struct<company:string>
-- !query -- !query
SELECT company FROM like_any_table WHERE company LIKE ALL () SELECT company FROM like_all_table WHERE company LIKE ALL ()
-- !query schema -- !query schema
struct<> struct<>
-- !query output -- !query output
@ -136,5 +136,5 @@ org.apache.spark.sql.catalyst.parser.ParseException
Expected something between '(' and ')'.(line 1, pos 49) Expected something between '(' and ')'.(line 1, pos 49)
== SQL == == SQL ==
SELECT company FROM like_any_table WHERE company LIKE ALL () SELECT company FROM like_all_table WHERE company LIKE ALL ()
-------------------------------------------------^^^ -------------------------------------------------^^^