[SPARK-19017][SQL] NOT IN subquery with more than one column may return incorrect results
## What changes were proposed in this pull request? This PR fixes the code in Optimizer phase where the NULL-aware expression of a NOT IN query is expanded in Rule `RewritePredicateSubquery`. Example: The query select a1,b1 from t1 where (a1,b1) not in (select a2,b2 from t2); has the (a1, b1) = (a2, b2) rewritten from (before this fix): Join LeftAnti, ((isnull((_1#2 = a2#16)) || isnull((_2#3 = b2#17))) || ((_1#2 = a2#16) && (_2#3 = b2#17))) to (after this fix): Join LeftAnti, (((_1#2 = a2#16) || isnull((_1#2 = a2#16))) && ((_2#3 = b2#17) || isnull((_2#3 = b2#17)))) ## How was this patch tested? sql/test, catalyst/test and new test cases in SQLQueryTestSuite. Author: Nattavut Sutyanyong <nsy.can@gmail.com> Closes #16467 from nsyca/19017.
This commit is contained in:
parent
8f3f73abc1
commit
cdb691eb4d
|
@ -68,8 +68,14 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper {
|
|||
// Note that will almost certainly be planned as a Broadcast Nested Loop join.
|
||||
// Use EXISTS if performance matters to you.
|
||||
val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p)
|
||||
val anyNull = splitConjunctivePredicates(joinCond.get).map(IsNull).reduceLeft(Or)
|
||||
Join(outerPlan, sub, LeftAnti, Option(Or(anyNull, joinCond.get)))
|
||||
// Expand the NOT IN expression with the NULL-aware semantic
|
||||
// to its full form. That is from:
|
||||
// (a1,b1,...) = (a2,b2,...)
|
||||
// to
|
||||
// (a1=a2 OR isnull(a1=a2)) AND (b1=b2 OR isnull(b1=b2)) AND ...
|
||||
val joinConds = splitConjunctivePredicates(joinCond.get)
|
||||
val pairs = joinConds.map(c => Or(c, IsNull(c))).reduceLeft(And)
|
||||
Join(outerPlan, sub, LeftAnti, Option(pairs))
|
||||
case (p, predicate) =>
|
||||
val (newCond, inputPlan) = rewriteExistentialExpr(Seq(predicate), p)
|
||||
Project(p.output, Filter(newCond.get, inputPlan))
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
-- This file contains test cases for NOT IN subquery with multiple columns.
|
||||
|
||||
-- The data sets are populated as follows:
|
||||
-- 1) When T1.A1 = T2.A2
|
||||
-- 1.1) T1.B1 = T2.B2
|
||||
-- 1.2) T1.B1 = T2.B2 returns false
|
||||
-- 1.3) T1.B1 is null
|
||||
-- 1.4) T2.B2 is null
|
||||
-- 2) When T1.A1 = T2.A2 returns false
|
||||
-- 3) When T1.A1 is null
|
||||
-- 4) When T1.A2 is null
|
||||
|
||||
-- T1.A1 T1.B1 T2.A2 T2.B2
|
||||
-- ----- ----- ----- -----
|
||||
-- 1 1 1 1 (1.1)
|
||||
-- 1 3 (1.2)
|
||||
-- 1 null 1 null (1.3 & 1.4)
|
||||
--
|
||||
-- 2 1 1 1 (2)
|
||||
-- null 1 (3)
|
||||
-- null 3 (4)
|
||||
|
||||
create temporary view t1 as select * from values
|
||||
(1, 1), (2, 1), (null, 1),
|
||||
(1, 3), (null, 3),
|
||||
(1, null), (null, 2)
|
||||
as t1(a1, b1);
|
||||
|
||||
create temporary view t2 as select * from values
|
||||
(1, 1),
|
||||
(null, 3),
|
||||
(1, null)
|
||||
as t2(a2, b2);
|
||||
|
||||
-- multiple columns in NOT IN
|
||||
-- TC 01.01
|
||||
select a1,b1
|
||||
from t1
|
||||
where (a1,b1) not in (select a2,b2
|
||||
from t2);
|
||||
|
||||
-- multiple columns with expressions in NOT IN
|
||||
-- TC 01.02
|
||||
select a1,b1
|
||||
from t1
|
||||
where (a1-1,b1) not in (select a2,b2
|
||||
from t2);
|
||||
|
||||
-- multiple columns with expressions in NOT IN
|
||||
-- TC 01.02
|
||||
select a1,b1
|
||||
from t1
|
||||
where (a1,b1) not in (select a2+1,b2
|
||||
from t2);
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 5
|
||||
|
||||
|
||||
-- !query 0
|
||||
create temporary view t1 as select * from values
|
||||
(1, 1), (2, 1), (null, 1),
|
||||
(1, 3), (null, 3),
|
||||
(1, null), (null, 2)
|
||||
as t1(a1, b1)
|
||||
-- !query 0 schema
|
||||
struct<>
|
||||
-- !query 0 output
|
||||
|
||||
|
||||
|
||||
-- !query 1
|
||||
create temporary view t2 as select * from values
|
||||
(1, 1),
|
||||
(null, 3),
|
||||
(1, null)
|
||||
as t2(a2, b2)
|
||||
-- !query 1 schema
|
||||
struct<>
|
||||
-- !query 1 output
|
||||
|
||||
|
||||
|
||||
-- !query 2
|
||||
select a1,b1
|
||||
from t1
|
||||
where (a1,b1) not in (select a2,b2
|
||||
from t2)
|
||||
-- !query 2 schema
|
||||
struct<a1:int,b1:int>
|
||||
-- !query 2 output
|
||||
2 1
|
||||
|
||||
|
||||
-- !query 3
|
||||
select a1,b1
|
||||
from t1
|
||||
where (a1-1,b1) not in (select a2,b2
|
||||
from t2)
|
||||
-- !query 3 schema
|
||||
struct<a1:int,b1:int>
|
||||
-- !query 3 output
|
||||
1 1
|
||||
|
||||
|
||||
-- !query 4
|
||||
select a1,b1
|
||||
from t1
|
||||
where (a1,b1) not in (select a2+1,b2
|
||||
from t2)
|
||||
-- !query 4 schema
|
||||
struct<a1:int,b1:int>
|
||||
-- !query 4 output
|
||||
1 1
|
|
@ -163,7 +163,12 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
|
|||
s"-- Number of queries: ${outputs.size}\n\n\n" +
|
||||
outputs.zipWithIndex.map{case (qr, i) => qr.toString(i)}.mkString("\n\n\n") + "\n"
|
||||
}
|
||||
stringToFile(new File(testCase.resultFile), goldenOutput)
|
||||
val resultFile = new File(testCase.resultFile);
|
||||
val parent = resultFile.getParentFile();
|
||||
if (!parent.exists()) {
|
||||
assert(parent.mkdirs(), "Could not create directory: " + parent)
|
||||
}
|
||||
stringToFile(resultFile, goldenOutput)
|
||||
}
|
||||
|
||||
// Read back the golden file.
|
||||
|
|
|
@ -263,12 +263,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
|
|||
Row(1, 2.0) :: Row(1, 2.0) :: Nil)
|
||||
|
||||
checkAnswer(
|
||||
sql("select * from l where a not in (select c from t where b < d)"),
|
||||
Row(1, 2.0) :: Row(1, 2.0) :: Row(3, 3.0) :: Nil)
|
||||
sql("select * from l where (a, b) not in (select c, d from t) and a < 4"),
|
||||
Row(1, 2.0) :: Row(1, 2.0) :: Row(2, 1.0) :: Row(2, 1.0) :: Row(3, 3.0) :: Nil)
|
||||
|
||||
// Empty sub-query
|
||||
checkAnswer(
|
||||
sql("select * from l where a not in (select c from r where c > 10 and b < d)"),
|
||||
sql("select * from l where (a, b) not in (select c, d from r where c > 10)"),
|
||||
Row(1, 2.0) :: Row(1, 2.0) :: Row(2, 1.0) :: Row(2, 1.0) ::
|
||||
Row(3, 3.0) :: Row(null, null) :: Row(null, 5.0) :: Row(6, null) :: Nil)
|
||||
|
||||
|
|
Loading…
Reference in a new issue