[SPARK-19017][SQL] NOT IN subquery with more than one column may return incorrect results

## What changes were proposed in this pull request?

This PR fixes the code in Optimizer phase where the NULL-aware expression of a NOT IN query is expanded in Rule `RewritePredicateSubquery`.

Example:
The query

 select a1,b1
 from   t1
 where  (a1,b1) not in (select a2,b2
                        from   t2);

has the (a1, b1) = (a2, b2) rewritten from (before this fix):

Join LeftAnti, ((isnull((_1#2 = a2#16)) || isnull((_2#3 = b2#17))) || ((_1#2 = a2#16) && (_2#3 = b2#17)))

to (after this fix):

Join LeftAnti, (((_1#2 = a2#16) || isnull((_1#2 = a2#16))) && ((_2#3 = b2#17) || isnull((_2#3 = b2#17))))

## How was this patch tested?

sql/test, catalyst/test and new test cases in SQLQueryTestSuite.

Author: Nattavut Sutyanyong <nsy.can@gmail.com>

Closes #16467 from nsyca/19017.
This commit is contained in:
Nattavut Sutyanyong 2017-01-24 23:31:06 +01:00 committed by Herman van Hovell
parent 8f3f73abc1
commit cdb691eb4d
5 changed files with 131 additions and 6 deletions

View file

@ -68,8 +68,14 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper {
// Note that will almost certainly be planned as a Broadcast Nested Loop join.
// Use EXISTS if performance matters to you.
val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p)
val anyNull = splitConjunctivePredicates(joinCond.get).map(IsNull).reduceLeft(Or)
Join(outerPlan, sub, LeftAnti, Option(Or(anyNull, joinCond.get)))
// Expand the NOT IN expression with the NULL-aware semantic
// to its full form. That is from:
// (a1,b1,...) = (a2,b2,...)
// to
// (a1=a2 OR isnull(a1=a2)) AND (b1=b2 OR isnull(b1=b2)) AND ...
val joinConds = splitConjunctivePredicates(joinCond.get)
val pairs = joinConds.map(c => Or(c, IsNull(c))).reduceLeft(And)
Join(outerPlan, sub, LeftAnti, Option(pairs))
case (p, predicate) =>
val (newCond, inputPlan) = rewriteExistentialExpr(Seq(predicate), p)
Project(p.output, Filter(newCond.get, inputPlan))

View file

@ -0,0 +1,55 @@
-- This file contains test cases for NOT IN subquery with multiple columns.
-- The data sets are populated as follows:
-- 1) When T1.A1 = T2.A2
-- 1.1) T1.B1 = T2.B2
-- 1.2) T1.B1 = T2.B2 returns false
-- 1.3) T1.B1 is null
-- 1.4) T2.B2 is null
-- 2) When T1.A1 = T2.A2 returns false
-- 3) When T1.A1 is null
-- 4) When T1.A2 is null
-- T1.A1 T1.B1 T2.A2 T2.B2
-- ----- ----- ----- -----
-- 1 1 1 1 (1.1)
-- 1 3 (1.2)
-- 1 null 1 null (1.3 & 1.4)
--
-- 2 1 1 1 (2)
-- null 1 (3)
-- null 3 (4)
create temporary view t1 as select * from values
(1, 1), (2, 1), (null, 1),
(1, 3), (null, 3),
(1, null), (null, 2)
as t1(a1, b1);
create temporary view t2 as select * from values
(1, 1),
(null, 3),
(1, null)
as t2(a2, b2);
-- multiple columns in NOT IN
-- TC 01.01
select a1,b1
from t1
where (a1,b1) not in (select a2,b2
from t2);
-- multiple columns with expressions in NOT IN
-- TC 01.02
select a1,b1
from t1
where (a1-1,b1) not in (select a2,b2
from t2);
-- multiple columns with expressions in NOT IN
-- TC 01.02
select a1,b1
from t1
where (a1,b1) not in (select a2+1,b2
from t2);

View file

@ -0,0 +1,59 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 5
-- !query 0
create temporary view t1 as select * from values
(1, 1), (2, 1), (null, 1),
(1, 3), (null, 3),
(1, null), (null, 2)
as t1(a1, b1)
-- !query 0 schema
struct<>
-- !query 0 output
-- !query 1
create temporary view t2 as select * from values
(1, 1),
(null, 3),
(1, null)
as t2(a2, b2)
-- !query 1 schema
struct<>
-- !query 1 output
-- !query 2
select a1,b1
from t1
where (a1,b1) not in (select a2,b2
from t2)
-- !query 2 schema
struct<a1:int,b1:int>
-- !query 2 output
2 1
-- !query 3
select a1,b1
from t1
where (a1-1,b1) not in (select a2,b2
from t2)
-- !query 3 schema
struct<a1:int,b1:int>
-- !query 3 output
1 1
-- !query 4
select a1,b1
from t1
where (a1,b1) not in (select a2+1,b2
from t2)
-- !query 4 schema
struct<a1:int,b1:int>
-- !query 4 output
1 1

View file

@ -163,7 +163,12 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
s"-- Number of queries: ${outputs.size}\n\n\n" +
outputs.zipWithIndex.map{case (qr, i) => qr.toString(i)}.mkString("\n\n\n") + "\n"
}
stringToFile(new File(testCase.resultFile), goldenOutput)
val resultFile = new File(testCase.resultFile);
val parent = resultFile.getParentFile();
if (!parent.exists()) {
assert(parent.mkdirs(), "Could not create directory: " + parent)
}
stringToFile(resultFile, goldenOutput)
}
// Read back the golden file.

View file

@ -263,12 +263,12 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
Row(1, 2.0) :: Row(1, 2.0) :: Nil)
checkAnswer(
sql("select * from l where a not in (select c from t where b < d)"),
Row(1, 2.0) :: Row(1, 2.0) :: Row(3, 3.0) :: Nil)
sql("select * from l where (a, b) not in (select c, d from t) and a < 4"),
Row(1, 2.0) :: Row(1, 2.0) :: Row(2, 1.0) :: Row(2, 1.0) :: Row(3, 3.0) :: Nil)
// Empty sub-query
checkAnswer(
sql("select * from l where a not in (select c from r where c > 10 and b < d)"),
sql("select * from l where (a, b) not in (select c, d from r where c > 10)"),
Row(1, 2.0) :: Row(1, 2.0) :: Row(2, 1.0) :: Row(2, 1.0) ::
Row(3, 3.0) :: Row(null, null) :: Row(null, 5.0) :: Row(6, null) :: Nil)