[SPARK-24385][SQL] Resolve self-join condition ambiguity for EqualNullSafe

## What changes were proposed in this pull request?

`Dataset.join` contains a small hack that resolves column-name ambiguity in self-join conditions. The current code supports only `EqualTo`.

The PR extends the fix to `EqualNullSafe`.

Credit for this PR should be given to daniel-shields.

## How was this patch tested?

Added a unit test to `DataFrameJoinSuite`.

Author: Marco Gaido <marcogaido91@gmail.com>

Closes #21605 from mgaido91/SPARK-24385_2.
This commit is contained in:
Marco Gaido 2018-07-03 12:20:03 +08:00 committed by Wenchen Fan
parent 85fe1297e3
commit a7c8f0c8cb
2 changed files with 13 additions and 0 deletions

View file

@ -1016,6 +1016,11 @@ class Dataset[T] private[sql](
catalyst.expressions.EqualTo(
withPlan(plan.left).resolve(a.name),
withPlan(plan.right).resolve(b.name))
case catalyst.expressions.EqualNullSafe(a: AttributeReference, b: AttributeReference)
if a.sameRef(b) =>
catalyst.expressions.EqualNullSafe(
withPlan(plan.left).resolve(a.name),
withPlan(plan.right).resolve(b.name))
}}
withPlan {

View file

@ -287,4 +287,12 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext {
dfOne.join(dfTwo, $"a" === $"b", "left").queryExecution.optimizedPlan
}
}
test("SPARK-24385: Resolve ambiguity in self-joins with EqualNullSafe") {
  // Regression test: a Dataset is joined with itself on a null-safe equality (`<=>`)
  // between the same column, with the cross-join setting switched off.
  withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") {
    val ids = spark.range(2)
    // Both sides of the condition refer to the very same "id" column of `ids`.
    val nullSafeSelfEquality = ids("id") <=> ids("id")
    val selfJoined = ids.join(ids, nullSafeSelfEquality)
    // Forcing the optimized plan is the whole check: this threw an exception before the fix.
    selfJoined.queryExecution.optimizedPlan
  }
}
}