From a744fea3be12f1a53ab553040b95da730210bc88 Mon Sep 17 00:00:00 2001 From: "Jungtaek Lim (HeartSaVioR)" Date: Wed, 28 Oct 2020 10:00:29 -0700 Subject: [PATCH] [SPARK-33267][SQL] Fix NPE issue on 'In' filter when one of values contains null ### What changes were proposed in this pull request? This PR proposes to fix the NPE issue on `In` filter when one of values contain null. In real case, you can trigger this issue when you try to push down the filter with `in (..., null)` against V2 source table. `DataSourceStrategy` caches the mapping (filter instance -> expression) in HashMap, which leverages hash code on the key, hence it could trigger the NPE issue. ### Why are the changes needed? This is an obvious bug as `In` filter doesn't care about null value when calculating hash code. ### Does this PR introduce _any_ user-facing change? Yes, previously the query with having `null` in "in" condition against data source V2 source table supporting push down filter failed with NPE, whereas after the PR the query will not fail. ### How was this patch tested? UT added. The new UT fails without the PR and passes with the PR. Closes #30170 from HeartSaVioR/SPARK-33267. Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/sources/filters.scala | 2 +- .../apache/spark/sql/connector/DataSourceV2Suite.scala | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala index 7533793253..2b44a3a861 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -164,7 +164,7 @@ case class In(attribute: String, values: Array[Any]) extends Filter { var h = attribute.hashCode values.foreach { v => h *= 41 - h += v.hashCode() + h += (if (v != null) v.hashCode() else 0) } h } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala index ec1ac00d08..ce28e61570 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala @@ -413,6 +413,16 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession with AdaptiveS } } } + + test("SPARK-33267: push down with condition 'in (..., null)' should not throw NPE") { + Seq(classOf[AdvancedDataSourceV2], classOf[JavaAdvancedDataSourceV2]).foreach { cls => + withClue(cls.getName) { + val df = spark.read.format(cls.getName).load() + // before SPARK-33267 below query just threw NPE + df.select('i).where("i in (1, null)").collect() + } + } + } }