From a5e13acd19871831a93a5bdcbc99a9eb9f1aba07 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 30 Nov 2020 11:24:15 +0900 Subject: [PATCH] [SPARK-33582][SQL] Hive Metastore support filter by not-equals ### What changes were proposed in this pull request? This PR makes partition predicate pushdown into the Hive metastore support the not-equals operator. Hive related changes: https://github.com/apache/hive/blob/b8bd4594bef718b1eeac9fceb437d7df7b480ed1/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/TestHiveMetaStore.java#L2194-L2207 https://issues.apache.org/jira/browse/HIVE-2702 ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #30534 from wangyum/SPARK-33582. Authored-by: Yuming Wang Signed-off-by: HyukjinKwon --- .../spark/sql/hive/client/HiveShim.scala | 8 ++++++++ .../spark/sql/hive/client/FiltersSuite.scala | 8 ++++++++ .../client/HivePartitionFilteringSuite.scala | 20 +++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 17a64a67df..ed088648bc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -812,6 +812,14 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { right <- convert(expr2) } yield s"($left or $right)" + case Not(EqualTo( + ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value))) if useAdvanced => + Some(s"$name != $value") + + case Not(EqualTo( + ExtractableLiteral(value), ExtractAttribute(SupportedAttribute(name)))) if useAdvanced => + Some(s"$value != $name") + case _ => None } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 6c0531182e..12ed0e5305 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -100,6 +100,14 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil, "(intcol = 1)") + filterTest("NOT: int and string filters", + (a("intcol", IntegerType) =!= Literal(1)) :: (Literal("a") =!= a("strcol", IntegerType)) :: Nil, + """intcol != 1 and "a" != strcol""") + + filterTest("NOT: date filter", + (a("datecol", DateType) =!= Literal(Date.valueOf("2019-01-01"))) :: Nil, + "datecol != 2019-01-01") + // Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization // will be applied by Catalyst, this filter converter does not need to account for this. filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE", diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index e07fbc29ee..dc56e6bc4d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -352,6 +352,26 @@ class HivePartitionFilteringSuite(version: String) dateStrValue) } + test("getPartitionsByFilter: ds<>20170101") { + testMetastorePartitionFiltering( + attr("ds") =!= 20170101, + 20170102 to 20170103, + hValue, + chunkValue, + dateValue, + dateStrValue) + } + + test("getPartitionsByFilter: h<>0 and chunk<>ab and d<>2019-01-01") { + testMetastorePartitionFiltering( + attr("h") =!= 0 && attr("chunk") =!= "ab" && attr("d") =!= Date.valueOf("2019-01-01"), + dsValue, + 1 to 4, + Seq("aa", "ba", "bb"), + 
Seq("2019-01-02", "2019-01-03"), + dateStrValue) + } + test("getPartitionsByFilter: d=2019-01-01") { testMetastorePartitionFiltering( attr("d") === Date.valueOf("2019-01-01"),