From 8b73b92aadd685b29ef3d9b33366f5e1fd3dae99 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 15 Feb 2020 19:49:58 +0800 Subject: [PATCH] [SPARK-30826][SQL] Respect reference case in `StringStartsWith` pushed down to parquet ### What changes were proposed in this pull request? In the PR, I propose to convert the attribute name of `StringStartsWith` pushed down to the Parquet datasource to column reference via the `nameToParquetField` map. Similar conversions are performed for other source filters pushed down to parquet. ### Why are the changes needed? This fixes the bug described in [SPARK-30826](https://issues.apache.org/jira/browse/SPARK-30826). The query from an external table: ```sql CREATE TABLE t1 (col STRING) USING parquet OPTIONS (path '$path') ``` created on top of written parquet files by `Seq("42").toDF("COL").write.parquet(path)` returns wrong empty result: ```scala spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'").show +---+ |col| +---+ +---+ ``` ### Does this PR introduce any user-facing change? Yes. After the changes the result is correct for the example above: ```scala spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'").show +---+ |col| +---+ | 42| +---+ ``` ### How was this patch tested? Added a test to `ParquetFilterSuite` Closes #27574 from MaxGekk/parquet-StringStartsWith-case-sens. Authored-by: Maxim Gekk Signed-off-by: Wenchen Fan --- .../datasources/parquet/ParquetFilters.scala | 2 +- .../parquet/ParquetFilterSuite.scala | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index b9b86adb43..948a120e0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -591,7 +591,7 @@ class ParquetFilters( case sources.StringStartsWith(name, prefix) if pushDownStartWith && canMakeFilterOn(name, prefix) => Option(prefix).map { v => - FilterApi.userDefined(binaryColumn(name), + FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldName), new UserDefinedPredicate[Binary] with Serializable { private val strToBinary = Binary.fromReusedByteArray(v.getBytes) private val size = strToBinary.length diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 286bb1e920..4e0c1c2dbe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -1390,6 +1390,27 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } } } + + test("SPARK-30826: case insensitivity of StringStartsWith attribute") { + import testImplicits._ + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + withTable("t1") { + withTempPath { dir => + val path = dir.toURI.toString + Seq("42").toDF("COL").write.parquet(path) + spark.sql( + s""" + |CREATE TABLE t1 (col STRING) + |USING parquet + |OPTIONS (path '$path') + """.stripMargin) + checkAnswer( + spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'"), + Row("42")) + } + } + } + } } class ParquetV1FilterSuite extends ParquetFilterSuite {