3ecb379430
This PR adds a hacky workaround for PARQUET-201, and should be removed once we upgrade to parquet-mr 1.8.1 or higher versions. In Parquet, not all types of columns can be used for filter push-down optimization. The set of valid column types is controlled by `ValidTypeMap`. Unfortunately, in parquet-mr 1.7.0 and prior versions, this limitation is too strict, and doesn't allow `BINARY (ENUM)` columns to be pushed down. On the other hand, `BINARY (ENUM)` is commonly seen in Parquet files written by libraries like `parquet-avro`. This restriction is problematic for Spark SQL, because Spark SQL doesn't have a type that maps to Parquet `BINARY (ENUM)` directly, and always converts `BINARY (ENUM)` to Catalyst `StringType`. Thus, a predicate involving a `BINARY (ENUM)` is recognized as one involving a string field instead and can be pushed down by the query optimizer. Such predicates are actually perfectly legal except that it fails the `ValidTypeMap` check. The workaround added here is relaxing `ValidTypeMap` to include `BINARY (ENUM)`. I also took the chance to simplify `ParquetCompatibilityTest` a little bit when adding regression test. Author: Cheng Lian <lian@databricks.com> Closes #8107 from liancheng/spark-9407/parquet-enum-filter-push-down.
61 lines
2 KiB
Thrift
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace java org.apache.spark.sql.execution.datasources.parquet.test.thrift
// Test enum. parquet-thrift persists Thrift enums as Parquet BINARY (ENUM),
// which is what the associated filter push-down regression test exercises.
enum Suit {
  SPADES,
  HEARTS,
  DIAMONDS,
  CLUBS
}
// Nested struct used as the map value type in ParquetThriftCompat.complexColumn.
struct Nested {
  1: required list<i32> nestedIntsColumn;
  2: required string nestedStringColumn;
}
/**
 * This is a test struct for testing parquet-thrift compatibility.
 */
struct ParquetThriftCompat {
  // Required primitive columns, one per basic Thrift type.
  1: required bool boolColumn;
  2: required byte byteColumn;
  3: required i16 shortColumn;
  4: required i32 intColumn;
  5: required i64 longColumn;
  6: required double doubleColumn;
  7: required binary binaryColumn;
  8: required string stringColumn;
  9: required Suit enumColumn;

  // Optional counterparts of the columns above.
  10: optional bool maybeBoolColumn;
  11: optional byte maybeByteColumn;
  12: optional i16 maybeShortColumn;
  13: optional i32 maybeIntColumn;
  14: optional i64 maybeLongColumn;
  15: optional double maybeDoubleColumn;
  16: optional binary maybeBinaryColumn;
  17: optional string maybeStringColumn;
  18: optional Suit maybeEnumColumn;

  // Collection columns, including a nested-struct map value.
  19: required list<string> stringsColumn;
  20: required set<i32> intSetColumn;
  21: required map<i32, string> intToStringColumn;
  22: required map<i32, list<Nested>> complexColumn;
}