[SPARK-15454][SQL] Filter out files starting with _
## What changes were proposed in this pull request?
Many other systems (e.g. Impala) use _xxx files for staging, and Spark should not be reading those files.
## How was this patch tested?
Added a unit test case.
Author: Reynold Xin <rxin@databricks.com>
Closes #13227 from rxin/SPARK-15454.
This commit is contained in:
parent
0e70fd61b4
commit
dcac8e6f49
|
@ -341,11 +341,11 @@ private[sql] object HadoopFsRelation extends Logging {
|
|||
|
||||
/** Checks if we should filter out this path name. */
|
||||
def shouldFilterOut(pathName: String): Boolean = {
|
||||
// TODO: We should try to filter out all files/dirs starting with "." or "_".
|
||||
// The only reason that we are not doing it now is that Parquet needs to find those
|
||||
// metadata files from leaf files returned by this methods. We should refactor
|
||||
// this logic to not mix metadata files with data files.
|
||||
pathName == "_SUCCESS" || pathName == "_temporary" || pathName.startsWith(".")
|
||||
// We filter everything that starts with _ and ., except _common_metadata and _metadata
|
||||
// because Parquet needs to find those metadata files from leaf files returned by this method.
|
||||
// We should refactor this logic to not mix metadata files with data files.
|
||||
(pathName.startsWith("_") || pathName.startsWith(".")) &&
|
||||
!pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -39,4 +39,15 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {
|
|||
assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
|
||||
}
|
||||
}
|
||||
|
||||
test("file filtering") {
|
||||
assert(!HadoopFsRelation.shouldFilterOut("abcd"))
|
||||
assert(HadoopFsRelation.shouldFilterOut(".ab"))
|
||||
assert(HadoopFsRelation.shouldFilterOut("_cd"))
|
||||
|
||||
assert(!HadoopFsRelation.shouldFilterOut("_metadata"))
|
||||
assert(!HadoopFsRelation.shouldFilterOut("_common_metadata"))
|
||||
assert(HadoopFsRelation.shouldFilterOut("_ab_metadata"))
|
||||
assert(HadoopFsRelation.shouldFilterOut("_cd_common_metadata"))
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue