[SPARK-15454][SQL] Filter out files starting with _

## What changes were proposed in this pull request?
Many other systems (e.g. Impala) use `_xxx` files for staging, and Spark should not read those files.

## How was this patch tested?
Added a unit test case.

Author: Reynold Xin <rxin@databricks.com>

Closes #13227 from rxin/SPARK-15454.
This commit is contained in:
Reynold Xin 2016-05-20 14:49:54 -07:00
parent 0e70fd61b4
commit dcac8e6f49
2 changed files with 16 additions and 5 deletions

View file

@ -341,11 +341,11 @@ private[sql] object HadoopFsRelation extends Logging {
/** Checks if we should filter out this path name. */
def shouldFilterOut(pathName: String): Boolean = {
  // Returns true when `pathName` should be skipped and false when it should be kept.
  //
  // Names starting with "." or "_" are treated as hidden/staging entries and are skipped,
  // with two exceptions that are kept:
  //  1. Names starting with "_metadata" or "_common_metadata", because Parquet needs to find
  //     those metadata files among the leaf files. We should refactor this logic to not mix
  //     metadata files with data files.
  //  2. Names starting with "_" that contain "=", so that a `column=value` style directory
  //     whose column name begins with an underscore (e.g. "_part=1") is not dropped.
  //     NOTE(review): assumes this predicate is also applied to directory names -- confirm
  //     against the callers.
  val isHidden = pathName.startsWith(".") ||
    (pathName.startsWith("_") && !pathName.contains("="))
  val isParquetMetadata =
    pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata")
  isHidden && !isParquetMetadata
}
/**

View file

@ -39,4 +39,15 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {
assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
}
}
test("file filtering") {
  // Each entry pairs a path name with whether shouldFilterOut is expected to reject it.
  val expectations = Seq(
    "abcd" -> false,
    ".ab" -> true,
    "_cd" -> true,
    "_metadata" -> false,
    "_common_metadata" -> false,
    "_ab_metadata" -> true,
    "_cd_common_metadata" -> true
  )
  expectations.foreach { case (name, expectFiltered) =>
    // The clue names the offending path, since the loop hides it from the assertion text.
    assert(
      HadoopFsRelation.shouldFilterOut(name) === expectFiltered,
      s"for path name '$name'")
  }
}
}