From 9d8e9b394bbc065a72076585a21393f42ce86cd1 Mon Sep 17 00:00:00 2001 From: Hirobe Keiichi Date: Sun, 6 Jan 2019 08:52:09 -0600 Subject: [PATCH] [SPARK-26339][SQL] Throws better exception when reading files that start with underscore ## What changes were proposed in this pull request? My pull request #23288 was resolved and merged to master, but it later turned out that my change broke another regression test. Because a merged pull request cannot be reopened, I am creating a new pull request here. Commit 92934b4 is the only change since pull request #23288. The `checkFilesExist` guard was dropped in 239cfa4 after the discussion in #23288 (comment). However, that change turned out to be wrong, because the check must not run when the argument `checkFilesExist` is false. The test https://github.com/apache/spark/blob/27e42c1de502da80fa3e22bb69de47fb00158174/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala#L2555 failed when the `checkFilesExist` guard was dropped, but now succeeds after commit 92934b4. ## How was this patch tested? Both of the tests below passed. ``` testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite testOnly org.apache.spark.sql.SQLQuerySuite ``` Closes #23446 from KeiichiHirobe/SPARK-26339. 
Authored-by: Hirobe Keiichi Signed-off-by: Sean Owen --- .../execution/datasources/DataSource.scala | 19 +++++++++++++++++- .../src/test/resources/test-data/_cars.csv | 7 +++++++ .../execution/datasources/csv/CSVSuite.scala | 20 +++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/test/resources/test-data/_cars.csv diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index fefff68c4b..2a438a5cbf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -543,7 +543,7 @@ case class DataSource( checkFilesExist: Boolean): Seq[Path] = { val allPaths = caseInsensitiveOptions.get("path") ++ paths val hadoopConf = sparkSession.sessionState.newHadoopConf() - allPaths.flatMap { path => + val allGlobPath = allPaths.flatMap { path => val hdfsPath = new Path(path) val fs = hdfsPath.getFileSystem(hadoopConf) val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) @@ -560,6 +560,23 @@ case class DataSource( } globPath }.toSeq + + if (checkFilesExist) { + val (filteredOut, filteredIn) = allGlobPath.partition { path => + InMemoryFileIndex.shouldFilterOut(path.getName) + } + if (filteredOut.nonEmpty) { + if (filteredIn.isEmpty) { + throw new AnalysisException( + s"All paths were ignored:\n${filteredOut.mkString("\n ")}") + } else { + logDebug( + s"Some paths were ignored:\n${filteredOut.mkString("\n ")}") + } + } + } + + allGlobPath } } diff --git a/sql/core/src/test/resources/test-data/_cars.csv b/sql/core/src/test/resources/test-data/_cars.csv new file mode 100644 index 0000000000..40ded573ad --- /dev/null +++ b/sql/core/src/test/resources/test-data/_cars.csv @@ -0,0 +1,7 @@ + +year,make,model,comment,blank +"2012","Tesla","S","No comment", + 
+1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index d9e5d7af19..fb1bedfaa3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -53,6 +53,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te private val carsEmptyValueFile = "test-data/cars-empty-value.csv" private val carsBlankColName = "test-data/cars-blank-column-name.csv" private val carsCrlf = "test-data/cars-crlf.csv" + private val carsFilteredOutFile = "test-data/_cars.csv" private val emptyFile = "test-data/empty.csv" private val commentsFile = "test-data/comments.csv" private val disableCommentsFile = "test-data/disable_comments.csv" @@ -346,6 +347,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te assert(result.schema.fieldNames.size === 1) } + test("SPARK-26339 Not throw an exception if some of specified paths are filtered in") { + val cars = spark + .read + .option("header", "false") + .csv(testFile(carsFile), testFile(carsFilteredOutFile)) + + verifyCars(cars, withHeader = false, checkTypes = false) + } + + test("SPARK-26339 Throw an exception only if all of the specified paths are filtered out") { + val e = intercept[AnalysisException] { + val cars = spark + .read + .option("header", "false") + .csv(testFile(carsFilteredOutFile)) + }.getMessage + assert(e.contains("All paths were ignored:")) + } + test("DDL test with empty file") { withView("carsTable") { spark.sql(