[SPARK-26339][SQL] Throws better exception when reading files that start with underscore

## What changes were proposed in this pull request?
My pull request #23288 was resolved and merged to master, but it turned out later that my change broke another regression test. Because a merged pull request cannot be reopened, I am creating a new pull request here.
Commit 92934b4 is the only change on top of pull request #23288.
The `checkFilesExist` check was skipped in 239cfa4, following the discussion at #23288 (comment).
However, that change turned out to be wrong, because the check must not run when the `checkFilesExist` argument is false.

The test at 27e42c1de5/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala (L2555) failed when the `checkFilesExist` check was skipped, but succeeds after commit 92934b4.
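As an illustration of the resulting behavior, here is a minimal spark-shell sketch (the path `/tmp/_cars.csv` is a hypothetical stand-in for any input file whose name starts with an underscore):

```scala
import org.apache.spark.sql.AnalysisException

try {
  // Leaf files whose names start with "_" are treated as hidden/metadata
  // files and filtered out when input paths are resolved. If every given
  // path is filtered out, the read now fails fast with a clear message.
  spark.read.csv("/tmp/_cars.csv")
} catch {
  case e: AnalysisException =>
    println(e.getMessage) // expected to contain "All paths were ignored:"
}
```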

## How was this patch tested?
Both of the tests below passed:
```
testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite
testOnly org.apache.spark.sql.SQLQuerySuite
```
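For reference, these can be run from a Spark source checkout roughly as follows (a sketch assuming the standard sbt build; the `sql/` module prefix is the usual convention, not something stated in this PR):
```
build/sbt "sql/testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite"
build/sbt "sql/testOnly org.apache.spark.sql.SQLQuerySuite"
```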

Closes #23446 from KeiichiHirobe/SPARK-26339.

Authored-by: Hirobe Keiichi <keiichi_hirobe@forcia.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
Hirobe Keiichi authored on 2019-01-06 08:52:09 -06:00; committed by Sean Owen
parent 737f08949a
commit 9d8e9b394b
3 changed files with 45 additions and 1 deletion

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala:

```diff
@@ -543,7 +543,7 @@ case class DataSource(
       checkFilesExist: Boolean): Seq[Path] = {
     val allPaths = caseInsensitiveOptions.get("path") ++ paths
     val hadoopConf = sparkSession.sessionState.newHadoopConf()
-    allPaths.flatMap { path =>
+    val allGlobPath = allPaths.flatMap { path =>
       val hdfsPath = new Path(path)
       val fs = hdfsPath.getFileSystem(hadoopConf)
       val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
@@ -560,6 +560,23 @@ case class DataSource(
       }
       globPath
     }.toSeq
+
+    if (checkFilesExist) {
+      val (filteredOut, filteredIn) = allGlobPath.partition { path =>
+        InMemoryFileIndex.shouldFilterOut(path.getName)
+      }
+      if (filteredOut.nonEmpty) {
+        if (filteredIn.isEmpty) {
+          throw new AnalysisException(
+            s"All paths were ignored:\n${filteredOut.mkString("\n  ")}")
+        } else {
+          logDebug(
+            s"Some paths were ignored:\n${filteredOut.mkString("\n  ")}")
+        }
+      }
+    }
+    allGlobPath
   }
 }
```
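The heart of the fix is the `partition` above: ignored paths are split from usable ones, and an `AnalysisException` is thrown only when nothing usable remains. Note that `allGlobPath` is returned unfiltered; the actual filtering is applied elsewhere (in `InMemoryFileIndex`). A self-contained sketch of that decision logic, with `looksHidden` as a simplified stand-in for `InMemoryFileIndex.shouldFilterOut`:

```scala
object PathCheckSketch {
  // Simplified stand-in for InMemoryFileIndex.shouldFilterOut, which also
  // exempts Parquet's _metadata/_common_metadata and skips *._COPYING_ files.
  def looksHidden(name: String): Boolean =
    name.startsWith("_") || name.startsWith(".")

  def checkPaths(allGlobPath: Seq[String]): Seq[String] = {
    val (filteredOut, filteredIn) = allGlobPath.partition(looksHidden)
    if (filteredOut.nonEmpty) {
      if (filteredIn.isEmpty) {
        // Every path would be ignored: fail fast with an explicit message.
        throw new IllegalArgumentException(
          s"All paths were ignored:\n  ${filteredOut.mkString("\n  ")}")
      } else {
        // Some real inputs remain: only note the ignored ones.
        println(s"Some paths were ignored:\n  ${filteredOut.mkString("\n  ")}")
      }
    }
    allGlobPath // returned as-is; filtering happens later, during file listing
  }

  def main(args: Array[String]): Unit = {
    println(checkPaths(Seq("cars.csv", "_cars.csv"))) // prints notice, returns both
    // checkPaths(Seq("_cars.csv"))                   // would throw
  }
}
```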

sql/core/src/test/resources/test-data/_cars.csv (new file):

```diff
@@ -0,0 +1,7 @@
+year,make,model,comment,blank
+"2012","Tesla","S","No comment",
+1997,Ford,E350,"Go get one now they are going fast",
+2015,Chevy,Volt
```

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala:

```diff
@@ -53,6 +53,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   private val carsEmptyValueFile = "test-data/cars-empty-value.csv"
   private val carsBlankColName = "test-data/cars-blank-column-name.csv"
   private val carsCrlf = "test-data/cars-crlf.csv"
+  private val carsFilteredOutFile = "test-data/_cars.csv"
   private val emptyFile = "test-data/empty.csv"
   private val commentsFile = "test-data/comments.csv"
   private val disableCommentsFile = "test-data/disable_comments.csv"
@@ -346,6 +347,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     assert(result.schema.fieldNames.size === 1)
   }
 
+  test("SPARK-26339 Not throw an exception if some of specified paths are filtered in") {
+    val cars = spark
+      .read
+      .option("header", "false")
+      .csv(testFile(carsFile), testFile(carsFilteredOutFile))
+
+    verifyCars(cars, withHeader = false, checkTypes = false)
+  }
+
+  test("SPARK-26339 Throw an exception only if all of the specified paths are filtered out") {
+    val e = intercept[AnalysisException] {
+      val cars = spark
+        .read
+        .option("header", "false")
+        .csv(testFile(carsFilteredOutFile))
+    }.getMessage
+
+    assert(e.contains("All paths were ignored:"))
+  }
+
test("DDL test with empty file") {
withView("carsTable") {
spark.sql(
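To run only the two new tests, one option is ScalaTest's substring filter (a sketch assuming the standard sbt setup; `-z` is a stock ScalaTest flag, not something added by this PR):
```
build/sbt "sql/testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite -- -z SPARK-26339"
```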