[SPARK-26339][SQL] Throws better exception when reading files that start with underscore

## What changes were proposed in this pull request?
My pull request #23288 was resolved and merged to master, but it turned out later that my change broke another regression test. Because a merged pull request cannot be reopened, I am creating a new pull request here.
Commit 92934b4 is the only change on top of pull request #23288.
The `checkFilesExist` check was skipped in 239cfa4, following the discussion at #23288 (comment).
However, that change turned out to be wrong, because the check must not run when the `checkFilesExist` argument is false.

The test at 27e42c1de5/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala (L2555) failed when the `checkFilesExist` check was skipped, but succeeds after commit 92934b4.
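As an illustration of the resulting behavior, here is a minimal spark-shell sketch (the path `/tmp/_cars.csv` is a hypothetical stand-in for any input file whose name starts with an underscore):

```scala
import org.apache.spark.sql.AnalysisException

try {
  // Leaf files whose names start with "_" are treated as hidden/metadata
  // files and filtered out when input paths are resolved. If every given
  // path is filtered out, the read now fails fast with a clear message.
  spark.read.csv("/tmp/_cars.csv")
} catch {
  case e: AnalysisException =>
    println(e.getMessage) // expected to contain "All paths were ignored:"
}
```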

## How was this patch tested?
Both of the tests below passed:
```
testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite
testOnly org.apache.spark.sql.SQLQuerySuite
```
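For reference, these can be run from a Spark source checkout roughly as follows (a sketch assuming the standard sbt build; the `sql/` module prefix is the usual convention, not something stated in this PR):
```
build/sbt "sql/testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite"
build/sbt "sql/testOnly org.apache.spark.sql.SQLQuerySuite"
```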

Closes #23446 from KeiichiHirobe/SPARK-26339.

Authored-by: Hirobe Keiichi <keiichi_hirobe@forcia.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
Hirobe Keiichi authored on 2019-01-06 08:52:09 -06:00; committed by Sean Owen
parent 737f08949a
commit 9d8e9b394b
3 changed files with 45 additions and 1 deletion

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala:

```diff
@@ -543,7 +543,7 @@ case class DataSource(
       checkFilesExist: Boolean): Seq[Path] = {
     val allPaths = caseInsensitiveOptions.get("path") ++ paths
     val hadoopConf = sparkSession.sessionState.newHadoopConf()
-    allPaths.flatMap { path =>
+    val allGlobPath = allPaths.flatMap { path =>
       val hdfsPath = new Path(path)
       val fs = hdfsPath.getFileSystem(hadoopConf)
       val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
@@ -560,6 +560,23 @@ case class DataSource(
       }
       globPath
     }.toSeq
+
+    if (checkFilesExist) {
+      val (filteredOut, filteredIn) = allGlobPath.partition { path =>
+        InMemoryFileIndex.shouldFilterOut(path.getName)
+      }
+      if (filteredOut.nonEmpty) {
+        if (filteredIn.isEmpty) {
+          throw new AnalysisException(
+            s"All paths were ignored:\n${filteredOut.mkString("\n  ")}")
+        } else {
+          logDebug(
+            s"Some paths were ignored:\n${filteredOut.mkString("\n  ")}")
+        }
+      }
+    }
+    allGlobPath
   }
 }
```
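The heart of the fix is the `partition` above: ignored paths are split from usable ones, and an `AnalysisException` is thrown only when nothing usable remains. Note that `allGlobPath` is returned unfiltered; the actual filtering is applied elsewhere (in `InMemoryFileIndex`). A self-contained sketch of that decision logic, with `looksHidden` as a simplified stand-in for `InMemoryFileIndex.shouldFilterOut`:

```scala
object PathCheckSketch {
  // Simplified stand-in for InMemoryFileIndex.shouldFilterOut, which also
  // exempts Parquet's _metadata/_common_metadata and skips *._COPYING_ files.
  def looksHidden(name: String): Boolean =
    name.startsWith("_") || name.startsWith(".")

  def checkPaths(allGlobPath: Seq[String]): Seq[String] = {
    val (filteredOut, filteredIn) = allGlobPath.partition(looksHidden)
    if (filteredOut.nonEmpty) {
      if (filteredIn.isEmpty) {
        // Every path would be ignored: fail fast with an explicit message.
        throw new IllegalArgumentException(
          s"All paths were ignored:\n  ${filteredOut.mkString("\n  ")}")
      } else {
        // Some real inputs remain: only note the ignored ones.
        println(s"Some paths were ignored:\n  ${filteredOut.mkString("\n  ")}")
      }
    }
    allGlobPath // returned as-is; filtering happens later, during file listing
  }

  def main(args: Array[String]): Unit = {
    println(checkPaths(Seq("cars.csv", "_cars.csv"))) // prints notice, returns both
    // checkPaths(Seq("_cars.csv"))                   // would throw
  }
}
```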

sql/core/src/test/resources/test-data/_cars.csv (new file):

```diff
@@ -0,0 +1,7 @@
+year,make,model,comment,blank
+"2012","Tesla","S","No comment",
+1997,Ford,E350,"Go get one now they are going fast",
+2015,Chevy,Volt
```

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala:

```diff
@@ -53,6 +53,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   private val carsEmptyValueFile = "test-data/cars-empty-value.csv"
   private val carsBlankColName = "test-data/cars-blank-column-name.csv"
   private val carsCrlf = "test-data/cars-crlf.csv"
+  private val carsFilteredOutFile = "test-data/_cars.csv"
   private val emptyFile = "test-data/empty.csv"
   private val commentsFile = "test-data/comments.csv"
   private val disableCommentsFile = "test-data/disable_comments.csv"
@@ -346,6 +347,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     assert(result.schema.fieldNames.size === 1)
   }
 
+  test("SPARK-26339 Not throw an exception if some of specified paths are filtered in") {
+    val cars = spark
+      .read
+      .option("header", "false")
+      .csv(testFile(carsFile), testFile(carsFilteredOutFile))
+
+    verifyCars(cars, withHeader = false, checkTypes = false)
+  }
+
+  test("SPARK-26339 Throw an exception only if all of the specified paths are filtered out") {
+    val e = intercept[AnalysisException] {
+      val cars = spark
+        .read
+        .option("header", "false")
+        .csv(testFile(carsFilteredOutFile))
+    }.getMessage
+
+    assert(e.contains("All paths were ignored:"))
+  }
+
test("DDL test with empty file") {
withView("carsTable") {
spark.sql(
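To run only the two new tests, one option is ScalaTest's substring filter (a sketch assuming the standard sbt setup; `-z` is a stock ScalaTest flag, not something added by this PR):
```
build/sbt "sql/testOnly org.apache.spark.sql.execution.datasources.csv.CSVSuite -- -z SPARK-26339"
```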