[SPARK-34339][CORE][SQL] Expose the number of total paths in Utils.buildLocationMetadata()

### What changes were proposed in this pull request?

This PR proposes to expose the total number of paths in `Utils.buildLocationMetadata()`, at the cost of slightly more space usage (around 10+ characters).

Suppose only the first 2 of 5 paths fit within the threshold; the outputs before and after the change are shown below:

* before the change: `[path1, path2]`
* after the change: `(5 paths)[path1, path2, ...]`

### Why are the changes needed?

SPARK-31793 silently truncates the paths, so end users cannot tell how many paths were truncated, or even whether any paths were truncated at all.

### Does this PR introduce _any_ user-facing change?

Yes, the location metadata now also shows the total number of paths and marks truncation with `...`, so users can tell how many paths are not shown, instead of the paths being silently truncated.

### How was this patch tested?

Modified the existing unit tests (UTs).

Closes #31464 from HeartSaVioR/SPARK-34339.

Authored-by: Jungtaek Lim (HeartSaVioR) <kabhwan.opensource@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
Jungtaek Lim (HeartSaVioR) 2021-02-05 09:37:38 +09:00 committed by HyukjinKwon
parent 7675582dab
commit fbe726f5b1
5 changed files with 20 additions and 16 deletions

View file

@ -2980,7 +2980,7 @@ private[spark] object Utils extends Logging {
* exceeds `stopAppendingThreshold`, stop appending paths for saving memory.
*/
def buildLocationMetadata(paths: Seq[Path], stopAppendingThreshold: Int): String = {
val metadata = new StringBuilder("[")
val metadata = new StringBuilder(s"(${paths.length} paths)[")
var index: Int = 0
while (index < paths.length && metadata.length < stopAppendingThreshold) {
if (index > 0) {
@ -2989,6 +2989,12 @@ private[spark] object Utils extends Logging {
metadata.append(paths(index).toString)
index += 1
}
if (paths.length > index) {
if (index > 0) {
metadata.append(", ")
}
metadata.append("...")
}
metadata.append("]")
metadata.toString
}

View file

@ -1304,16 +1304,11 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
test("pathsToMetadata") {
val paths = (0 to 4).map(i => new Path(s"path$i"))
assert(Utils.buildLocationMetadata(paths, 5) == "[path0]")
assert(Utils.buildLocationMetadata(paths, 10) == "[path0, path1]")
assert(Utils.buildLocationMetadata(paths, 15) == "[path0, path1, path2]")
assert(Utils.buildLocationMetadata(paths, 25) == "[path0, path1, path2, path3]")
// edge-case: we should consider the fact non-path chars including '[' and ", " are accounted
// 1. second path is not added due to the addition of '['
assert(Utils.buildLocationMetadata(paths, 6) == "[path0]")
// 2. third path is not added due to the addition of ", "
assert(Utils.buildLocationMetadata(paths, 13) == "[path0, path1]")
assert(Utils.buildLocationMetadata(paths, 10) == "(5 paths)[...]")
// 11 is the minimum threshold to print at least one path
assert(Utils.buildLocationMetadata(paths, 11) == "(5 paths)[path0, ...]")
// 11 + 5 + 2 = 18 is the minimum threshold to print two paths
assert(Utils.buildLocationMetadata(paths, 18) == "(5 paths)[path0, path1, ...]")
}
test("checkHost supports both IPV4 and IPV6") {

View file

@ -2132,7 +2132,7 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper {
|Output \\[2\\]: \\[value#xL, id#x\\]
|DataFilters: \\[isnotnull\\(value#xL\\), \\(value#xL > 2\\)\\]
|Format: avro
|Location: InMemoryFileIndex\\[.*\\]
|Location: InMemoryFileIndex\\([0-9]+ paths\\)\\[.*\\]
|PartitionFilters: \\[isnotnull\\(id#x\\), \\(id#x > 1\\)\\]
|PushedFilers: \\[IsNotNull\\(value\\), GreaterThan\\(value,2\\)\\]
|ReadSchema: struct\\<value:bigint\\>

View file

@ -411,7 +411,7 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
|Output \\[2\\]: \\[value#x, id#x\\]
|DataFilters: \\[isnotnull\\(value#x\\), \\(value#x > 2\\)\\]
|Format: $fmt
|Location: InMemoryFileIndex\\[.*\\]
|Location: InMemoryFileIndex\\([0-9]+ paths\\)\\[.*\\]
|PartitionFilters: \\[isnotnull\\(id#x\\), \\(id#x > 1\\)\\]
${pushFilterMaps.get(fmt).get}
|ReadSchema: struct\\<value:int\\>

View file

@ -122,8 +122,6 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
test("SPARK-31793: FileSourceScanExec metadata should contain limited file paths") {
withTempPath { path =>
val dir = path.getCanonicalPath
// create a sub-directory with long name so that each root path will always exceed the limit
// this is to ensure we always test the case for the path truncation
val dataDirName = Random.alphanumeric.take(100).toList.mkString
@ -146,6 +144,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
// The location metadata should at least contain one path
assert(location.get.contains(paths.head))
// The location metadata should have the number of root paths
assert(location.get.contains("(10 paths)"))
// The location metadata should have bracket wrapping paths
assert(location.get.indexOf('[') > -1)
assert(location.get.indexOf(']') > -1)
@ -155,7 +156,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
location.get.indexOf('[') + 1, location.get.indexOf(']')).split(", ").toSeq
// the only one path should be available
assert(pathsInLocation.size == 1)
assert(pathsInLocation.size == 2)
// indicator ("...") should be available
assert(pathsInLocation.exists(_.contains("...")))
}
}
}