[SPARK-34339][CORE][SQL] Expose the number of total paths in Utils.buildLocationMetadata()
### What changes were proposed in this pull request? This PR proposes to expose the number of total paths in Utils.buildLocationMetadata(), while relaxing space usage a bit (around 10+ chars). Suppose only the first 2 of 5 paths fit within the threshold; the outputs for the two cases are shown below: * before the change: `[path1, path2]` * after the change: `(5 paths)[path1, path2, ...]` ### Why are the changes needed? SPARK-31793 silently truncates the paths, hence end users can't tell how many paths were truncated, or even whether any paths were truncated at all. ### Does this PR introduce _any_ user-facing change? Yes, the location metadata will now also show how many paths exist in total (so truncation is visible), instead of truncating silently. ### How was this patch tested? Modified UTs Closes #31464 from HeartSaVioR/SPARK-34339. Authored-by: Jungtaek Lim (HeartSaVioR) <kabhwan.opensource@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
parent
7675582dab
commit
fbe726f5b1
|
@ -2980,7 +2980,7 @@ private[spark] object Utils extends Logging {
|
|||
* exceeds `stopAppendingThreshold`, stop appending paths for saving memory.
|
||||
*/
|
||||
def buildLocationMetadata(paths: Seq[Path], stopAppendingThreshold: Int): String = {
|
||||
val metadata = new StringBuilder("[")
|
||||
val metadata = new StringBuilder(s"(${paths.length} paths)[")
|
||||
var index: Int = 0
|
||||
while (index < paths.length && metadata.length < stopAppendingThreshold) {
|
||||
if (index > 0) {
|
||||
|
@ -2989,6 +2989,12 @@ private[spark] object Utils extends Logging {
|
|||
metadata.append(paths(index).toString)
|
||||
index += 1
|
||||
}
|
||||
if (paths.length > index) {
|
||||
if (index > 0) {
|
||||
metadata.append(", ")
|
||||
}
|
||||
metadata.append("...")
|
||||
}
|
||||
metadata.append("]")
|
||||
metadata.toString
|
||||
}
|
||||
|
|
|
@ -1304,16 +1304,11 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
|
|||
|
||||
test("pathsToMetadata") {
|
||||
val paths = (0 to 4).map(i => new Path(s"path$i"))
|
||||
assert(Utils.buildLocationMetadata(paths, 5) == "[path0]")
|
||||
assert(Utils.buildLocationMetadata(paths, 10) == "[path0, path1]")
|
||||
assert(Utils.buildLocationMetadata(paths, 15) == "[path0, path1, path2]")
|
||||
assert(Utils.buildLocationMetadata(paths, 25) == "[path0, path1, path2, path3]")
|
||||
|
||||
// edge-case: we should consider the fact non-path chars including '[' and ", " are accounted
|
||||
// 1. second path is not added due to the addition of '['
|
||||
assert(Utils.buildLocationMetadata(paths, 6) == "[path0]")
|
||||
// 2. third path is not added due to the addition of ", "
|
||||
assert(Utils.buildLocationMetadata(paths, 13) == "[path0, path1]")
|
||||
assert(Utils.buildLocationMetadata(paths, 10) == "(5 paths)[...]")
|
||||
// 11 is the minimum threshold to print at least one path
|
||||
assert(Utils.buildLocationMetadata(paths, 11) == "(5 paths)[path0, ...]")
|
||||
// 11 + 5 + 2 = 18 is the minimum threshold to print two paths
|
||||
assert(Utils.buildLocationMetadata(paths, 18) == "(5 paths)[path0, path1, ...]")
|
||||
}
|
||||
|
||||
test("checkHost supports both IPV4 and IPV6") {
|
||||
|
|
|
@ -2132,7 +2132,7 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper {
|
|||
|Output \\[2\\]: \\[value#xL, id#x\\]
|
||||
|DataFilters: \\[isnotnull\\(value#xL\\), \\(value#xL > 2\\)\\]
|
||||
|Format: avro
|
||||
|Location: InMemoryFileIndex\\[.*\\]
|
||||
|Location: InMemoryFileIndex\\([0-9]+ paths\\)\\[.*\\]
|
||||
|PartitionFilters: \\[isnotnull\\(id#x\\), \\(id#x > 1\\)\\]
|
||||
|PushedFilers: \\[IsNotNull\\(value\\), GreaterThan\\(value,2\\)\\]
|
||||
|ReadSchema: struct\\<value:bigint\\>
|
||||
|
|
|
@ -411,7 +411,7 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
|
|||
|Output \\[2\\]: \\[value#x, id#x\\]
|
||||
|DataFilters: \\[isnotnull\\(value#x\\), \\(value#x > 2\\)\\]
|
||||
|Format: $fmt
|
||||
|Location: InMemoryFileIndex\\[.*\\]
|
||||
|Location: InMemoryFileIndex\\([0-9]+ paths\\)\\[.*\\]
|
||||
|PartitionFilters: \\[isnotnull\\(id#x\\), \\(id#x > 1\\)\\]
|
||||
${pushFilterMaps.get(fmt).get}
|
||||
|ReadSchema: struct\\<value:int\\>
|
||||
|
|
|
@ -122,8 +122,6 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
|
|||
|
||||
test("SPARK-31793: FileSourceScanExec metadata should contain limited file paths") {
|
||||
withTempPath { path =>
|
||||
val dir = path.getCanonicalPath
|
||||
|
||||
// create a sub-directory with long name so that each root path will always exceed the limit
|
||||
// this is to ensure we always test the case for the path truncation
|
||||
val dataDirName = Random.alphanumeric.take(100).toList.mkString
|
||||
|
@ -146,6 +144,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
|
|||
// The location metadata should at least contain one path
|
||||
assert(location.get.contains(paths.head))
|
||||
|
||||
// The location metadata should have the number of root paths
|
||||
assert(location.get.contains("(10 paths)"))
|
||||
|
||||
// The location metadata should have bracket wrapping paths
|
||||
assert(location.get.indexOf('[') > -1)
|
||||
assert(location.get.indexOf(']') > -1)
|
||||
|
@ -155,7 +156,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
|
|||
location.get.indexOf('[') + 1, location.get.indexOf(']')).split(", ").toSeq
|
||||
|
||||
// the only one path should be available
|
||||
assert(pathsInLocation.size == 1)
|
||||
assert(pathsInLocation.size == 2)
|
||||
// indicator ("...") should be available
|
||||
assert(pathsInLocation.exists(_.contains("...")))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue