diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java index 6549cac011..e5e61aae92 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java @@ -18,25 +18,11 @@ package org.apache.spark.network.shuffle; import java.io.File; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import com.google.common.annotations.VisibleForTesting; - -import org.apache.commons.lang3.SystemUtils; import org.apache.spark.network.util.JavaUtils; public class ExecutorDiskUtils { - private static final Pattern MULTIPLE_SEPARATORS; - static { - if (SystemUtils.IS_OS_WINDOWS) { - MULTIPLE_SEPARATORS = Pattern.compile("[/\\\\]+"); - } else { - MULTIPLE_SEPARATORS = Pattern.compile("/{2,}"); - } - } - /** * Hashes a filename into the corresponding local directory, in a manner consistent with * Spark's DiskBlockManager.getFile(). @@ -45,34 +31,16 @@ public class ExecutorDiskUtils { int hash = JavaUtils.nonNegativeHash(filename); String localDir = localDirs[hash % localDirs.length]; int subDirId = (hash / localDirs.length) % subDirsPerLocalDir; - return new File(createNormalizedInternedPathname( - localDir, String.format("%02x", subDirId), filename)); - } - - /** - * This method is needed to avoid the situation when multiple File instances for the - * same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String. - * According to measurements, in some scenarios such duplicate strings may waste a lot - * of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that - * we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise, - * the internal code in java.io.File would normalize it later, creating a new "foo/bar" - * String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File - * uses, since it is in the package-private class java.io.FileSystem. - * - * On Windows, separator "\" is used instead of "/". - * - * "\\" is a legal character in path name on Unix-like OS, but illegal on Windows. - */ - @VisibleForTesting - static String createNormalizedInternedPathname(String dir1, String dir2, String fname) { - String pathname = dir1 + File.separator + dir2 + File.separator + fname; - Matcher m = MULTIPLE_SEPARATORS.matcher(pathname); - pathname = m.replaceAll(Matcher.quoteReplacement(File.separator)); - // A single trailing slash needs to be taken care of separately - if (pathname.length() > 1 && pathname.charAt(pathname.length() - 1) == File.separatorChar) { - pathname = pathname.substring(0, pathname.length() - 1); - } - return pathname.intern(); + final String notNormalizedPath = + localDir + File.separator + String.format("%02x", subDirId) + File.separator + filename; + // Interning the normalized path as according to measurements, in some scenarios such + // duplicate strings may waste a lot of memory (~ 10% of the heap). + // Unfortunately, we cannot just call the normalization code that java.io.File + // uses, since it is in the package-private class java.io.FileSystem. + // So we are creating a File just to get the normalized path back to intern it. + // Finally a new File is built and returned with this interned normalized path. + final String normalizedInternedPath = new File(notNormalizedPath).getPath().intern(); + return new File(normalizedInternedPath); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java index ba1a17bf7e..a6bcbb8850 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java @@ -24,7 +24,6 @@ import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executor; import java.util.concurrent.Executors; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.lang3.builder.ToStringBuilder; @@ -71,8 +70,6 @@ public class ExternalShuffleBlockResolver { private static final String APP_KEY_PREFIX = "AppExecShuffleInfo"; private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0); - private static final Pattern MULTIPLE_SEPARATORS = Pattern.compile(File.separator + "{2,}"); - // Map containing all registered executors' metadata. @VisibleForTesting final ConcurrentMap executors; diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java index 6515b6ca03..88bcf43c23 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java @@ -17,7 +17,6 @@ package org.apache.spark.network.shuffle; -import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -25,7 +24,6 @@ import java.nio.charset.StandardCharsets; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.io.CharStreams; -import org.apache.commons.lang3.SystemUtils; import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; import org.apache.spark.network.util.MapConfigProvider; import org.apache.spark.network.util.TransportConf; @@ -145,29 +143,4 @@ public class ExternalShuffleBlockResolverSuite { assertEquals(shuffleInfo, mapper.readValue(legacyShuffleJson, ExecutorShuffleInfo.class)); } - @Test - public void testNormalizeAndInternPathname() { - String sep = File.separator; - String expectedPathname = sep + "foo" + sep + "bar" + sep + "baz"; - assertPathsMatch("/foo", "bar", "baz", expectedPathname); - assertPathsMatch("//foo/", "bar/", "//baz", expectedPathname); - assertPathsMatch("/foo/", "/bar//", "/baz", expectedPathname); - assertPathsMatch("foo", "bar", "baz///", "foo" + sep + "bar" + sep + "baz"); - assertPathsMatch("/", "", "", sep); - assertPathsMatch("/", "/", "/", sep); - if (SystemUtils.IS_OS_WINDOWS) { - assertPathsMatch("/foo\\/", "bar", "baz", expectedPathname); - } else { - assertPathsMatch("/foo\\/", "bar", "baz", sep + "foo\\" + sep + "bar" + sep + "baz"); - } - } - - private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) { - String normPathname = - ExecutorDiskUtils.createNormalizedInternedPathname(p1, p2, p3); - assertEquals(expectedPathname, normPathname); - File file = new File(normPathname); - String returnedPath = file.getPath(); - assertEquals(normPathname, returnedPath); - } }