[SPARK-25704][CORE] Allocate a bit less than Int.MaxValue

JVMs don't let you allocate arrays of length exactly Int.MaxValue, so leave
a little extra room. This is necessary when reading blocks >2GB off
the network (for remote reads or for cache replication).

Unit tests via Jenkins; also ran a test with blocks over 2 GB on a cluster.

Closes #22705 from squito/SPARK-25704.

Authored-by: Imran Rashid <irashid@cloudera.com>
Signed-off-by: Imran Rashid <irashid@cloudera.com>
Imran Rashid 2018-10-19 12:52:41 -05:00
parent 130121711c
commit 43717dee57
2 changed files with 11 additions and 11 deletions
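
As a quick illustration of the idea behind the patch (not part of the commit itself): each chunk allocation is now capped at ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH, a constant in org.apache.spark.unsafe.array that sits a little below Int.MaxValue, instead of a user-supplied maxChunkSize. A minimal sketch of that capping logic follows; the ChunkSizeSketch object and chunkSizeFor helper are made-up names for illustration only.

import org.apache.spark.unsafe.array.ByteArrayMethods

object ChunkSizeSketch {
  // Pick a per-chunk allocation size for a block of `length` bytes,
  // staying safely below the JVM's maximum array length.
  def chunkSizeFor(length: Long): Int =
    math.min(ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH.toLong, length).toInt

  def main(args: Array[String]): Unit = {
    println(chunkSizeFor(5L * 1024 * 1024 * 1024)) // a 5 GB block: chunks just under Int.MaxValue
    println(chunkSizeFor(1024))                    // a small block: one small chunk
  }
}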

core/src/main/scala/org/apache/spark/storage/BlockManager.scala

@@ -133,8 +133,6 @@ private[spark] class BlockManager(
   private[spark] val externalShuffleServiceEnabled =
     conf.get(config.SHUFFLE_SERVICE_ENABLED)
-  private val chunkSize =
-    conf.getSizeAsBytes("spark.storage.memoryMapLimitForTests", Int.MaxValue.toString).toInt
   private val remoteReadNioBufferConversion =
     conf.getBoolean("spark.network.remoteReadNioBufferConversion", false)
@@ -451,7 +449,7 @@ private[spark] class BlockManager(
             new EncryptedBlockData(tmpFile, blockSize, conf, key).toChunkedByteBuffer(allocator)
           case None =>
-            ChunkedByteBuffer.fromFile(tmpFile, conf.get(config.MEMORY_MAP_LIMIT_FOR_TESTS).toInt)
+            ChunkedByteBuffer.fromFile(tmpFile)
         }
         putBytes(blockId, buffer, level)(classTag)
         tmpFile.delete()
@@ -797,7 +795,7 @@ private[spark] class BlockManager(
         if (remoteReadNioBufferConversion) {
           return Some(new ChunkedByteBuffer(data.nioByteBuffer()))
         } else {
-          return Some(ChunkedByteBuffer.fromManagedBuffer(data, chunkSize))
+          return Some(ChunkedByteBuffer.fromManagedBuffer(data))
         }
       }
       logDebug(s"The value of block $blockId is null")

core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala

@@ -30,6 +30,7 @@ import org.apache.spark.internal.config
 import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
 import org.apache.spark.network.util.{ByteArrayWritableChannel, LimitedInputStream}
 import org.apache.spark.storage.StorageUtils
+import org.apache.spark.unsafe.array.ByteArrayMethods
 import org.apache.spark.util.Utils
 
 /**
@@ -169,24 +170,25 @@ private[spark] class ChunkedByteBuffer(var chunks: Array[ByteBuffer]) {
 }
 
-object ChunkedByteBuffer {
+private[spark] object ChunkedByteBuffer {
   // TODO eliminate this method if we switch BlockManager to getting InputStreams
-  def fromManagedBuffer(data: ManagedBuffer, maxChunkSize: Int): ChunkedByteBuffer = {
+  def fromManagedBuffer(data: ManagedBuffer): ChunkedByteBuffer = {
     data match {
       case f: FileSegmentManagedBuffer =>
-        fromFile(f.getFile, maxChunkSize, f.getOffset, f.getLength)
+        fromFile(f.getFile, f.getOffset, f.getLength)
       case other =>
         new ChunkedByteBuffer(other.nioByteBuffer())
     }
   }
 
-  def fromFile(file: File, maxChunkSize: Int): ChunkedByteBuffer = {
-    fromFile(file, maxChunkSize, 0, file.length())
+  def fromFile(file: File): ChunkedByteBuffer = {
+    fromFile(file, 0, file.length())
   }
 
   private def fromFile(
       file: File,
-      maxChunkSize: Int,
       offset: Long,
       length: Long): ChunkedByteBuffer = {
     // We do *not* memory map the file, because we may end up putting this into the memory store,
@@ -195,7 +197,7 @@ object ChunkedByteBuffer {
     val is = new FileInputStream(file)
     ByteStreams.skipFully(is, offset)
     val in = new LimitedInputStream(is, length)
-    val chunkSize = math.min(maxChunkSize, length).toInt
+    val chunkSize = math.min(ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH, length).toInt
     val out = new ChunkedByteBufferOutputStream(chunkSize, ByteBuffer.allocate _)
     Utils.tryWithSafeFinally {
       IOUtils.copy(in, out)
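
One hedged usage sketch to show what the copy loop above does: ChunkedByteBufferOutputStream splits whatever is written into it across buffers of at most chunkSize bytes, which is how a block larger than 2 GB ends up as several chunks that each stay under the JVM array limit. The example below uses a deliberately tiny chunk size; these classes are private[spark], so it assumes it is compiled inside an org.apache.spark subpackage, and ChunkedCopySketch is a made-up name.

package org.apache.spark.example

import java.io.ByteArrayInputStream
import java.nio.ByteBuffer

import org.apache.commons.io.IOUtils
import org.apache.spark.util.io.ChunkedByteBufferOutputStream

object ChunkedCopySketch {
  def main(args: Array[String]): Unit = {
    // 10 KB of input copied with a 4 KB chunk size ends up in 3 chunks.
    val in = new ByteArrayInputStream(new Array[Byte](10 * 1024))
    val out = new ChunkedByteBufferOutputStream(4096, ByteBuffer.allocate _)
    IOUtils.copy(in, out)
    out.close()
    val buffer = out.toChunkedByteBuffer
    println(s"chunks = ${buffer.getChunks().length}, bytes = ${buffer.size}")
  }
}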