Revert "[SPARK-35011][CORE] Avoid Block Manager registrations when StopExecutor msg is in-flight"
This reverts commit b9e53f8937.

### What changes were proposed in this pull request?

Revert https://github.com/apache/spark/pull/32114

### Why are the changes needed?

It breaks the expected `BlockManager` re-registration (e.g., on heartbeat loss of an active executor) due to the deferred removal of `BlockManager`; see the check at 9cefde8db3/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala (L551).

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Pass existing tests.

Closes #33959 from Ngone51/revert-35011-3.2.

Authored-by: yi.wu <yi.wu@databricks.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
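For context, here is a minimal sketch (simplified, hypothetical names; not the actual Spark source) of why the deferred removal matters: the heartbeat path only asks a `BlockManager` to re-register when the master no longer tracks it, so leaving a removed executor's entry in `blockManagerInfo` suppresses the re-registration this revert restores.

```scala
// Simplified sketch of the heartbeat-driven re-registration decision; the real logic
// lives in BlockManagerMasterHeartbeatEndpoint / BlockManagerMasterEndpoint.
import scala.collection.mutable

case class BlockManagerId(executorId: String)

class HeartbeatSketch {
  // Master-side view of tracked block managers (id -> last-seen timestamp).
  private val blockManagerInfo = mutable.Map.empty[BlockManagerId, Long]

  /** True while the block manager is tracked; false asks the executor to re-register. */
  def heartbeatReceived(id: BlockManagerId): Boolean =
    blockManagerInfo.get(id) match {
      case Some(_) =>
        blockManagerInfo(id) = System.currentTimeMillis() // refresh last-seen time
        true
      case None =>
        false // unknown to the master -> executor should re-register
    }

  /** Pre-SPARK-35011 (and after this revert): the entry is dropped immediately, so a
   *  live executor whose heartbeat was lost is told to re-register right away. With
   *  SPARK-35011 the entry lingered until a delayed cleaner removed it, which
   *  suppressed that expected re-registration. */
  def removeExecutor(id: BlockManagerId): Unit = blockManagerInfo.remove(id)
}
```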
parent b52fbeee2d
commit dcb57ab42b
@@ -80,7 +80,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)
   // executor ID -> timestamp of when the last heartbeat from this executor was received
   private val executorLastSeen = new HashMap[String, Long]
 
-  private val executorTimeoutMs = Utils.executorTimeoutMs(sc.conf)
+  private val executorTimeoutMs = sc.conf.get(
+    config.STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT
+  ).getOrElse(Utils.timeStringAsMs(s"${sc.conf.get(Network.NETWORK_TIMEOUT)}s"))
 
   private val checkTimeoutIntervalMs = sc.conf.get(Network.NETWORK_TIMEOUT_INTERVAL)
 
@@ -96,15 +96,6 @@ class BlockManagerMasterEndpoint(
     mapper
   }
 
-  private val executorTimeoutMs = Utils.executorTimeoutMs(conf)
-  private val blockManagerInfoCleaner = {
-    val cleaningDelay = Math.floorDiv(executorTimeoutMs, 2L)
-    val executor = ThreadUtils.newDaemonSingleThreadScheduledExecutor("blockManagerInfo-cleaner")
-    executor.scheduleWithFixedDelay(() => cleanBlockManagerInfo(), cleaningDelay, cleaningDelay,
-      TimeUnit.MILLISECONDS)
-    executor
-  }
-
   val proactivelyReplicate = conf.get(config.STORAGE_REPLICATION_PROACTIVE)
 
   val defaultRpcTimeout = RpcUtils.askRpcTimeout(conf)
@@ -282,12 +273,12 @@ class BlockManagerMasterEndpoint(
       }
     }
     bmIdsExecutor.foreach { bmId =>
-      aliveBlockManagerInfo(bmId).foreach { bmInfo =>
+      blockManagerInfo.get(bmId).foreach { bmInfo =>
         bmInfo.removeBlock(blockId)
       }
     }
-    val removeRddFromExecutorsFutures = allAliveBlockManagerInfos.map { bmInfo =>
+    val removeRddFromExecutorsFutures = blockManagerInfo.values.map { bmInfo =>
       bmInfo.storageEndpoint.ask[Int](removeMsg).recover {
         // use 0 as default value means no blocks were removed
         handleBlockRemovalFailure("RDD", rddId.toString, bmInfo.blockManagerId, 0)
@@ -314,7 +305,7 @@ class BlockManagerMasterEndpoint(
     // Nothing to do in the BlockManagerMasterEndpoint data structures
     val removeMsg = RemoveShuffle(shuffleId)
     Future.sequence(
-      allAliveBlockManagerInfos.map { bm =>
+      blockManagerInfo.values.map { bm =>
         bm.storageEndpoint.ask[Boolean](removeMsg).recover {
           // use false as default value means no shuffle data were removed
           handleBlockRemovalFailure("shuffle", shuffleId.toString, bm.blockManagerId, false)
@@ -330,7 +321,7 @@ class BlockManagerMasterEndpoint(
    */
   private def removeBroadcast(broadcastId: Long, removeFromDriver: Boolean): Future[Seq[Int]] = {
     val removeMsg = RemoveBroadcast(broadcastId, removeFromDriver)
-    val requiredBlockManagers = allAliveBlockManagerInfos.filter { info =>
+    val requiredBlockManagers = blockManagerInfo.values.filter { info =>
       removeFromDriver || !info.blockManagerId.isDriver
     }
     val futures = requiredBlockManagers.map { bm =>
@@ -346,24 +337,13 @@ class BlockManagerMasterEndpoint(
   private def removeBlockManager(blockManagerId: BlockManagerId): Unit = {
     val info = blockManagerInfo(blockManagerId)
 
-    // SPARK-35011: Not removing info from the blockManagerInfo map, but only setting the removal
-    // timestamp of the executor in BlockManagerInfo. This info will be removed from
-    // blockManagerInfo map by the blockManagerInfoCleaner once
-    // now() - info.executorRemovalTs > executorTimeoutMs.
-    //
-    // We are delaying the removal of BlockManagerInfo to avoid a BlockManager reregistration
-    // while a executor is shutting. This unwanted reregistration causes inconsistent bookkeeping
-    // of executors in Spark.
-    // Delaying this removal until blockManagerInfoCleaner decides to remove it ensures
-    // BlockManagerMasterHeartbeatEndpoint does not ask the BlockManager on a recently removed
-    // executor to reregister on BlockManagerHeartbeat message.
-    info.setExecutorRemovalTs()
-
     // Remove the block manager from blockManagerIdByExecutor.
     blockManagerIdByExecutor -= blockManagerId.executorId
     decommissioningBlockManagerSet.remove(blockManagerId)
 
-    // remove all the blocks.
+    // Remove it from blockManagerInfo and remove all the blocks.
+    blockManagerInfo.remove(blockManagerId)
+
     val iterator = info.blocks.keySet.iterator
     while (iterator.hasNext) {
       val blockId = iterator.next
@@ -384,7 +364,7 @@ class BlockManagerMasterEndpoint(
       val i = (new Random(blockId.hashCode)).nextInt(locations.size)
       val blockLocations = locations.toSeq
       val candidateBMId = blockLocations(i)
-      aliveBlockManagerInfo(candidateBMId).foreach { bm =>
+      blockManagerInfo.get(candidateBMId).foreach { bm =>
         val remainingLocations = locations.toSeq.filter(bm => bm != candidateBMId)
         val replicateMsg = ReplicateBlock(blockId, remainingLocations, maxReplicas)
         bm.storageEndpoint.ask[Boolean](replicateMsg)
@@ -420,16 +400,16 @@ class BlockManagerMasterEndpoint(
    */
   private def getReplicateInfoForRDDBlocks(blockManagerId: BlockManagerId): Seq[ReplicateBlock] = {
     try {
-      aliveBlockManagerInfo(blockManagerId).map { info =>
-        val rddBlocks = info.blocks.keySet().asScala.filter(_.isRDD)
-        rddBlocks.map { blockId =>
-          val currentBlockLocations = blockLocations.get(blockId)
-          val maxReplicas = currentBlockLocations.size + 1
-          val remainingLocations = currentBlockLocations.toSeq.filter(bm => bm != blockManagerId)
-          val replicateMsg = ReplicateBlock(blockId, remainingLocations, maxReplicas)
-          replicateMsg
-        }.toSeq
-      }.getOrElse(Seq.empty[ReplicateBlock])
+      val info = blockManagerInfo(blockManagerId)
+
+      val rddBlocks = info.blocks.keySet().asScala.filter(_.isRDD)
+      rddBlocks.map { blockId =>
+        val currentBlockLocations = blockLocations.get(blockId)
+        val maxReplicas = currentBlockLocations.size + 1
+        val remainingLocations = currentBlockLocations.toSeq.filter(bm => bm != blockManagerId)
+        val replicateMsg = ReplicateBlock(blockId, remainingLocations, maxReplicas)
+        replicateMsg
+      }.toSeq
     } catch {
       // If the block manager has already exited, nothing to replicate.
       case e: java.util.NoSuchElementException =>
@@ -443,7 +423,8 @@ class BlockManagerMasterEndpoint(
     val locations = blockLocations.get(blockId)
     if (locations != null) {
       locations.foreach { blockManagerId: BlockManagerId =>
-        aliveBlockManagerInfo(blockManagerId).foreach { bm =>
+        val blockManager = blockManagerInfo.get(blockManagerId)
+        blockManager.foreach { bm =>
           // Remove the block from the BlockManager.
           // Doesn't actually wait for a confirmation and the message might get lost.
           // If message loss becomes frequent, we should add retry logic here.
@@ -458,14 +439,14 @@ class BlockManagerMasterEndpoint(
 
   // Return a map from the block manager id to max memory and remaining memory.
   private def memoryStatus: Map[BlockManagerId, (Long, Long)] = {
-    allAliveBlockManagerInfos.map { info =>
-      (info.blockManagerId, (info.maxMem, info.remainingMem))
+    blockManagerInfo.map { case(blockManagerId, info) =>
+      (blockManagerId, (info.maxMem, info.remainingMem))
     }.toMap
   }
 
   private def storageStatus: Array[StorageStatus] = {
-    allAliveBlockManagerInfos.map { info =>
-      new StorageStatus(info.blockManagerId, info.maxMem, Some(info.maxOnHeapMem),
+    blockManagerInfo.map { case (blockManagerId, info) =>
+      new StorageStatus(blockManagerId, info.maxMem, Some(info.maxOnHeapMem),
         Some(info.maxOffHeapMem), info.blocks.asScala)
     }.toArray
   }
@@ -487,7 +468,7 @@ class BlockManagerMasterEndpoint(
      * Futures to avoid potential deadlocks. This can arise if there exists a block manager
      * that is also waiting for this master endpoint's response to a previous message.
      */
-    allAliveBlockManagerInfos.map { info =>
+    blockManagerInfo.values.map { info =>
       val blockStatusFuture =
         if (askStorageEndpoints) {
           info.storageEndpoint.ask[Option[BlockStatus]](getBlockStatus)
@@ -511,7 +492,7 @@ class BlockManagerMasterEndpoint(
       askStorageEndpoints: Boolean): Future[Seq[BlockId]] = {
     val getMatchingBlockIds = GetMatchingBlockIds(filter)
     Future.sequence(
-      allAliveBlockManagerInfos.map { info =>
+      blockManagerInfo.values.map { info =>
         val future =
           if (askStorageEndpoints) {
             info.storageEndpoint.ask[Seq[BlockId]](getMatchingBlockIds)
@@ -581,10 +562,9 @@ class BlockManagerMasterEndpoint(
       if (pushBasedShuffleEnabled) {
         addMergerLocation(id)
       }
-
-      listenerBus.post(SparkListenerBlockManagerAdded(time, id,
-        maxOnHeapMemSize + maxOffHeapMemSize, Some(maxOnHeapMemSize), Some(maxOffHeapMemSize)))
     }
+    listenerBus.post(SparkListenerBlockManagerAdded(time, id, maxOnHeapMemSize + maxOffHeapMemSize,
+      Some(maxOnHeapMemSize), Some(maxOffHeapMemSize)))
     id
   }
 
@@ -672,7 +652,7 @@ class BlockManagerMasterEndpoint(
       if (externalShuffleServiceRddFetchEnabled && bmId.port == externalShuffleServicePort) {
         blockStatusByShuffleService.get(bmId).flatMap(m => m.get(blockId))
       } else {
-        aliveBlockManagerInfo(bmId).flatMap(_.getStatus(blockId))
+        blockManagerInfo.get(bmId).flatMap(_.getStatus(blockId))
       }
     }
 
@@ -683,7 +663,8 @@ class BlockManagerMasterEndpoint(
         // can be used to access this block even when the original executor is already stopped.
         loc.host == requesterHost &&
           (loc.port == externalShuffleServicePort ||
-            aliveBlockManagerInfo(loc)
+            blockManagerInfo
+              .get(loc)
               .flatMap(_.getStatus(blockId).map(_.storageLevel.useDisk))
               .getOrElse(false))
       }.flatMap { bmId => Option(executorIdToLocalDirs.getIfPresent(bmId.executorId)) }
@@ -700,7 +681,7 @@ class BlockManagerMasterEndpoint(
 
   /** Get the list of the peers of the given block manager */
   private def getPeers(blockManagerId: BlockManagerId): Seq[BlockManagerId] = {
-    val blockManagerIds = allAliveBlockManagerInfos.map(_.blockManagerId).toSet
+    val blockManagerIds = blockManagerInfo.keySet
     if (blockManagerIds.contains(blockManagerId)) {
       blockManagerIds
         .filterNot { _.isDriver }
@@ -753,7 +734,7 @@ class BlockManagerMasterEndpoint(
   private def getExecutorEndpointRef(executorId: String): Option[RpcEndpointRef] = {
     for (
       blockManagerId <- blockManagerIdByExecutor.get(executorId);
-      info <- aliveBlockManagerInfo(blockManagerId)
+      info <- blockManagerInfo.get(blockManagerId)
     ) yield {
       info.storageEndpoint
     }
@@ -761,27 +742,7 @@ class BlockManagerMasterEndpoint(
 
   override def onStop(): Unit = {
     askThreadPool.shutdownNow()
-    blockManagerInfoCleaner.shutdownNow()
   }
 
-  private def cleanBlockManagerInfo(): Unit = {
-    logDebug("Cleaning blockManagerInfo")
-    val now = System.currentTimeMillis()
-    val expiredBmIds = blockManagerInfo.filter { case (_, bmInfo) =>
-      // bmInfo.executorRemovalTs.get cannot be None when BM is not alive
-      !bmInfo.isAlive && (now - bmInfo.executorRemovalTs.get) > executorTimeoutMs
-    }.keys
-    expiredBmIds.foreach { bmId =>
-      logInfo(s"Cleaning expired $bmId from blockManagerInfo")
-      blockManagerInfo.remove(bmId)
-    }
-  }
-
-  @inline private def aliveBlockManagerInfo(bmId: BlockManagerId): Option[BlockManagerInfo] =
-    blockManagerInfo.get(bmId).filter(_.isAlive)
-
-  @inline private def allAliveBlockManagerInfos: Iterable[BlockManagerInfo] =
-    blockManagerInfo.values.filter(_.isAlive)
-
 }
 
 @DeveloperApi
@@ -834,7 +795,6 @@ private[spark] class BlockManagerInfo(
 
   private var _lastSeenMs: Long = timeMs
   private var _remainingMem: Long = maxMem
-  private var _executorRemovalTs: Option[Long] = None
 
   // Mapping from block id to its status.
   private val _blocks = new JHashMap[BlockId, BlockStatus]
@@ -949,16 +909,4 @@ private[spark] class BlockManagerInfo(
   def clear(): Unit = {
     _blocks.clear()
   }
-
-  def executorRemovalTs: Option[Long] = _executorRemovalTs
-
-  def isAlive: Boolean = _executorRemovalTs.isEmpty
-
-  def setExecutorRemovalTs(): Unit = {
-    if (!isAlive) {
-      logWarning(s"executorRemovalTs is already set to ${_executorRemovalTs.get}")
-    } else {
-      _executorRemovalTs = Some(System.currentTimeMillis())
-    }
-  }
 }
@@ -3090,13 +3090,6 @@ private[spark] object Utils extends Logging {
     }
   }
 
-  def executorTimeoutMs(conf: SparkConf): Long = {
-    // "spark.network.timeout" uses "seconds", while `spark.storage.blockManagerSlaveTimeoutMs` uses
-    // "milliseconds"
-    conf.get(config.STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT)
-      .getOrElse(Utils.timeStringAsMs(s"${conf.get(Network.NETWORK_TIMEOUT)}s"))
-  }
-
   /** Returns a string message about delegation token generation failure */
   def createFailedToGetTokenMessage(serviceName: String, e: scala.Throwable): String = {
     val message = "Failed to get token from service %s due to %s. " +
@@ -26,7 +26,7 @@ import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.{Future, TimeoutException}
 import scala.concurrent.duration._
-import scala.language.{implicitConversions, postfixOps}
+import scala.language.implicitConversions
 import scala.reflect.ClassTag
 
 import org.apache.commons.lang3.RandomUtils
@@ -101,7 +101,6 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     .set(STORAGE_UNROLL_MEMORY_THRESHOLD, 512L)
     .set(Network.RPC_ASK_TIMEOUT, "5s")
     .set(PUSH_BASED_SHUFFLE_ENABLED, true)
-    .set(STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT.key, "5s")
   }
 
   private def makeSortShuffleManager(conf: Option[SparkConf] = None): SortShuffleManager = {
@@ -611,7 +610,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
       mc.eq(StorageLevel.NONE), mc.anyInt(), mc.anyInt())
   }
 
-  test("no reregistration on heart beat until executor timeout") {
+  test("reregistration on heart beat") {
     val store = makeBlockManager(2000)
     val a1 = new Array[Byte](400)
 
@@ -622,15 +621,10 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
 
     master.removeExecutor(store.blockManagerId.executorId)
     assert(master.getLocations("a1").size == 0, "a1 was not removed from master")
 
     val reregister = !master.driverHeartbeatEndPoint.askSync[Boolean](
       BlockManagerHeartbeat(store.blockManagerId))
-    assert(reregister == false, "master told to re-register")
-
-    eventually(timeout(10 seconds), interval(1 seconds)) {
-      val reregister = !master.driverHeartbeatEndPoint.askSync[Boolean](
-        BlockManagerHeartbeat(store.blockManagerId))
-      assert(reregister, "master did not tell to re-register")
-    }
+    assert(reregister)
   }
 
   test("reregistration on block update") {
@@ -644,12 +638,6 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     master.removeExecutor(store.blockManagerId.executorId)
    assert(master.getLocations("a1").size == 0, "a1 was not removed from master")
 
-    eventually(timeout(10 seconds), interval(1 seconds)) {
-      val reregister = !master.driverHeartbeatEndPoint.askSync[Boolean](
-        BlockManagerHeartbeat(store.blockManagerId))
-      assert(reregister, "master did not tell to re-register")
-    }
-
     store.putSingle("a2", a2, StorageLevel.MEMORY_ONLY)
     store.waitForAsyncReregister()
 