[SPARK-26152] Synchronize Worker Cleanup with Worker Shutdown
## What changes were proposed in this pull request? There is a race between the org.apache.spark.deploy.DeployMessages.WorkDirCleanup event and org.apache.spark.deploy.worker.Worker#onStop. It is possible that while the WorkDirCleanup event is being processed, org.apache.spark.deploy.worker.Worker#cleanupThreadExecutor has been shut down; hence any subsequent task submission to that ThreadPoolExecutor will result in a java.util.concurrent.RejectedExecutionException. ## How was this patch tested? Manually. Closes #24056 from ajithme/workercleanup. Authored-by: Ajith <ajith2489@gmail.com> Signed-off-by: Sean Owen <sean.owen@databricks.com>
This commit is contained in:
parent
bacffb8810
commit
2a04de52dd
|
@ -450,7 +450,8 @@ private[deploy] class Worker(
|
||||||
// rpcEndpoint.
|
// rpcEndpoint.
|
||||||
// Copy ids so that it can be used in the cleanup thread.
|
// Copy ids so that it can be used in the cleanup thread.
|
||||||
val appIds = (executors.values.map(_.appId) ++ drivers.values.map(_.driverId)).toSet
|
val appIds = (executors.values.map(_.appId) ++ drivers.values.map(_.driverId)).toSet
|
||||||
val cleanupFuture = concurrent.Future {
|
try {
|
||||||
|
val cleanupFuture: concurrent.Future[Unit] = concurrent.Future {
|
||||||
val appDirs = workDir.listFiles()
|
val appDirs = workDir.listFiles()
|
||||||
if (appDirs == null) {
|
if (appDirs == null) {
|
||||||
throw new IOException("ERROR: Failed to list files in " + appDirs)
|
throw new IOException("ERROR: Failed to list files in " + appDirs)
|
||||||
|
@ -471,6 +472,10 @@ private[deploy] class Worker(
|
||||||
cleanupFuture.failed.foreach(e =>
|
cleanupFuture.failed.foreach(e =>
|
||||||
logError("App dir cleanup failed: " + e.getMessage, e)
|
logError("App dir cleanup failed: " + e.getMessage, e)
|
||||||
)(cleanupThreadExecutor)
|
)(cleanupThreadExecutor)
|
||||||
|
} catch {
|
||||||
|
case _: RejectedExecutionException if cleanupThreadExecutor.isShutdown =>
|
||||||
|
logWarning("Failed to cleanup work dir as executor pool was shutdown")
|
||||||
|
}
|
||||||
|
|
||||||
case MasterChanged(masterRef, masterWebUiUrl) =>
|
case MasterChanged(masterRef, masterWebUiUrl) =>
|
||||||
logInfo("Master has changed, new master is at " + masterRef.address.toSparkURL)
|
logInfo("Master has changed, new master is at " + masterRef.address.toSparkURL)
|
||||||
|
@ -634,6 +639,7 @@ private[deploy] class Worker(
|
||||||
val shouldCleanup = finishedApps.contains(id) && !executors.values.exists(_.appId == id)
|
val shouldCleanup = finishedApps.contains(id) && !executors.values.exists(_.appId == id)
|
||||||
if (shouldCleanup) {
|
if (shouldCleanup) {
|
||||||
finishedApps -= id
|
finishedApps -= id
|
||||||
|
try {
|
||||||
appDirectories.remove(id).foreach { dirList =>
|
appDirectories.remove(id).foreach { dirList =>
|
||||||
concurrent.Future {
|
concurrent.Future {
|
||||||
logInfo(s"Cleaning up local directories for application $id")
|
logInfo(s"Cleaning up local directories for application $id")
|
||||||
|
@ -644,6 +650,10 @@ private[deploy] class Worker(
|
||||||
logError(s"Clean up app dir $dirList failed: ${e.getMessage}", e)
|
logError(s"Clean up app dir $dirList failed: ${e.getMessage}", e)
|
||||||
)(cleanupThreadExecutor)
|
)(cleanupThreadExecutor)
|
||||||
}
|
}
|
||||||
|
} catch {
|
||||||
|
case _: RejectedExecutionException if cleanupThreadExecutor.isShutdown =>
|
||||||
|
logWarning("Failed to cleanup application as executor pool was shutdown")
|
||||||
|
}
|
||||||
shuffleService.applicationRemoved(id)
|
shuffleService.applicationRemoved(id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue