[SPARK-28839][CORE] Avoids NPE in context cleaner when dynamic allocation and shuffle service are on
### What changes were proposed in this pull request? This PR proposes to avoid throwing an NPE in the context cleaner when the shuffle service is on - it is kind of a small followup of https://github.com/apache/spark/pull/24817 Seems like it sets `null` for `shuffleIds` to track when the service is on. Later, `removeShuffle` tries to remove an element at `shuffleIds`, which leads to an NPE. This PR fixes it by explicitly not sending the event (`ShuffleCleanedEvent`) in this case. See the code path below:cbad616d4c/core/src/main/scala/org/apache/spark/SparkContext.scala (L584)
cbad616d4c/core/src/main/scala/org/apache/spark/ContextCleaner.scala (L125)
cbad616d4c/core/src/main/scala/org/apache/spark/ContextCleaner.scala (L190)
cbad616d4c/core/src/main/scala/org/apache/spark/ContextCleaner.scala (L220-L230)
cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L353-L357)
cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L347)
cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L400-L406)
cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L475)
cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L427)
### Why are the changes needed? This is a bug fix. ### Does this PR introduce any user-facing change? It prevents the exception: ``` 19/08/21 06:44:01 ERROR AsyncEventQueue: Listener ExecutorMonitor threw an exception java.lang.NullPointerException at org.apache.spark.scheduler.dynalloc.ExecutorMonitor$Tracker.removeShuffle(ExecutorMonitor.scala:479) at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.$anonfun$cleanupShuffle$2(ExecutorMonitor.scala:408) at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.$anonfun$cleanupShuffle$2$adapted(ExecutorMonitor.scala:407) at scala.collection.Iterator.foreach(Iterator.scala:941) at scala.collection.Iterator.foreach$(Iterator.scala:941) at scala.collection.AbstractIterator.foreach(Iterator.scala:1429) at scala.collection.IterableLike.foreach(IterableLike.scala:74) at scala.collection.IterableLike.foreach$(IterableLike.scala:73) at scala.collection.AbstractIterable.foreach(Iterable.scala:56) at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.cleanupShuffle(ExecutorMonitor.scala:407) at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.onOtherEvent(ExecutorMonitor.sc ``` ### How was this patch tested? A unit test was added. Closes #25551 from HyukjinKwon/SPARK-28839. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: Marcelo Vanzin <vanzin@cloudera.com>
This commit is contained in:
parent
07c4b9bd1f
commit
d25cbd44ee
|
@ -355,9 +355,12 @@ private[spark] class ExecutorMonitor(
|
|||
override def rddCleaned(rddId: Int): Unit = { }
|
||||
|
||||
override def shuffleCleaned(shuffleId: Int): Unit = {
|
||||
// Because this is called in a completely separate thread, we post a custom event to the
|
||||
// listener bus so that the internal state is safely updated.
|
||||
listenerBus.post(ShuffleCleanedEvent(shuffleId))
|
||||
// Only post the event if tracking is enabled
|
||||
if (shuffleTrackingEnabled) {
|
||||
// Because this is called in a completely separate thread, we post a custom event to the
|
||||
// listener bus so that the internal state is safely updated.
|
||||
listenerBus.post(ShuffleCleanedEvent(shuffleId))
|
||||
}
|
||||
}
|
||||
|
||||
override def broadcastCleaned(broadcastId: Long): Unit = { }
|
||||
|
|
|
@ -333,6 +333,19 @@ class ExecutorMonitorSuite extends SparkFunSuite {
|
|||
assert(monitor.timedOutExecutors(idleDeadline) === Seq("1"))
|
||||
}
|
||||
|
||||
|
||||
test("SPARK-28839: Avoids NPE in context cleaner when shuffle service is on") {
|
||||
val bus = mockListenerBus()
|
||||
conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, true)
|
||||
monitor = new ExecutorMonitor(conf, client, bus, clock) {
|
||||
override def onOtherEvent(event: SparkListenerEvent): Unit = {
|
||||
throw new IllegalStateException("No event should be sent.")
|
||||
}
|
||||
}
|
||||
monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null))
|
||||
monitor.shuffleCleaned(0)
|
||||
}
|
||||
|
||||
test("shuffle tracking with multiple executors and concurrent jobs") {
|
||||
val bus = mockListenerBus()
|
||||
conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, false)
|
||||
|
|
Loading…
Reference in a new issue