[SPARK-28839][CORE] Avoids NPE in context cleaner when dynamic allocation and shuffle service are on

### What changes were proposed in this pull request?

This PR proposes to avoid throwing an NPE in the context cleaner when the shuffle service is on. It is a small follow-up of https://github.com/apache/spark/pull/24817

That change sets `shuffleIds` to `null` when the shuffle service is on, since shuffle state does not need to be tracked in that case. Later, `removeShuffle` tries to remove an element from `shuffleIds`, which leads to an NPE. This PR fixes it by explicitly not posting the event (`ShuffleCleanedEvent`) in this case.
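
To illustrate the root cause, here is a minimal, hypothetical sketch of the pattern (simplified names and types; the real state lives in `ExecutorMonitor.Tracker`, not in this exact shape):

```scala
import scala.collection.mutable

// Hypothetical, simplified model of the per-executor tracking state.
// Mirrors the pattern described above: when shuffle state is not tracked
// (external shuffle service on), the ID set is left as null.
class Tracker(shuffleTrackingEnabled: Boolean) {
  val shuffleIds: mutable.HashSet[Int] =
    if (shuffleTrackingEnabled) new mutable.HashSet[Int]() else null

  def removeShuffle(shuffleId: Int): Unit = {
    // Dereferences shuffleIds unconditionally; with tracking disabled this
    // is a null dereference - the NPE this PR avoids by never letting the
    // cleanup event reach this code path.
    shuffleIds -= shuffleId
  }
}
```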

See the code path below:

- cbad616d4c/core/src/main/scala/org/apache/spark/SparkContext.scala (L584)
- cbad616d4c/core/src/main/scala/org/apache/spark/ContextCleaner.scala (L125)
- cbad616d4c/core/src/main/scala/org/apache/spark/ContextCleaner.scala (L190)
- cbad616d4c/core/src/main/scala/org/apache/spark/ContextCleaner.scala (L220-L230)
- cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L353-L357)
- cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L347)
- cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L400-L406)
- cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L475)
- cbad616d4c/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (L427)
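
Condensed: the cleaner thread calls `shuffleCleaned`, which posts `ShuffleCleanedEvent` to the listener bus; the event thread then calls `Tracker.removeShuffle`, which dereferences the null `shuffleIds`. Below is a toy reproduction of that thread hop, using made-up names (`CleanupFlowSketch`, the local `shuffleCleaned`) rather than Spark's API:

```scala
import java.util.concurrent.{Executors, TimeUnit}
import scala.collection.mutable

// Toy reproduction of the cross-thread flow; the real code posts a
// ShuffleCleanedEvent to Spark's listener bus instead of a raw Runnable.
object CleanupFlowSketch {
  def main(args: Array[String]): Unit = {
    val shuffleTrackingEnabled = false // shuffle service on => no tracking
    val shuffleIds: mutable.HashSet[Int] =
      if (shuffleTrackingEnabled) new mutable.HashSet[Int]() else null

    val eventThread = Executors.newSingleThreadExecutor()

    // Stand-in for ExecutorMonitor.shuffleCleaned: hop to the event thread.
    def shuffleCleaned(shuffleId: Int): Unit = {
      // The fix: only post when tracking is enabled, so the null set is
      // never touched. Dropping this guard reproduces the NPE.
      if (shuffleTrackingEnabled) {
        eventThread.submit(new Runnable {
          override def run(): Unit = shuffleIds -= shuffleId
        })
      }
    }

    shuffleCleaned(0) // no-op here; would NPE on the event thread without the guard
    eventThread.shutdown()
    eventThread.awaitTermination(1, TimeUnit.SECONDS)
  }
}
```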

### Why are the changes needed?

This is a bug fix.

### Does this PR introduce any user-facing change?

Yes, it prevents the following exception from being thrown:

```
19/08/21 06:44:01 ERROR AsyncEventQueue: Listener ExecutorMonitor threw an exception
java.lang.NullPointerException
	at org.apache.spark.scheduler.dynalloc.ExecutorMonitor$Tracker.removeShuffle(ExecutorMonitor.scala:479)
	at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.$anonfun$cleanupShuffle$2(ExecutorMonitor.scala:408)
	at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.$anonfun$cleanupShuffle$2$adapted(ExecutorMonitor.scala:407)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.cleanupShuffle(ExecutorMonitor.scala:407)
	at org.apache.spark.scheduler.dynalloc.ExecutorMonitor.onOtherEvent(ExecutorMonitor.sc
```

### How was this patch tested?

A unit test was added.

Closes #25551 from HyukjinKwon/SPARK-28839.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: Marcelo Vanzin <vanzin@cloudera.com>

`core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala`:

```diff
@@ -355,9 +355,12 @@ private[spark] class ExecutorMonitor(
   override def rddCleaned(rddId: Int): Unit = { }

   override def shuffleCleaned(shuffleId: Int): Unit = {
-    // Because this is called in a completely separate thread, we post a custom event to the
-    // listener bus so that the internal state is safely updated.
-    listenerBus.post(ShuffleCleanedEvent(shuffleId))
+    // Only post the event if tracking is enabled
+    if (shuffleTrackingEnabled) {
+      // Because this is called in a completely separate thread, we post a custom event to the
+      // listener bus so that the internal state is safely updated.
+      listenerBus.post(ShuffleCleanedEvent(shuffleId))
+    }
   }

   override def broadcastCleaned(broadcastId: Long): Unit = { }
```
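
For context, a standalone restatement of the guard's condition. The exact definition of `shuffleTrackingEnabled` is not shown in this diff, so the predicate below is an assumption based on the two settings the new test toggles:

```scala
object GuardSketch {
  // Paraphrased guard (an assumption based on this diff, not the verbatim
  // Spark definition): shuffle state is tracked only when the external
  // shuffle service is off and shuffle tracking is requested.
  def shuffleTrackingEnabled(shuffleServiceEnabled: Boolean, shuffleTracking: Boolean): Boolean =
    !shuffleServiceEnabled && shuffleTracking

  def main(args: Array[String]): Unit = {
    // The configuration that used to trigger the NPE: both flags on.
    // The guard is false, so ShuffleCleanedEvent is never posted.
    assert(!shuffleTrackingEnabled(shuffleServiceEnabled = true, shuffleTracking = true))
  }
}
```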

`core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala`:

```diff
@@ -333,6 +333,19 @@ class ExecutorMonitorSuite extends SparkFunSuite {
     assert(monitor.timedOutExecutors(idleDeadline) === Seq("1"))
   }

+  test("SPARK-28839: Avoids NPE in context cleaner when shuffle service is on") {
+    val bus = mockListenerBus()
+    conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, true)
+    monitor = new ExecutorMonitor(conf, client, bus, clock) {
+      override def onOtherEvent(event: SparkListenerEvent): Unit = {
+        throw new IllegalStateException("No event should be sent.")
+      }
+    }
+    monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null))
+    monitor.shuffleCleaned(0)
+  }
+
   test("shuffle tracking with multiple executors and concurrent jobs") {
     val bus = mockListenerBus()
     conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, false)
```
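
A note on the test's design: it turns on both `DYN_ALLOCATION_SHUFFLE_TRACKING` and `SHUFFLE_SERVICE_ENABLED` (the combination that used to NPE) and overrides `onOtherEvent` to throw. The suite's `mockListenerBus()` appears to feed posted events straight back into the monitor, so if the fixed `shuffleCleaned` still posted a `ShuffleCleanedEvent`, the `IllegalStateException` would fail the test; with the guard in place, `monitor.shuffleCleaned(0)` is a no-op and the test passes.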