Report errors in tasks to the driver via a Mesos status update

When a task throws an exception, the Spark executor previously just
logged it to a local file on the slave and exited. This commit causes
Spark to also report the exception back to the driver using a Mesos
status update, so the user doesn't have to look through a log file on
the slave.

Here's what the reporting currently looks like:

    # ./run spark.examples.ExceptionHandlingTest master@203.0.113.1:5050
    [...]
    11/10/26 21:04:13 INFO spark.SimpleJob: Lost TID 1 (task 0:1)
    11/10/26 21:04:13 INFO spark.SimpleJob: Loss was due to java.lang.Exception: Testing exception handling
    [...]
    11/10/26 21:04:16 INFO spark.SparkContext: Job finished in 5.988547328 s
This commit is contained in:
Ankur Dave 2011-10-26 21:07:17 +00:00
parent 07532021fe
commit 35b6358a7c
3 changed files with 27 additions and 0 deletions

View file

@ -87,6 +87,13 @@ class Executor extends org.apache.mesos.Executor with Logging {
.build())
}
case t: Throwable => {
val reason = OtherFailure(t.toString())
d.sendStatusUpdate(TaskStatus.newBuilder()
.setTaskId(desc.getTaskId)
.setState(TaskState.TASK_FAILED)
.setData(ByteString.copyFrom(Utils.serialize(reason)))
.build())
// TODO: Handle errors in tasks less dramatically
logError("Exception in task ID " + tid, t)
System.exit(1)

View file

@ -229,6 +229,8 @@ extends Job(jobId) with Logging
if (tasksFinished == numTasks)
sched.jobFinished(this)
return
case otherFailure: OtherFailure =>
logInfo("Loss was due to %s".format(otherFailure.message))
case _ => {}
}
}

View file

@ -0,0 +1,18 @@
package spark.examples
import spark.SparkContext
/** Smoke test for executor failure reporting: launches a parallel job in
  * which roughly a quarter of the tasks deliberately throw, so the resulting
  * task failures are sent back to the driver as Mesos status updates.
  */
object ExceptionHandlingTest {
  def main(args: Array[String]) {
    // Require the Mesos master address as the sole argument.
    if (args.length == 0) {
      System.err.println("Usage: ExceptionHandlingTest <host>")
      System.exit(1)
    }
    val sc = new SparkContext(args(0), "ExceptionHandlingTest")
    // Each element maps to a task; ~25% of them fail on purpose.
    sc.parallelize(0 until sc.defaultParallelism).foreach { _ =>
      if (math.random > 0.75)
        throw new Exception("Testing exception handling")
    }
  }
}