Merge pull request #108 from alig/master

Changes to enable executing Spark through SIMR by using HDFS as a synchronization point between the driver and the executors, as well as ensuring that executors exit properly.
Matei Zaharia 2013-10-25 18:28:43 -07:00
commit bab496c120
7 changed files with 123 additions and 8 deletions
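How the HDFS synchronization works: the new SimrSchedulerBackend below writes the driver's Akka URL (plus an executor core count) into an HDFS file, and the SIMR-launched executors wait for that file to appear before connecting back. The executor-side reader is not part of this commit; the following is only a rough sketch of what it might look like, assuming the writeUTF/writeInt layout used in SimrSchedulerBackend.start() (the object and method names here are made up for illustration):

// Illustrative sketch only (not part of this commit): how a SIMR-launched executor
// might block on the HDFS driver file and recover the driver's connection details.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object SimrDriverFileReader {
  // Returns (driverUrl, maxCores) once the driver has renamed the temp file into place.
  def waitForDriver(driverFilePath: String, pollIntervalMs: Long = 1000L): (String, Int) = {
    val fs = FileSystem.get(new Configuration())
    val path = new Path(driverFilePath)
    while (!fs.exists(path)) {
      Thread.sleep(pollIntervalMs)   // the file only appears after the "atomic" rename
    }
    val in = fs.open(path)
    try {
      val driverUrl = in.readUTF()   // mirrors temp.writeUTF(driverUrl)
      val maxCores  = in.readInt()   // mirrors temp.writeInt(maxCores)
      (driverUrl, maxCores)
    } finally {
      in.close()
    }
  }
}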

File: SparkContext.scala

@@ -56,16 +56,15 @@ import org.apache.spark.partial.{ApproximateEvaluator, PartialResult}
 import org.apache.spark.rdd._
 import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend,
-  SparkDeploySchedulerBackend, ClusterScheduler}
+  SparkDeploySchedulerBackend, ClusterScheduler, SimrSchedulerBackend}
 import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend}
 import org.apache.spark.scheduler.local.LocalScheduler
-import org.apache.spark.scheduler.StageInfo
 import org.apache.spark.storage.{BlockManagerSource, RDDInfo, StorageStatus, StorageUtils}
 import org.apache.spark.ui.SparkUI
 import org.apache.spark.util.{ClosureCleaner, MetadataCleaner, MetadataCleanerType,
   TimeStampedHashMap, Utils}

 /**
  * Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
  * cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
@@ -158,6 +157,8 @@ class SparkContext(
     val SPARK_REGEX = """spark://(.*)""".r
     // Regular expression for connection to Mesos cluster
     val MESOS_REGEX = """mesos://(.*)""".r
+    // Regular expression for connection to Simr cluster
+    val SIMR_REGEX = """simr://(.*)""".r

     master match {
       case "local" =>
@@ -176,6 +177,12 @@ class SparkContext(
         scheduler.initialize(backend)
         scheduler

+      case SIMR_REGEX(simrUrl) =>
+        val scheduler = new ClusterScheduler(this)
+        val backend = new SimrSchedulerBackend(scheduler, this, simrUrl)
+        scheduler.initialize(backend)
+        scheduler
+
       case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
         // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
         val memoryPerSlaveInt = memoryPerSlave.toInt
@@ -680,8 +687,7 @@ class SparkContext(
    */
  def addJar(path: String) {
    if (path == null) {
-     logWarning("null specified as parameter to addJar",
-       new SparkException("null specified as parameter to addJar"))
+     logWarning("null specified as parameter to addJar")
    } else {
      var key = ""
      if (path.contains("\\")) {
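For reference, the new SIMR_REGEX case means a driver selects this backend just by using a simr:// master URL; everything after the scheme is passed to SimrSchedulerBackend as driverFilePath. A hypothetical usage (the HDFS path here is invented):

// Hypothetical usage: "tmp/simr_driver_info" is captured by SIMR_REGEX and becomes
// the HDFS path that SimrSchedulerBackend writes the driver's Akka URL to.
val sc = new SparkContext("simr://tmp/simr_driver_info", "SimrExample")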

File: CoarseGrainedExecutorBackend.scala

@@ -80,6 +80,11 @@ private[spark] class CoarseGrainedExecutorBackend(
     case Terminated(_) | RemoteClientDisconnected(_, _) | RemoteClientShutdown(_, _) =>
       logError("Driver terminated or disconnected! Shutting down.")
       System.exit(1)
+
+    case StopExecutor =>
+      logInfo("Driver commanded a shutdown")
+      context.stop(self)
+      context.system.shutdown()
   }

   override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {

File: CoarseGrainedClusterMessages.scala

@@ -60,6 +60,10 @@ private[spark] object CoarseGrainedClusterMessages {
   case object StopDriver extends CoarseGrainedClusterMessage

+  case object StopExecutor extends CoarseGrainedClusterMessage
+
+  case object StopExecutors extends CoarseGrainedClusterMessage
+
   case class RemoveExecutor(executorId: String, reason: String) extends CoarseGrainedClusterMessage

 }

File: CoarseGrainedSchedulerBackend.scala

@@ -101,6 +101,13 @@ class CoarseGrainedSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Ac
         sender ! true
         context.stop(self)

+      case StopExecutors =>
+        logInfo("Asking each executor to shut down")
+        for (executor <- executorActor.values) {
+          executor ! StopExecutor
+        }
+        sender ! true
+
       case RemoveExecutor(executorId, reason) =>
         removeExecutor(executorId, reason)
         sender ! true
@@ -170,11 +177,24 @@ class CoarseGrainedSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Ac
   private val timeout = Duration.create(System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")

+  def stopExecutors() {
+    try {
+      if (driverActor != null) {
+        logInfo("Shutting down all executors")
+        val future = driverActor.ask(StopExecutors)(timeout)
+        Await.ready(future, timeout)
+      }
+    } catch {
+      case e: Exception =>
+        throw new SparkException("Error asking standalone scheduler to shut down executors", e)
+    }
+  }
+
   override def stop() {
     try {
       if (driverActor != null) {
         val future = driverActor.ask(StopDriver)(timeout)
-        Await.result(future, timeout)
+        Await.ready(future, timeout)
       }
     } catch {
       case e: Exception =>
@@ -197,7 +217,7 @@ class CoarseGrainedSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Ac
   def removeExecutor(executorId: String, reason: String) {
     try {
       val future = driverActor.ask(RemoveExecutor(executorId, reason))(timeout)
-      Await.result(future, timeout)
+      Await.ready(future, timeout)
     } catch {
       case e: Exception =>
         throw new SparkException("Error notifying standalone scheduler's driver actor", e)

File: SimrSchedulerBackend.scala (new)

@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{Path, FileSystem}
+
+import org.apache.spark.{Logging, SparkContext}
+
+private[spark] class SimrSchedulerBackend(
+    scheduler: ClusterScheduler,
+    sc: SparkContext,
+    driverFilePath: String)
+  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
+  with Logging {
+
+  val tmpPath = new Path(driverFilePath + "_tmp")
+  val filePath = new Path(driverFilePath)
+
+  val maxCores = System.getProperty("spark.simr.executor.cores", "1").toInt
+
+  override def start() {
+    super.start()
+
+    val driverUrl = "akka://spark@%s:%s/user/%s".format(
+      System.getProperty("spark.driver.host"), System.getProperty("spark.driver.port"),
+      CoarseGrainedSchedulerBackend.ACTOR_NAME)
+
+    val conf = new Configuration()
+    val fs = FileSystem.get(conf)
+
+    logInfo("Writing to HDFS file: " + driverFilePath)
+    logInfo("Writing Akka address: " + driverUrl)
+
+    // Create temporary file to prevent race condition where executors get empty driverUrl file
+    val temp = fs.create(tmpPath, true)
+    temp.writeUTF(driverUrl)
+    temp.writeInt(maxCores)
+    temp.close()
+
+    // "Atomic" rename
+    fs.rename(tmpPath, filePath)
+  }
+
+  override def stop() {
+    val conf = new Configuration()
+    val fs = FileSystem.get(conf)
+    fs.delete(new Path(driverFilePath), false)
+    super.stopExecutors()
+    super.stop()
+  }
+}
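A note on the temp-file-then-rename pattern in start(): an HDFS rename is a single metadata operation on the NameNode, so executors polling for filePath either see no file at all or see a fully written driver URL and core count, never a partially written file; writing directly to filePath would reintroduce the race condition the in-code comment refers to.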

File: SparkPi.scala

@@ -38,6 +38,6 @@ object SparkPi {
       if (x*x + y*y < 1) 1 else 0
     }.reduce(_ + _)
     println("Pi is roughly " + 4.0 * count / n)
-    System.exit(0)
+    spark.stop()
   }
 }

File: SparkILoop.scala

@@ -633,6 +633,20 @@ class SparkILoop(in0: Option[BufferedReader], val out: PrintWriter, val master:
     Result(true, shouldReplay)
   }

+  def addAllClasspath(args: Seq[String]): Unit = {
+    var added = false
+    var totalClasspath = ""
+    for (arg <- args) {
+      val f = File(arg).normalize
+      if (f.exists) {
+        added = true
+        addedClasspath = ClassPath.join(addedClasspath, f.path)
+        totalClasspath = ClassPath.join(settings.classpath.value, addedClasspath)
+      }
+    }
+    if (added) replay()
+  }
+
   def addClasspath(arg: String): Unit = {
     val f = File(arg).normalize
     if (f.exists) {