Merge branch 'master' of https://github.com/mesos/spark into bootstrap-update
commit 5dae283996
@@ -62,43 +62,31 @@
      <groupId>org.spark-project</groupId>
      <artifactId>spark-core</artifactId>
      <classifier>${classifier.name}</classifier>
      <version>0.8.0-SNAPSHOT</version>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-bagel</artifactId>
      <classifier>${classifier.name}</classifier>
      <version>0.8.0-SNAPSHOT</version>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-examples</artifactId>
      <artifactId>spark-mllib</artifactId>
      <classifier>${classifier.name}</classifier>
      <version>0.8.0-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-examples</artifactId>
      <classifier>javadoc</classifier>
      <version>0.8.0-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-examples</artifactId>
      <classifier>sources</classifier>
      <version>0.8.0-SNAPSHOT</version>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-repl</artifactId>
      <classifier>${classifier.name}</classifier>
      <version>0.8.0-SNAPSHOT</version>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>spark-streaming</artifactId>
      <classifier>${classifier.name}</classifier>
      <version>0.8.0-SNAPSHOT</version>
      <version>${project.version}</version>
    </dependency>
  </dependencies>
</project>
@@ -49,7 +49,7 @@
        <include>org.spark-project:*:jar</include>
      </includes>
      <excludes>
        <exclude>org.spark-project:spark-dist:jar</exclude>
        <exclude>org.spark-project:spark-assembly:jar</exclude>
      </excludes>
    </dependencySet>
    <dependencySet>
@@ -25,17 +25,25 @@ import akka.dispatch.Await
import akka.pattern.ask
import akka.util.duration._

import net.liftweb.json.JsonAST.JValue

import spark.Utils
import spark.deploy.DeployWebUI
import spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState}
import spark.deploy.JsonProtocol
import spark.deploy.master.{ApplicationInfo, WorkerInfo}
import spark.ui.UIUtils


private[spark] class IndexPage(parent: MasterWebUI) {
  val master = parent.master
  implicit val timeout = parent.timeout

  def renderJson(request: HttpServletRequest): JValue = {
    val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse]
    val state = Await.result(stateFuture, 30 seconds)
    JsonProtocol.writeMasterState(state)
  }

  /** Index view listing applications and executors */
  def render(request: HttpServletRequest): Seq[Node] = {
    val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse]
@@ -61,6 +61,7 @@ class MasterWebUI(val master: ActorRef, requestedPort: Int) extends Logging {
    ("/static", createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR)),
    ("/app/json", (request: HttpServletRequest) => applicationPage.renderJson(request)),
    ("/app", (request: HttpServletRequest) => applicationPage.render(request)),
    ("/json", (request: HttpServletRequest) => indexPage.renderJson(request)),
    ("*", (request: HttpServletRequest) => indexPage.render(request))
  )
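With the "/json" and "/app/json" handlers registered, the master's state can be pulled as JSON over plain HTTP. A minimal sketch; the host and port (localhost:8080) are placeholders for wherever the master web UI is actually bound:

import scala.io.Source

object MasterStateFetcher {
  def main(args: Array[String]) {
    // "/json" is the route added above; host and port are assumptions about the deployment.
    val json = Source.fromURL("http://localhost:8080/json").mkString
    println(json)  // the raw output of JsonProtocol.writeMasterState
  }
}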
@@ -85,7 +85,7 @@ private[spark] class ClusterTaskSetManager(sched: ClusterScheduler, val taskSet:
  val CPUS_PER_TASK = System.getProperty("spark.task.cpus", "1").toDouble

  // Maximum times a task is allowed to fail before failing the job
  val MAX_TASK_FAILURES = 4
  val MAX_TASK_FAILURES = System.getProperty("spark.task.maxFailures", "4").toInt

  // Quantile of tasks at which to start speculation
  val SPECULATION_QUANTILE = System.getProperty("spark.speculation.quantile", "0.75").toDouble
@@ -74,7 +74,6 @@ private[spark] object UIUtils {
          </ul>
          <ul id="infolist">
            <li>Application: <strong>{sc.appName}</strong></li>
            <li>Master: <strong>{sc.master}</strong></li>
            <li>Executors: <strong>{sc.getExecutorStorageStatus.size}</strong></li>
          </ul>
        </div>
@@ -117,9 +116,9 @@ private[spark] object UIUtils {
            <img src="/static/spark_logo.png" />
          </div>
          <div class="span10">
            <h1 style="vertical-align: bottom; margin-top: 40px; display: inline-block;">
            <h3 style="vertical-align: bottom; margin-top: 40px; display: inline-block;">
              {title}
            </h1>
            </h3>
          </div>
        </div>
        {content}
@@ -310,6 +310,14 @@ Apart from these, the following properties are also available, and may be useful
    Duration (milliseconds) of how long to batch new objects coming from network receivers.
  </td>
</tr>
<tr>
  <td>spark.task.maxFailures</td>
  <td>4</td>
  <td>
    Number of individual task failures before giving up on the job.
    Should be greater than or equal to 1. Number of allowed retries = this value - 1.
  </td>
</tr>

</table>
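Like the other properties in this table, spark.task.maxFailures is read through a Java system property, so it has to be set before the SparkContext is created. A minimal sketch; the master URL and application name are placeholders:

import spark.SparkContext

object MaxFailuresExample {
  def main(args: Array[String]) {
    // Allow each task up to 8 failures (7 retries) before the job is given up on.
    System.setProperty("spark.task.maxFailures", "8")
    val sc = new SparkContext("spark://master:7077", "MaxFailuresExample")
    // ClusterTaskSetManager reads the value via System.getProperty, as shown in the hunk above.
  }
}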
@@ -315,14 +315,15 @@ object KMeans {
  }

  def main(args: Array[String]) {
    if (args.length != 4) {
      println("Usage: KMeans <master> <input_file> <k> <max_iterations>")
    if (args.length < 4) {
      println("Usage: KMeans <master> <input_file> <k> <max_iterations> [<runs>]")
      System.exit(1)
    }
    val (master, inputFile, k, iters) = (args(0), args(1), args(2).toInt, args(3).toInt)
    val runs = if (args.length >= 5) args(4).toInt else 1
    val sc = new SparkContext(master, "KMeans")
    val data = sc.textFile(inputFile).map(line => line.split(' ').map(_.toDouble))
    val model = KMeans.train(data, k, iters)
    val data = sc.textFile(inputFile).map(line => line.split(' ').map(_.toDouble)).cache()
    val model = KMeans.train(data, k, iters, runs)
    val cost = model.computeCost(data)
    println("Cluster centers:")
    for (c <- model.clusterCenters) {
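The new optional runs argument can also be passed when calling KMeans.train from user code. A minimal sketch, assuming KMeans lives in spark.mllib.clustering and that kmeans_data.txt is a hypothetical whitespace-separated numeric input file:

import spark.SparkContext
import spark.mllib.clustering.KMeans

object KMeansRunsExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "KMeansRunsExample")
    // Cache the parsed points, as the updated example does; KMeans makes several passes over them.
    val data = sc.textFile("kmeans_data.txt")
      .map(line => line.split(' ').map(_.toDouble))
      .cache()
    // k = 2 clusters, at most 20 iterations, 5 independent runs.
    val model = KMeans.train(data, 2, 20, 5)
    println("Within-set sum of squared errors: " + model.computeCost(data))
    sc.stop()
  }
}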
@@ -0,0 +1,80 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package spark.mllib.util

import scala.util.Random

import spark.{RDD, SparkContext}

object KMeansDataGenerator {

  /**
   * Generate an RDD containing test data for KMeans. This function chooses k cluster centers
   * from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
   * cluster with scale 1 around each center.
   *
   * @param sc SparkContext to use for creating the RDD
   * @param numPoints Number of points that will be contained in the RDD
   * @param k Number of clusters
   * @param d Number of dimensions
   * @param r Scaling factor for the distribution of the initial centers
   * @param numPartitions Number of partitions of the generated RDD; default 2
   */
  def generateKMeansRDD(
      sc: SparkContext,
      numPoints: Int,
      k: Int,
      d: Int,
      r: Double,
      numPartitions: Int = 2)
    : RDD[Array[Double]] =
  {
    // First, generate some centers
    val rand = new Random(42)
    val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r))
    // Then generate points around each center
    sc.parallelize(0 until numPoints, numPartitions).map { idx =>
      val center = centers(idx % k)
      val rand2 = new Random(42 + idx)
      Array.tabulate(d)(i => center(i) + rand2.nextGaussian())
    }
  }

  def main(args: Array[String]) {
    if (args.length < 6) {
      println("Usage: KMeansGenerator " +
        "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]")
      System.exit(1)
    }

    val sparkMaster = args(0)
    val outputPath = args(1)
    val numPoints = args(2).toInt
    val k = args(3).toInt
    val d = args(4).toInt
    val r = args(5).toDouble
    val parts = if (args.length >= 7) args(6).toInt else 2

    val sc = new SparkContext(sparkMaster, "KMeansDataGenerator")
    val data = generateKMeansRDD(sc, numPoints, k, d, r, parts)
    data.map(_.mkString(" ")).saveAsTextFile(outputPath)

    System.exit(0)
  }
}
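For quick experiments the generator can also be used in-process instead of writing a text file first. A minimal sketch, again assuming KMeans is available as spark.mllib.clustering.KMeans:

import spark.SparkContext
import spark.mllib.clustering.KMeans
import spark.mllib.util.KMeansDataGenerator

object GeneratedKMeansExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "GeneratedKMeansExample")
    // 10,000 points around 5 centers in 3 dimensions, center spread r = 4.0.
    val points = KMeansDataGenerator.generateKMeansRDD(sc, 10000, 5, 3, 4.0).cache()
    // Try to recover the 5 clusters (20 iterations, single run).
    val model = KMeans.train(points, 5, 20)
    model.clusterCenters.foreach(center => println(center.mkString(" ")))
    sc.stop()
  }
}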