Merge branch 'master' into stageInfo

Conflicts:
    core/src/main/scala/spark/scheduler/DAGScheduler.scala
    core/src/main/scala/spark/scheduler/local/LocalScheduler.scala

commit b430d2359d

@@ -45,11 +45,6 @@
  <profiles>
    <profile>
      <id>hadoop1</id>
      <activation>
        <property>
          <name>!hadoopVersion</name>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.spark-project</groupId>
@@ -77,12 +72,6 @@
    </profile>
    <profile>
      <id>hadoop2</id>
      <activation>
        <property>
          <name>hadoopVersion</name>
          <value>2</value>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.spark-project</groupId>

@@ -23,7 +23,7 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter {
      sc = null
    }
    // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
    System.clearProperty("spark.master.port")
    System.clearProperty("spark.driver.port")
  }

  test("halting by voting") {

 core/pom.xml | 16

@@ -98,6 +98,11 @@
      <artifactId>scalacheck_${scala.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.easymock</groupId>
      <artifactId>easymock</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.novocode</groupId>
      <artifactId>junit-interface</artifactId>
@@ -163,11 +168,6 @@
  <profiles>
    <profile>
      <id>hadoop1</id>
      <activation>
        <property>
          <name>!hadoopVersion</name>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
@@ -220,12 +220,6 @@
    </profile>
    <profile>
      <id>hadoop2</id>
      <activation>
        <property>
          <name>hadoopVersion</name>
          <value>2</value>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>

@@ -10,9 +10,9 @@ import spark.storage.{BlockManager, StorageLevel}
private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
  private val loading = new HashSet[String]

  /** Gets or computes an RDD split. Used by RDD.iterator() when a RDD is cached. */
  /** Gets or computes an RDD split. Used by RDD.iterator() when an RDD is cached. */
  def getOrCompute[T](rdd: RDD[T], split: Split, context: TaskContext, storageLevel: StorageLevel)
    : Iterator[T] = {
      : Iterator[T] = {
    val key = "rdd_%d_%d".format(rdd.id, split.index)
    logInfo("Cache key is " + key)
    blockManager.get(key) match {
@@ -50,7 +50,7 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
        // If we got here, we have to load the split
        val elements = new ArrayBuffer[Any]
        logInfo("Computing partition " + split)
        elements ++= rdd.compute(split, context)
        elements ++= rdd.computeOrReadCheckpoint(split, context)
        // Try to put this block in the blockManager
        blockManager.put(key, elements, storageLevel, true)
        return elements.iterator.asInstanceOf[Iterator[T]]

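This cache path is what persist()/cache() ultimately drives: the first action computes each partition (now via computeOrReadCheckpoint) and stores it under a "rdd_<id>_<index>" block key. A rough usage sketch against this snapshot's API (the file name and storage level are illustrative, not from the commit):

    import spark.SparkContext
    import spark.storage.StorageLevel

    val sc = new SparkContext("local", "cache demo")               // assumed local setup
    val lines = sc.textFile("data.txt").persist(StorageLevel.MEMORY_ONLY)
    lines.count()  // first job: CacheManager.getOrCompute computes and stores each split
    lines.count()  // second job: partitions are served from the BlockManager, not recomputed
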
@@ -61,17 +61,3 @@ class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
    }
  }
}


/**
 * Represents a dependency between the PartitionPruningRDD and its parent. In this
 * case, the child RDD contains a subset of partitions of the parents'.
 */
class PruneDependency[T](rdd: RDD[T], @transient partitionFilterFunc: Int => Boolean)
  extends NarrowDependency[T](rdd) {

  @transient
  val partitions: Array[Split] = rdd.splits.filter(s => partitionFilterFunc(s.index))

  override def getParents(partitionId: Int) = List(partitions(partitionId).index)
}

@@ -38,10 +38,7 @@ private[spark] class MapOutputTrackerActor(tracker: MapOutputTracker) extends Ac
  }
}

private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolean) extends Logging {
  val ip: String = System.getProperty("spark.master.host", "localhost")
  val port: Int = System.getProperty("spark.master.port", "7077").toInt
  val actorName: String = "MapOutputTracker"
private[spark] class MapOutputTracker(actorSystem: ActorSystem, isDriver: Boolean) extends Logging {

  val timeout = 10.seconds

@@ -56,11 +53,14 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
  var cacheGeneration = generation
  val cachedSerializedStatuses = new TimeStampedHashMap[Int, Array[Byte]]

  var trackerActor: ActorRef = if (isMaster) {
  val actorName: String = "MapOutputTracker"
  var trackerActor: ActorRef = if (isDriver) {
    val actor = actorSystem.actorOf(Props(new MapOutputTrackerActor(this)), name = actorName)
    logInfo("Registered MapOutputTrackerActor actor")
    actor
  } else {
    val ip = System.getProperty("spark.driver.host", "localhost")
    val port = System.getProperty("spark.driver.port", "7077").toInt
    val url = "akka://spark@%s:%s/user/%s".format(ip, port, actorName)
    actorSystem.actorFor(url)
  }
@@ -170,7 +170,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea
    }
  }

  def cleanup(cleanupTime: Long) {
  private def cleanup(cleanupTime: Long) {
    mapStatuses.clearOldValues(cleanupTime)
    cachedSerializedStatuses.clearOldValues(cleanupTime)
  }

@@ -465,7 +465,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
        val res = self.context.runJob(self, process _, Array(index), false)
        res(0)
      case None =>
        self.filter(_._1 == key).map(_._2).collect
        self.filter(_._1 == key).map(_._2).collect()
    }
  }

@@ -590,7 +590,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](

      var count = 0
      while(iter.hasNext) {
        val record = iter.next
        val record = iter.next()
        count += 1
        writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])
      }
@@ -649,9 +649,7 @@ class OrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest](
}

private[spark]
class MappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => U)
  extends RDD[(K, U)](prev) {

class MappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => U) extends RDD[(K, U)](prev) {
  override def getSplits = firstParent[(K, V)].splits
  override val partitioner = firstParent[(K, V)].partitioner
  override def compute(split: Split, context: TaskContext) =

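MappedValuesRDD keeping firstParent's partitioner is what lets mapValues run after a shuffle without repartitioning. A hedged sketch of the observable difference (the values and job name are illustrative only):

    import spark.SparkContext
    import spark.SparkContext._

    val sc = new SparkContext("local", "partitioner demo")    // assumed local setup
    val grouped = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3))).groupByKey(2)

    val kept    = grouped.mapValues(_.sum)                     // goes through MappedValuesRDD
    val dropped = grouped.map { case (k, vs) => (k, vs.sum) }  // a plain map loses the partitioner

    println(kept.partitioner.isDefined)     // expected: true
    println(dropped.partitioner.isDefined)  // expected: false
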
@ -1,27 +1,17 @@
|
|||
package spark
|
||||
|
||||
import java.io.{ObjectOutputStream, IOException, EOFException, ObjectInputStream}
|
||||
import java.net.URL
|
||||
import java.util.{Date, Random}
|
||||
import java.util.{HashMap => JHashMap}
|
||||
import java.util.concurrent.atomic.AtomicLong
|
||||
|
||||
import scala.collection.Map
|
||||
import scala.collection.JavaConversions.mapAsScalaMap
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
import scala.collection.mutable.HashMap
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hadoop.io.BytesWritable
|
||||
import org.apache.hadoop.io.NullWritable
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.Writable
|
||||
import org.apache.hadoop.mapred.FileOutputCommitter
|
||||
import org.apache.hadoop.mapred.HadoopWriter
|
||||
import org.apache.hadoop.mapred.JobConf
|
||||
import org.apache.hadoop.mapred.OutputCommitter
|
||||
import org.apache.hadoop.mapred.OutputFormat
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.hadoop.mapred.TextOutputFormat
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.{Object2LongOpenHashMap => OLMap}
|
||||
|
@ -30,7 +20,6 @@ import spark.partial.BoundedDouble
|
|||
import spark.partial.CountEvaluator
|
||||
import spark.partial.GroupedCountEvaluator
|
||||
import spark.partial.PartialResult
|
||||
import spark.rdd.BlockRDD
|
||||
import spark.rdd.CartesianRDD
|
||||
import spark.rdd.FilteredRDD
|
||||
import spark.rdd.FlatMappedRDD
|
||||
|
@@ -73,11 +62,11 @@ import SparkContext._
 * on RDD internals.
 */
abstract class RDD[T: ClassManifest](
    @transient var sc: SparkContext,
    var dependencies_ : List[Dependency[_]]
    @transient private var sc: SparkContext,
    @transient private var deps: Seq[Dependency[_]]
  ) extends Serializable with Logging {

  /** Construct an RDD with just a one-to-one dependency on one parent */
  def this(@transient oneParent: RDD[_]) =
    this(oneParent.context , List(new OneToOneDependency(oneParent)))

@@ -85,25 +74,27 @@ abstract class RDD[T: ClassManifest](
  // Methods that should be implemented by subclasses of RDD
  // =======================================================================

  /** Function for computing a given partition. */
  /** Implemented by subclasses to compute a given partition. */
  def compute(split: Split, context: TaskContext): Iterator[T]

  /** Set of partitions in this RDD. */
  protected def getSplits(): Array[Split]
  /**
   * Implemented by subclasses to return the set of partitions in this RDD. This method will only
   * be called once, so it is safe to implement a time-consuming computation in it.
   */
  protected def getSplits: Array[Split]

  /** How this RDD depends on any parent RDDs. */
  protected def getDependencies(): List[Dependency[_]] = dependencies_
  /**
   * Implemented by subclasses to return how this RDD depends on parent RDDs. This method will only
   * be called once, so it is safe to implement a time-consuming computation in it.
   */
  protected def getDependencies: Seq[Dependency[_]] = deps

  /** A friendly name for this RDD */
  var name: String = null

  /** Optionally overridden by subclasses to specify placement preferences. */
  protected def getPreferredLocations(split: Split): Seq[String] = Nil

  /** Optionally overridden by subclasses to specify how they are partitioned. */
  val partitioner: Option[Partitioner] = None

  // =======================================================================
  // Methods and fields available on all RDDs
  // =======================================================================

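To make the subclass contract above concrete, here is a hedged sketch (not code from this commit) of a tiny RDD that serves an in-memory sequence; SeqSplit and SeqRDD are invented names for illustration:

    import spark.{RDD, Split, SparkContext, TaskContext}

    class SeqSplit(val index: Int, val values: Seq[Int]) extends Split

    // getSplits is called once and cached by RDD.splits; compute produces one partition's iterator.
    class SeqRDD(sc: SparkContext, data: Seq[Int], numSlices: Int) extends RDD[Int](sc, Nil) {
      protected def getSplits: Array[Split] = {
        val slices = data.grouped(math.max(1, data.size / numSlices)).toIndexedSeq
        Array.tabulate[Split](slices.size)(i => new SeqSplit(i, slices(i)))
      }
      def compute(split: Split, context: TaskContext): Iterator[Int] =
        split.asInstanceOf[SeqSplit].values.iterator
    }
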
@ -111,13 +102,16 @@ abstract class RDD[T: ClassManifest](
|
|||
/** A unique ID for this RDD (within its SparkContext). */
|
||||
val id = sc.newRddId()
|
||||
|
||||
/** A friendly name for this RDD */
|
||||
var name: String = null
|
||||
|
||||
/** Assign a name to this RDD */
|
||||
def setName(_name: String) = {
|
||||
name = _name
|
||||
this
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Set this RDD's storage level to persist its values across operations after the first time
|
||||
* it is computed. Can only be called once on each RDD.
|
||||
*/
|
||||
|
@ -142,15 +136,24 @@ abstract class RDD[T: ClassManifest](
|
|||
/** Get the RDD's current storage level, or StorageLevel.NONE if none is set. */
|
||||
def getStorageLevel = storageLevel
|
||||
|
||||
// Our dependencies and splits will be gotten by calling subclass's methods below, and will
|
||||
// be overwritten when we're checkpointed
|
||||
private var dependencies_ : Seq[Dependency[_]] = null
|
||||
@transient private var splits_ : Array[Split] = null
|
||||
|
||||
/** An Option holding our checkpoint RDD, if we are checkpointed */
|
||||
private def checkpointRDD: Option[RDD[T]] = checkpointData.flatMap(_.checkpointRDD)
|
||||
|
||||
/**
|
||||
* Get the preferred location of a split, taking into account whether the
|
||||
* Get the list of dependencies of this RDD, taking into account whether the
|
||||
* RDD is checkpointed or not.
|
||||
*/
|
||||
final def preferredLocations(split: Split): Seq[String] = {
|
||||
if (isCheckpointed) {
|
||||
checkpointData.get.getPreferredLocations(split)
|
||||
} else {
|
||||
getPreferredLocations(split)
|
||||
final def dependencies: Seq[Dependency[_]] = {
|
||||
checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse {
|
||||
if (dependencies_ == null) {
|
||||
dependencies_ = getDependencies
|
||||
}
|
||||
dependencies_
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -159,22 +162,21 @@ abstract class RDD[T: ClassManifest](
|
|||
* RDD is checkpointed or not.
|
||||
*/
|
||||
final def splits: Array[Split] = {
|
||||
if (isCheckpointed) {
|
||||
checkpointData.get.getSplits
|
||||
} else {
|
||||
getSplits
|
||||
checkpointRDD.map(_.splits).getOrElse {
|
||||
if (splits_ == null) {
|
||||
splits_ = getSplits
|
||||
}
|
||||
splits_
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of dependencies of this RDD, taking into account whether the
|
||||
* Get the preferred location of a split, taking into account whether the
|
||||
* RDD is checkpointed or not.
|
||||
*/
|
||||
final def dependencies: List[Dependency[_]] = {
|
||||
if (isCheckpointed) {
|
||||
dependencies_
|
||||
} else {
|
||||
getDependencies
|
||||
final def preferredLocations(split: Split): Seq[String] = {
|
||||
checkpointRDD.map(_.getPreferredLocations(split)).getOrElse {
|
||||
getPreferredLocations(split)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -184,10 +186,19 @@ abstract class RDD[T: ClassManifest](
|
|||
* subclasses of RDD.
|
||||
*/
|
||||
final def iterator(split: Split, context: TaskContext): Iterator[T] = {
|
||||
if (isCheckpointed) {
|
||||
checkpointData.get.iterator(split, context)
|
||||
} else if (storageLevel != StorageLevel.NONE) {
|
||||
if (storageLevel != StorageLevel.NONE) {
|
||||
SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
|
||||
} else {
|
||||
computeOrReadCheckpoint(split, context)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute an RDD partition or read it from a checkpoint if the RDD is checkpointing.
|
||||
*/
|
||||
private[spark] def computeOrReadCheckpoint(split: Split, context: TaskContext): Iterator[T] = {
|
||||
if (isCheckpointed) {
|
||||
firstParent[T].iterator(split, context)
|
||||
} else {
|
||||
compute(split, context)
|
||||
}
|
||||
|
@@ -374,20 +385,22 @@ abstract class RDD[T: ClassManifest](
    val reducePartition: Iterator[T] => Option[T] = iter => {
      if (iter.hasNext) {
        Some(iter.reduceLeft(cleanF))
      }else {
      } else {
        None
      }
    }
    val options = sc.runJob(this, reducePartition)
    val results = new ArrayBuffer[T]
    for (opt <- options; elem <- opt) {
      results += elem
    }
    if (results.size == 0) {
      throw new UnsupportedOperationException("empty collection")
    } else {
      return results.reduceLeft(cleanF)
    var jobResult: Option[T] = None
    val mergeResult = (index: Int, taskResult: Option[T]) => {
      if (taskResult != None) {
        jobResult = jobResult match {
          case Some(value) => Some(f(value, taskResult.get))
          case None => taskResult
        }
      }
    }
    sc.runJob(this, reducePartition, mergeResult)
    // Get the final result out of our Option, or throw an exception if the RDD was empty
    jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
  }

  /**
@@ -397,9 +410,13 @@ abstract class RDD[T: ClassManifest](
   * modify t2.
   */
  def fold(zeroValue: T)(op: (T, T) => T): T = {
    // Clone the zero value since we will also be serializing it as part of tasks
    var jobResult = Utils.clone(zeroValue, sc.env.closureSerializer.newInstance())
    val cleanOp = sc.clean(op)
    val results = sc.runJob(this, (iter: Iterator[T]) => iter.fold(zeroValue)(cleanOp))
    return results.fold(zeroValue)(cleanOp)
    val foldPartition = (iter: Iterator[T]) => iter.fold(zeroValue)(cleanOp)
    val mergeResult = (index: Int, taskResult: T) => jobResult = op(jobResult, taskResult)
    sc.runJob(this, foldPartition, mergeResult)
    jobResult
  }

  /**
@@ -411,11 +428,14 @@ abstract class RDD[T: ClassManifest](
   * allocation.
   */
  def aggregate[U: ClassManifest](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U = {
    // Clone the zero value since we will also be serializing it as part of tasks
    var jobResult = Utils.clone(zeroValue, sc.env.closureSerializer.newInstance())
    val cleanSeqOp = sc.clean(seqOp)
    val cleanCombOp = sc.clean(combOp)
    val results = sc.runJob(this,
      (iter: Iterator[T]) => iter.aggregate(zeroValue)(cleanSeqOp, cleanCombOp))
    return results.fold(zeroValue)(cleanCombOp)
    val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp)
    val mergeResult = (index: Int, taskResult: U) => jobResult = combOp(jobResult, taskResult)
    sc.runJob(this, aggregatePartition, mergeResult)
    jobResult
  }

  /**
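reduce, fold and aggregate now merge per-partition results on the driver through the runJob overload that takes a result handler, instead of materializing an Array[U] first. A hedged sketch of an action written in the same style (sum is an invented example, not part of the commit):

    // Assumes the runJob(rdd, processPartition, resultHandler) overload added in this merge.
    def sum(rdd: spark.RDD[Int]): Int = {
      var total = 0
      val sumPartition = (iter: Iterator[Int]) => iter.foldLeft(0)(_ + _)
      val mergeResult = (index: Int, partial: Int) => total += partial
      rdd.context.runJob(rdd, sumPartition, mergeResult)  // handler runs on the driver per partition
      total
    }
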
@ -426,7 +446,7 @@ abstract class RDD[T: ClassManifest](
|
|||
var result = 0L
|
||||
while (iter.hasNext) {
|
||||
result += 1L
|
||||
iter.next
|
||||
iter.next()
|
||||
}
|
||||
result
|
||||
}).sum
|
||||
|
@ -441,7 +461,7 @@ abstract class RDD[T: ClassManifest](
|
|||
var result = 0L
|
||||
while (iter.hasNext) {
|
||||
result += 1L
|
||||
iter.next
|
||||
iter.next()
|
||||
}
|
||||
result
|
||||
}
|
||||
|
@@ -578,15 +598,15 @@ abstract class RDD[T: ClassManifest](
  /**
   * Return whether this RDD has been checkpointed or not
   */
  def isCheckpointed(): Boolean = {
    if (checkpointData.isDefined) checkpointData.get.isCheckpointed() else false
  def isCheckpointed: Boolean = {
    checkpointData.map(_.isCheckpointed).getOrElse(false)
  }

  /**
   * Gets the name of the file to which this RDD was checkpointed
   */
  def getCheckpointFile(): Option[String] = {
    if (checkpointData.isDefined) checkpointData.get.getCheckpointFile() else None
  def getCheckpointFile: Option[String] = {
    checkpointData.flatMap(_.getCheckpointFile)
  }

  // =======================================================================
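A hedged sketch of the lifecycle these accessors expose (the directory is illustrative, and setCheckpointDir/checkpoint() are assumed to exist as in this snapshot):

    sc.setCheckpointDir("/tmp/spark-checkpoints")  // assumed SparkContext helper
    val doubled = sc.parallelize(1 to 1000).map(_ * 2)
    doubled.checkpoint()               // only marks the RDD; nothing is written yet
    doubled.count()                    // the first job materializes it and writes the checkpoint
    println(doubled.isCheckpointed)    // expected: true
    println(doubled.getCheckpointFile) // expected: Some(<file under the checkpoint dir>)
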
@ -611,31 +631,52 @@ abstract class RDD[T: ClassManifest](
|
|||
def context = sc
|
||||
|
||||
/**
|
||||
* Performs the checkpointing of this RDD by saving this . It is called by the DAGScheduler
|
||||
* Performs the checkpointing of this RDD by saving this. It is called by the DAGScheduler
|
||||
* after a job using this RDD has completed (therefore the RDD has been materialized and
|
||||
* potentially stored in memory). doCheckpoint() is called recursively on the parent RDDs.
|
||||
*/
|
||||
protected[spark] def doCheckpoint() {
|
||||
if (checkpointData.isDefined) checkpointData.get.doCheckpoint()
|
||||
dependencies.foreach(_.rdd.doCheckpoint())
|
||||
private[spark] def doCheckpoint() {
|
||||
if (checkpointData.isDefined) {
|
||||
checkpointData.get.doCheckpoint()
|
||||
} else {
|
||||
dependencies.foreach(_.rdd.doCheckpoint())
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Changes the dependencies of this RDD from its original parents to the new RDD
|
||||
* (`newRDD`) created from the checkpoint file.
|
||||
* Changes the dependencies of this RDD from its original parents to a new RDD (`newRDD`)
|
||||
* created from the checkpoint file, and forget its old dependencies and splits.
|
||||
*/
|
||||
protected[spark] def changeDependencies(newRDD: RDD[_]) {
|
||||
private[spark] def markCheckpointed(checkpointRDD: RDD[_]) {
|
||||
clearDependencies()
|
||||
dependencies_ = List(new OneToOneDependency(newRDD))
|
||||
dependencies_ = null
|
||||
splits_ = null
|
||||
deps = null // Forget the constructor argument for dependencies too
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears the dependencies of this RDD. This method must ensure that all references
|
||||
* to the original parent RDDs is removed to enable the parent RDDs to be garbage
|
||||
* collected. Subclasses of RDD may override this method for implementing their own cleaning
|
||||
* logic. See [[spark.rdd.UnionRDD]] and [[spark.rdd.ShuffledRDD]] to get a better idea.
|
||||
* logic. See [[spark.rdd.UnionRDD]] for an example.
|
||||
*/
|
||||
protected[spark] def clearDependencies() {
|
||||
protected def clearDependencies() {
|
||||
dependencies_ = null
|
||||
}
|
||||
|
||||
/** A description of this RDD and its recursive dependencies for debugging. */
|
||||
def toDebugString(): String = {
|
||||
def debugString(rdd: RDD[_], prefix: String = ""): Seq[String] = {
|
||||
Seq(prefix + rdd + " (" + rdd.splits.size + " splits)") ++
|
||||
rdd.dependencies.flatMap(d => debugString(d.rdd, prefix + " "))
|
||||
}
|
||||
debugString(this).mkString("\n")
|
||||
}
|
||||
|
||||
override def toString(): String = "%s%s[%d] at %s".format(
|
||||
Option(name).map(_ + " ").getOrElse(""),
|
||||
getClass.getSimpleName,
|
||||
id,
|
||||
origin)
|
||||
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@ private[spark] object CheckpointState extends Enumeration {
|
|||
* of the checkpointed RDD.
|
||||
*/
|
||||
private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T])
|
||||
extends Logging with Serializable {
|
||||
extends Logging with Serializable {
|
||||
|
||||
import CheckpointState._
|
||||
|
||||
|
@ -31,7 +31,7 @@ extends Logging with Serializable {
|
|||
@transient var cpFile: Option[String] = None
|
||||
|
||||
// The CheckpointRDD created from the checkpoint file, that is, the new parent the associated RDD.
|
||||
@transient var cpRDD: Option[RDD[T]] = None
|
||||
var cpRDD: Option[RDD[T]] = None
|
||||
|
||||
// Mark the RDD for checkpointing
|
||||
def markForCheckpoint() {
|
||||
|
@ -41,12 +41,12 @@ extends Logging with Serializable {
|
|||
}
|
||||
|
||||
// Is the RDD already checkpointed
|
||||
def isCheckpointed(): Boolean = {
|
||||
def isCheckpointed: Boolean = {
|
||||
RDDCheckpointData.synchronized { cpState == Checkpointed }
|
||||
}
|
||||
|
||||
// Get the file to which this RDD was checkpointed to as an Option
|
||||
def getCheckpointFile(): Option[String] = {
|
||||
def getCheckpointFile: Option[String] = {
|
||||
RDDCheckpointData.synchronized { cpFile }
|
||||
}
|
||||
|
||||
|
@ -71,7 +71,7 @@ extends Logging with Serializable {
|
|||
RDDCheckpointData.synchronized {
|
||||
cpFile = Some(path)
|
||||
cpRDD = Some(newRDD)
|
||||
rdd.changeDependencies(newRDD)
|
||||
rdd.markCheckpointed(newRDD) // Update the RDD's dependencies and splits
|
||||
cpState = Checkpointed
|
||||
RDDCheckpointData.clearTaskCaches()
|
||||
logInfo("Done checkpointing RDD " + rdd.id + ", new parent is RDD " + newRDD.id)
|
||||
|
@ -79,7 +79,7 @@ extends Logging with Serializable {
|
|||
}
|
||||
|
||||
// Get preferred location of a split after checkpointing
|
||||
def getPreferredLocations(split: Split) = {
|
||||
def getPreferredLocations(split: Split): Seq[String] = {
|
||||
RDDCheckpointData.synchronized {
|
||||
cpRDD.get.preferredLocations(split)
|
||||
}
|
||||
|
@ -91,9 +91,10 @@ extends Logging with Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
// Get iterator. This is called at the worker nodes.
|
||||
def iterator(split: Split, context: TaskContext): Iterator[T] = {
|
||||
rdd.firstParent[T].iterator(split, context)
|
||||
def checkpointRDD: Option[RDD[T]] = {
|
||||
RDDCheckpointData.synchronized {
|
||||
cpRDD
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -73,12 +73,12 @@ class SparkContext(
|
|||
// Ensure logging is initialized before we spawn any threads
|
||||
initLogging()
|
||||
|
||||
// Set Spark master host and port system properties
|
||||
if (System.getProperty("spark.master.host") == null) {
|
||||
System.setProperty("spark.master.host", Utils.localIpAddress)
|
||||
// Set Spark driver host and port system properties
|
||||
if (System.getProperty("spark.driver.host") == null) {
|
||||
System.setProperty("spark.driver.host", Utils.localIpAddress)
|
||||
}
|
||||
if (System.getProperty("spark.master.port") == null) {
|
||||
System.setProperty("spark.master.port", "0")
|
||||
if (System.getProperty("spark.driver.port") == null) {
|
||||
System.setProperty("spark.driver.port", "0")
|
||||
}
|
||||
|
||||
private val isLocal = (master == "local" || master.startsWith("local["))
|
||||
|
@ -86,15 +86,15 @@ class SparkContext(
|
|||
// Create the Spark execution environment (cache, map output tracker, etc)
|
||||
private[spark] val env = SparkEnv.createFromSystemProperties(
|
||||
"<driver>",
|
||||
System.getProperty("spark.master.host"),
|
||||
System.getProperty("spark.master.port").toInt,
|
||||
System.getProperty("spark.driver.host"),
|
||||
System.getProperty("spark.driver.port").toInt,
|
||||
true,
|
||||
isLocal)
|
||||
SparkEnv.set(env)
|
||||
|
||||
// Start the BlockManager UI
|
||||
private[spark] val ui = new BlockManagerUI(
|
||||
env.actorSystem, env.blockManager.master.masterActor, this)
|
||||
env.actorSystem, env.blockManager.master.driverActor, this)
|
||||
ui.start()
|
||||
|
||||
// Used to store a URL for each static file/jar together with the file's local timestamp
|
||||
|
@ -111,8 +111,9 @@ class SparkContext(
|
|||
|
||||
// Environment variables to pass to our executors
|
||||
private[spark] val executorEnvs = HashMap[String, String]()
|
||||
// Note: SPARK_MEM is included for Mesos, but overwritten for standalone mode in ExecutorRunner
|
||||
for (key <- Seq("SPARK_MEM", "SPARK_CLASSPATH", "SPARK_LIBRARY_PATH", "SPARK_JAVA_OPTS",
|
||||
"SPARK_TESTING")) {
|
||||
"SPARK_TESTING")) {
|
||||
val value = System.getenv(key)
|
||||
if (value != null) {
|
||||
executorEnvs(key) = value
|
||||
|
@ -191,6 +192,7 @@ class SparkContext(
|
|||
taskScheduler.start()
|
||||
|
||||
private var dagScheduler = new DAGScheduler(taskScheduler)
|
||||
dagScheduler.start()
|
||||
|
||||
/** A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. */
|
||||
val hadoopConfiguration = {
|
||||
|
@ -414,14 +416,14 @@ class SparkContext(
|
|||
|
||||
/**
|
||||
* Create an [[spark.Accumulator]] variable of a given type, which tasks can "add" values
|
||||
* to using the `+=` method. Only the master can access the accumulator's `value`.
|
||||
* to using the `+=` method. Only the driver can access the accumulator's `value`.
|
||||
*/
|
||||
def accumulator[T](initialValue: T)(implicit param: AccumulatorParam[T]) =
|
||||
new Accumulator(initialValue, param)
|
||||
|
||||
/**
|
||||
* Create an [[spark.Accumulable]] shared variable, to which tasks can add values with `+=`.
|
||||
* Only the master can access the accumuable's `value`.
|
||||
* Only the driver can access the accumuable's `value`.
|
||||
* @tparam T accumulator type
|
||||
* @tparam R type that can be added to the accumulator
|
||||
*/
|
||||
|
@ -471,7 +473,7 @@ class SparkContext(
|
|||
* Return a map from the slave to the max memory available for caching and the remaining
|
||||
* memory available for caching.
|
||||
*/
|
||||
def getSlavesMemoryStatus: Map[String, (Long, Long)] = {
|
||||
def getExecutorMemoryStatus: Map[String, (Long, Long)] = {
|
||||
env.blockManager.master.getMemoryStatus.map { case(blockManagerId, mem) =>
|
||||
(blockManagerId.ip + ":" + blockManagerId.port, mem)
|
||||
}
|
||||
|
@ -482,7 +484,7 @@ class SparkContext(
|
|||
* they take, etc.
|
||||
*/
|
||||
def getRDDStorageInfo : Array[RDDInfo] = {
|
||||
StorageUtils.rddInfoFromStorageStatus(getSlavesStorageStatus, this)
|
||||
StorageUtils.rddInfoFromStorageStatus(getExecutorStorageStatus, this)
|
||||
}
|
||||
|
||||
def getStageInfo: Map[Stage,StageInfo] = {
|
||||
|
@ -492,7 +494,7 @@ class SparkContext(
|
|||
/**
|
||||
* Return information about blocks stored in all of the slaves
|
||||
*/
|
||||
def getSlavesStorageStatus : Array[StorageStatus] = {
|
||||
def getExecutorStorageStatus : Array[StorageStatus] = {
|
||||
env.blockManager.master.getStorageStatus
|
||||
}
|
||||
|
||||
|
@@ -566,10 +568,30 @@ class SparkContext(
  }

  /**
   * Run a function on a given set of partitions in an RDD and return the results. This is the main
   * entry point to the scheduler, by which all actions get launched. The allowLocal flag specifies
   * whether the scheduler can run the computation on the master rather than shipping it out to the
   * cluster, for short actions like first().
   * Run a function on a given set of partitions in an RDD and pass the results to the given
   * handler function. This is the main entry point for all actions in Spark. The allowLocal
   * flag specifies whether the scheduler can run the computation on the driver rather than
   * shipping it out to the cluster, for short actions like first().
   */
  def runJob[T, U: ClassManifest](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      allowLocal: Boolean,
      resultHandler: (Int, U) => Unit) {
    val callSite = Utils.getSparkCallSite
    logInfo("Starting job: " + callSite)
    val start = System.nanoTime
    val result = dagScheduler.runJob(rdd, func, partitions, callSite, allowLocal, resultHandler)
    logInfo("Job finished: " + callSite + ", took " + (System.nanoTime - start) / 1e9 + " s")
    rdd.doCheckpoint()
    result
  }

  /**
   * Run a function on a given set of partitions in an RDD and return the results as an array. The
   * allowLocal flag specifies whether the scheduler can run the computation on the driver rather
   * than shipping it out to the cluster, for short actions like first().
   */
  def runJob[T, U: ClassManifest](
      rdd: RDD[T],
@ -577,13 +599,9 @@ class SparkContext(
|
|||
partitions: Seq[Int],
|
||||
allowLocal: Boolean
|
||||
): Array[U] = {
|
||||
val callSite = Utils.getSparkCallSite
|
||||
logInfo("Starting job: " + callSite)
|
||||
val start = System.nanoTime
|
||||
val result = dagScheduler.runJob(rdd, func, partitions, callSite, allowLocal)
|
||||
logInfo("Job finished: " + callSite + ", took " + (System.nanoTime - start) / 1e9 + " s")
|
||||
rdd.doCheckpoint()
|
||||
result
|
||||
val results = new Array[U](partitions.size)
|
||||
runJob[T, U](rdd, func, partitions, allowLocal, (index, res) => results(index) = res)
|
||||
results
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -613,6 +631,29 @@ class SparkContext(
|
|||
runJob(rdd, func, 0 until rdd.splits.size, false)
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a job on all partitions in an RDD and pass the results to a handler function.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
rdd: RDD[T],
|
||||
processPartition: (TaskContext, Iterator[T]) => U,
|
||||
resultHandler: (Int, U) => Unit)
|
||||
{
|
||||
runJob[T, U](rdd, processPartition, 0 until rdd.splits.size, false, resultHandler)
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a job on all partitions in an RDD and pass the results to a handler function.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
rdd: RDD[T],
|
||||
processPartition: Iterator[T] => U,
|
||||
resultHandler: (Int, U) => Unit)
|
||||
{
|
||||
val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)
|
||||
runJob[T, U](rdd, processFunc, 0 until rdd.splits.size, false, resultHandler)
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a job that can return approximate results.
|
||||
*/
|
||||
|
@@ -696,6 +737,16 @@ object SparkContext {
    def zero(initialValue: Int) = 0
  }

  implicit object LongAccumulatorParam extends AccumulatorParam[Long] {
    def addInPlace(t1: Long, t2: Long) = t1 + t2
    def zero(initialValue: Long) = 0l
  }

  implicit object FloatAccumulatorParam extends AccumulatorParam[Float] {
    def addInPlace(t1: Float, t2: Float) = t1 + t2
    def zero(initialValue: Float) = 0f
  }

  // TODO: Add AccumulatorParams for other types, e.g. lists and strings

  implicit def rddToPairRDDFunctions[K: ClassManifest, V: ClassManifest](rdd: RDD[(K, V)]) =
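With the new Long and Float AccumulatorParam instances in scope, sc.accumulator picks the right param implicitly. A hedged usage sketch (values are illustrative):

    import spark.SparkContext._

    val bytesSeen = sc.accumulator(0L)  // resolves LongAccumulatorParam
    sc.parallelize(Seq("a", "bb", "ccc")).foreach(s => bytesSeen += s.length.toLong)
    println(bytesSeen.value)            // expected: 6; value is readable only on the driver
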
@ -62,15 +62,15 @@ object SparkEnv extends Logging {
|
|||
executorId: String,
|
||||
hostname: String,
|
||||
port: Int,
|
||||
isMaster: Boolean,
|
||||
isDriver: Boolean,
|
||||
isLocal: Boolean): SparkEnv = {
|
||||
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", hostname, port)
|
||||
|
||||
// Bit of a hack: If this is the master and our port was 0 (meaning bind to any free port),
|
||||
// figure out which port number Akka actually bound to and set spark.master.port to it.
|
||||
if (isMaster && port == 0) {
|
||||
System.setProperty("spark.master.port", boundPort.toString)
|
||||
// Bit of a hack: If this is the driver and our port was 0 (meaning bind to any free port),
|
||||
// figure out which port number Akka actually bound to and set spark.driver.port to it.
|
||||
if (isDriver && port == 0) {
|
||||
System.setProperty("spark.driver.port", boundPort.toString)
|
||||
}
|
||||
|
||||
val classLoader = Thread.currentThread.getContextClassLoader
|
||||
|
@ -84,22 +84,22 @@ object SparkEnv extends Logging {
|
|||
|
||||
val serializer = instantiateClass[Serializer]("spark.serializer", "spark.JavaSerializer")
|
||||
|
||||
val masterIp: String = System.getProperty("spark.master.host", "localhost")
|
||||
val masterPort: Int = System.getProperty("spark.master.port", "7077").toInt
|
||||
val driverIp: String = System.getProperty("spark.driver.host", "localhost")
|
||||
val driverPort: Int = System.getProperty("spark.driver.port", "7077").toInt
|
||||
val blockManagerMaster = new BlockManagerMaster(
|
||||
actorSystem, isMaster, isLocal, masterIp, masterPort)
|
||||
actorSystem, isDriver, isLocal, driverIp, driverPort)
|
||||
val blockManager = new BlockManager(executorId, actorSystem, blockManagerMaster, serializer)
|
||||
|
||||
val connectionManager = blockManager.connectionManager
|
||||
|
||||
val broadcastManager = new BroadcastManager(isMaster)
|
||||
val broadcastManager = new BroadcastManager(isDriver)
|
||||
|
||||
val closureSerializer = instantiateClass[Serializer](
|
||||
"spark.closure.serializer", "spark.JavaSerializer")
|
||||
|
||||
val cacheManager = new CacheManager(blockManager)
|
||||
|
||||
val mapOutputTracker = new MapOutputTracker(actorSystem, isMaster)
|
||||
val mapOutputTracker = new MapOutputTracker(actorSystem, isDriver)
|
||||
|
||||
val shuffleFetcher = instantiateClass[ShuffleFetcher](
|
||||
"spark.shuffle.fetcher", "spark.BlockStoreShuffleFetcher")
|
||||
|
@ -111,7 +111,7 @@ object SparkEnv extends Logging {
|
|||
// Set the sparkFiles directory, used when downloading dependencies. In local mode,
|
||||
// this is a temporary directory; in distributed mode, this is the executor's current working
|
||||
// directory.
|
||||
val sparkFilesDir: String = if (isMaster) {
|
||||
val sparkFilesDir: String = if (isDriver) {
|
||||
Utils.createTempDir().getAbsolutePath
|
||||
} else {
|
||||
"."
|
||||
|
|
|
@@ -12,6 +12,7 @@ import scala.io.Source
import com.google.common.io.Files
import com.google.common.util.concurrent.ThreadFactoryBuilder
import scala.Some
import spark.serializer.SerializerInstance

/**
 * Various utility methods used by Spark.
@@ -446,4 +447,11 @@ private object Utils extends Logging {
    socket.close()
    portBound
  }

  /**
   * Clone an object using a Spark serializer.
   */
  def clone[T](value: T, serializer: SerializerInstance): T = {
    serializer.deserialize[T](serializer.serialize(value))
  }
}

@ -319,7 +319,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround
|
|||
/**
|
||||
* Return whether this RDD has been checkpointed or not
|
||||
*/
|
||||
def isCheckpointed(): Boolean = rdd.isCheckpointed()
|
||||
def isCheckpointed: Boolean = rdd.isCheckpointed
|
||||
|
||||
/**
|
||||
* Gets the name of the file to which this RDD was checkpointed
|
||||
|
@ -330,4 +330,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround
|
|||
case _ => Optional.absent()
|
||||
}
|
||||
}
|
||||
|
||||
/** A description of this RDD and its recursive dependencies for debugging. */
|
||||
def toDebugString(): String = {
|
||||
rdd.toDebugString()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -103,21 +103,27 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
|
||||
private def read(): Array[Byte] = {
|
||||
try {
|
||||
val length = stream.readInt()
|
||||
if (length != -1) {
|
||||
val obj = new Array[Byte](length)
|
||||
stream.readFully(obj)
|
||||
obj
|
||||
} else {
|
||||
// We've finished the data section of the output, but we can still read some
|
||||
// accumulator updates; let's do that, breaking when we get EOFException
|
||||
while (true) {
|
||||
val len2 = stream.readInt()
|
||||
val update = new Array[Byte](len2)
|
||||
stream.readFully(update)
|
||||
accumulator += Collections.singletonList(update)
|
||||
}
|
||||
new Array[Byte](0)
|
||||
stream.readInt() match {
|
||||
case length if length > 0 =>
|
||||
val obj = new Array[Byte](length)
|
||||
stream.readFully(obj)
|
||||
obj
|
||||
case -2 =>
|
||||
// Signals that an exception has been thrown in python
|
||||
val exLength = stream.readInt()
|
||||
val obj = new Array[Byte](exLength)
|
||||
stream.readFully(obj)
|
||||
throw new PythonException(new String(obj))
|
||||
case -1 =>
|
||||
// We've finished the data section of the output, but we can still read some
|
||||
// accumulator updates; let's do that, breaking when we get EOFException
|
||||
while (true) {
|
||||
val len2 = stream.readInt()
|
||||
val update = new Array[Byte](len2)
|
||||
stream.readFully(update)
|
||||
accumulator += Collections.singletonList(update)
|
||||
}
|
||||
new Array[Byte](0)
|
||||
}
|
||||
} catch {
|
||||
case eof: EOFException => {
|
||||
|
@ -140,6 +146,9 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
val asJavaRDD : JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this)
|
||||
}
|
||||
|
||||
/** Thrown for exceptions in user Python code. */
|
||||
private class PythonException(msg: String) extends Exception(msg)
|
||||
|
||||
/**
|
||||
* Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Python.
|
||||
* This is used by PySpark's shuffle operations.
|
||||
|
@ -229,6 +238,11 @@ private[spark] object PythonRDD {
|
|||
}
|
||||
|
||||
def writeIteratorToPickleFile[T](items: java.util.Iterator[T], filename: String) {
|
||||
import scala.collection.JavaConverters._
|
||||
writeIteratorToPickleFile(items.asScala, filename)
|
||||
}
|
||||
|
||||
def writeIteratorToPickleFile[T](items: Iterator[T], filename: String) {
|
||||
val file = new DataOutputStream(new FileOutputStream(filename))
|
||||
for (item <- items) {
|
||||
writeAsPickle(item, file)
|
||||
|
@ -236,8 +250,10 @@ private[spark] object PythonRDD {
|
|||
file.close()
|
||||
}
|
||||
|
||||
def takePartition[T](rdd: RDD[T], partition: Int): java.util.Iterator[T] =
|
||||
rdd.context.runJob(rdd, ((x: Iterator[T]) => x), Seq(partition), true).head
|
||||
def takePartition[T](rdd: RDD[T], partition: Int): Iterator[T] = {
|
||||
implicit val cm : ClassManifest[T] = rdd.elementClassManifest
|
||||
rdd.context.runJob(rdd, ((x: Iterator[T]) => x.toArray), Seq(partition), true).head.iterator
|
||||
}
|
||||
}
|
||||
|
||||
private object Pickle {
|
||||
|
|
|
@ -31,7 +31,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
@transient var totalBlocks = -1
|
||||
@transient var hasBlocks = new AtomicInteger(0)
|
||||
|
||||
// Used ONLY by Master to track how many unique blocks have been sent out
|
||||
// Used ONLY by driver to track how many unique blocks have been sent out
|
||||
@transient var sentBlocks = new AtomicInteger(0)
|
||||
|
||||
@transient var listenPortLock = new Object
|
||||
|
@ -42,7 +42,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
|
||||
@transient var serveMR: ServeMultipleRequests = null
|
||||
|
||||
// Used only in Master
|
||||
// Used only in driver
|
||||
@transient var guideMR: GuideMultipleRequests = null
|
||||
|
||||
// Used only in Workers
|
||||
|
@ -99,14 +99,14 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
}
|
||||
|
||||
// Must always come AFTER listenPort is created
|
||||
val masterSource =
|
||||
val driverSource =
|
||||
SourceInfo(hostAddress, listenPort, totalBlocks, totalBytes)
|
||||
hasBlocksBitVector.synchronized {
|
||||
masterSource.hasBlocksBitVector = hasBlocksBitVector
|
||||
driverSource.hasBlocksBitVector = hasBlocksBitVector
|
||||
}
|
||||
|
||||
// In the beginning, this is the only known source to Guide
|
||||
listOfSources += masterSource
|
||||
listOfSources += driverSource
|
||||
|
||||
// Register with the Tracker
|
||||
MultiTracker.registerBroadcast(id,
|
||||
|
@ -122,7 +122,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
|
||||
case None =>
|
||||
logInfo("Started reading broadcast variable " + id)
|
||||
// Initializing everything because Master will only send null/0 values
|
||||
// Initializing everything because driver will only send null/0 values
|
||||
// Only the 1st worker in a node can be here. Others will get from cache
|
||||
initializeWorkerVariables()
|
||||
|
||||
|
@ -151,7 +151,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
}
|
||||
}
|
||||
|
||||
// Initialize variables in the worker node. Master sends everything as 0/null
|
||||
// Initialize variables in the worker node. Driver sends everything as 0/null
|
||||
private def initializeWorkerVariables() {
|
||||
arrayOfBlocks = null
|
||||
hasBlocksBitVector = null
|
||||
|
@ -248,7 +248,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
// Receive source information from Guide
|
||||
var suitableSources =
|
||||
oisGuide.readObject.asInstanceOf[ListBuffer[SourceInfo]]
|
||||
logDebug("Received suitableSources from Master " + suitableSources)
|
||||
logDebug("Received suitableSources from Driver " + suitableSources)
|
||||
|
||||
addToListOfSources(suitableSources)
|
||||
|
||||
|
@ -532,7 +532,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
oosSource.writeObject(blockToAskFor)
|
||||
oosSource.flush()
|
||||
|
||||
// CHANGED: Master might send some other block than the one
|
||||
// CHANGED: Driver might send some other block than the one
|
||||
// requested to ensure fast spreading of all blocks.
|
||||
val recvStartTime = System.currentTimeMillis
|
||||
val bcBlock = oisSource.readObject.asInstanceOf[BroadcastBlock]
|
||||
|
@ -982,9 +982,9 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
// Receive which block to send
|
||||
var blockToSend = ois.readObject.asInstanceOf[Int]
|
||||
|
||||
// If it is master AND at least one copy of each block has not been
|
||||
// If it is driver AND at least one copy of each block has not been
|
||||
// sent out already, MODIFY blockToSend
|
||||
if (MultiTracker.isMaster && sentBlocks.get < totalBlocks) {
|
||||
if (MultiTracker.isDriver && sentBlocks.get < totalBlocks) {
|
||||
blockToSend = sentBlocks.getAndIncrement
|
||||
}
|
||||
|
||||
|
@ -1031,7 +1031,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
|
|||
|
||||
private[spark] class BitTorrentBroadcastFactory
|
||||
extends BroadcastFactory {
|
||||
def initialize(isMaster: Boolean) { MultiTracker.initialize(isMaster) }
|
||||
def initialize(isDriver: Boolean) { MultiTracker.initialize(isDriver) }
|
||||
|
||||
def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long) =
|
||||
new BitTorrentBroadcast[T](value_, isLocal, id)
|
||||
|
|
|
@ -15,7 +15,7 @@ abstract class Broadcast[T](private[spark] val id: Long) extends Serializable {
|
|||
}
|
||||
|
||||
private[spark]
|
||||
class BroadcastManager(val isMaster_ : Boolean) extends Logging with Serializable {
|
||||
class BroadcastManager(val _isDriver: Boolean) extends Logging with Serializable {
|
||||
|
||||
private var initialized = false
|
||||
private var broadcastFactory: BroadcastFactory = null
|
||||
|
@ -33,7 +33,7 @@ class BroadcastManager(val isMaster_ : Boolean) extends Logging with Serializabl
|
|||
Class.forName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]
|
||||
|
||||
// Initialize appropriate BroadcastFactory and BroadcastObject
|
||||
broadcastFactory.initialize(isMaster)
|
||||
broadcastFactory.initialize(isDriver)
|
||||
|
||||
initialized = true
|
||||
}
|
||||
|
@ -49,5 +49,5 @@ class BroadcastManager(val isMaster_ : Boolean) extends Logging with Serializabl
|
|||
def newBroadcast[T](value_ : T, isLocal: Boolean) =
|
||||
broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement())
|
||||
|
||||
def isMaster = isMaster_
|
||||
def isDriver = _isDriver
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@ package spark.broadcast
|
|||
* entire Spark job.
|
||||
*/
|
||||
private[spark] trait BroadcastFactory {
|
||||
def initialize(isMaster: Boolean): Unit
|
||||
def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long): Broadcast[T]
|
||||
def initialize(isDriver: Boolean): Unit
|
||||
def newBroadcast[T](value: T, isLocal: Boolean, id: Long): Broadcast[T]
|
||||
def stop(): Unit
|
||||
}
|
||||
|
|
|
@ -48,7 +48,7 @@ extends Broadcast[T](id) with Logging with Serializable {
|
|||
}
|
||||
|
||||
private[spark] class HttpBroadcastFactory extends BroadcastFactory {
|
||||
def initialize(isMaster: Boolean) { HttpBroadcast.initialize(isMaster) }
|
||||
def initialize(isDriver: Boolean) { HttpBroadcast.initialize(isDriver) }
|
||||
|
||||
def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long) =
|
||||
new HttpBroadcast[T](value_, isLocal, id)
|
||||
|
@ -69,12 +69,12 @@ private object HttpBroadcast extends Logging {
|
|||
private val cleaner = new MetadataCleaner("HttpBroadcast", cleanup)
|
||||
|
||||
|
||||
def initialize(isMaster: Boolean) {
|
||||
def initialize(isDriver: Boolean) {
|
||||
synchronized {
|
||||
if (!initialized) {
|
||||
bufferSize = System.getProperty("spark.buffer.size", "65536").toInt
|
||||
compress = System.getProperty("spark.broadcast.compress", "true").toBoolean
|
||||
if (isMaster) {
|
||||
if (isDriver) {
|
||||
createServer()
|
||||
}
|
||||
serverUri = System.getProperty("spark.httpBroadcast.uri")
|
||||
|
|
|
@ -23,25 +23,24 @@ extends Logging {
|
|||
var ranGen = new Random
|
||||
|
||||
private var initialized = false
|
||||
private var isMaster_ = false
|
||||
private var _isDriver = false
|
||||
|
||||
private var stopBroadcast = false
|
||||
|
||||
private var trackMV: TrackMultipleValues = null
|
||||
|
||||
def initialize(isMaster__ : Boolean) {
|
||||
def initialize(__isDriver: Boolean) {
|
||||
synchronized {
|
||||
if (!initialized) {
|
||||
_isDriver = __isDriver
|
||||
|
||||
isMaster_ = isMaster__
|
||||
|
||||
if (isMaster) {
|
||||
if (isDriver) {
|
||||
trackMV = new TrackMultipleValues
|
||||
trackMV.setDaemon(true)
|
||||
trackMV.start()
|
||||
|
||||
// Set masterHostAddress to the master's IP address for the slaves to read
|
||||
System.setProperty("spark.MultiTracker.MasterHostAddress", Utils.localIpAddress)
|
||||
// Set DriverHostAddress to the driver's IP address for the slaves to read
|
||||
System.setProperty("spark.MultiTracker.DriverHostAddress", Utils.localIpAddress)
|
||||
}
|
||||
|
||||
initialized = true
|
||||
|
@ -54,10 +53,10 @@ extends Logging {
|
|||
}
|
||||
|
||||
// Load common parameters
|
||||
private var MasterHostAddress_ = System.getProperty(
|
||||
"spark.MultiTracker.MasterHostAddress", "")
|
||||
private var MasterTrackerPort_ = System.getProperty(
|
||||
"spark.broadcast.masterTrackerPort", "11111").toInt
|
||||
private var DriverHostAddress_ = System.getProperty(
|
||||
"spark.MultiTracker.DriverHostAddress", "")
|
||||
private var DriverTrackerPort_ = System.getProperty(
|
||||
"spark.broadcast.driverTrackerPort", "11111").toInt
|
||||
private var BlockSize_ = System.getProperty(
|
||||
"spark.broadcast.blockSize", "4096").toInt * 1024
|
||||
private var MaxRetryCount_ = System.getProperty(
|
||||
|
@ -91,11 +90,11 @@ extends Logging {
|
|||
private var EndGameFraction_ = System.getProperty(
|
||||
"spark.broadcast.endGameFraction", "0.95").toDouble
|
||||
|
||||
def isMaster = isMaster_
|
||||
def isDriver = _isDriver
|
||||
|
||||
// Common config params
|
||||
def MasterHostAddress = MasterHostAddress_
|
||||
def MasterTrackerPort = MasterTrackerPort_
|
||||
def DriverHostAddress = DriverHostAddress_
|
||||
def DriverTrackerPort = DriverTrackerPort_
|
||||
def BlockSize = BlockSize_
|
||||
def MaxRetryCount = MaxRetryCount_
|
||||
|
||||
|
@ -123,7 +122,7 @@ extends Logging {
|
|||
var threadPool = Utils.newDaemonCachedThreadPool()
|
||||
var serverSocket: ServerSocket = null
|
||||
|
||||
serverSocket = new ServerSocket(MasterTrackerPort)
|
||||
serverSocket = new ServerSocket(DriverTrackerPort)
|
||||
logInfo("TrackMultipleValues started at " + serverSocket)
|
||||
|
||||
try {
|
||||
|
@ -235,7 +234,7 @@ extends Logging {
|
|||
try {
|
||||
// Connect to the tracker to find out GuideInfo
|
||||
clientSocketToTracker =
|
||||
new Socket(MultiTracker.MasterHostAddress, MultiTracker.MasterTrackerPort)
|
||||
new Socket(MultiTracker.DriverHostAddress, MultiTracker.DriverTrackerPort)
|
||||
oosTracker =
|
||||
new ObjectOutputStream(clientSocketToTracker.getOutputStream)
|
||||
oosTracker.flush()
|
||||
|
@ -276,7 +275,7 @@ extends Logging {
|
|||
}
|
||||
|
||||
def registerBroadcast(id: Long, gInfo: SourceInfo) {
|
||||
val socket = new Socket(MultiTracker.MasterHostAddress, MasterTrackerPort)
|
||||
val socket = new Socket(MultiTracker.DriverHostAddress, DriverTrackerPort)
|
||||
val oosST = new ObjectOutputStream(socket.getOutputStream)
|
||||
oosST.flush()
|
||||
val oisST = new ObjectInputStream(socket.getInputStream)
|
||||
|
@ -303,7 +302,7 @@ extends Logging {
|
|||
}
|
||||
|
||||
def unregisterBroadcast(id: Long) {
|
||||
val socket = new Socket(MultiTracker.MasterHostAddress, MasterTrackerPort)
|
||||
val socket = new Socket(MultiTracker.DriverHostAddress, DriverTrackerPort)
|
||||
val oosST = new ObjectOutputStream(socket.getOutputStream)
|
||||
oosST.flush()
|
||||
val oisST = new ObjectInputStream(socket.getInputStream)
|
||||
|
|
|
@ -98,7 +98,7 @@ extends Broadcast[T](id) with Logging with Serializable {
|
|||
|
||||
case None =>
|
||||
logInfo("Started reading broadcast variable " + id)
|
||||
// Initializing everything because Master will only send null/0 values
|
||||
// Initializing everything because Driver will only send null/0 values
|
||||
// Only the 1st worker in a node can be here. Others will get from cache
|
||||
initializeWorkerVariables()
|
||||
|
||||
|
@ -157,55 +157,55 @@ extends Broadcast[T](id) with Logging with Serializable {
|
|||
listenPortLock.synchronized { listenPortLock.wait() }
|
||||
}
|
||||
|
||||
var clientSocketToMaster: Socket = null
|
||||
var oosMaster: ObjectOutputStream = null
|
||||
var oisMaster: ObjectInputStream = null
|
||||
var clientSocketToDriver: Socket = null
|
||||
var oosDriver: ObjectOutputStream = null
|
||||
var oisDriver: ObjectInputStream = null
|
||||
|
||||
// Connect and receive broadcast from the specified source, retrying the
|
||||
// specified number of times in case of failures
|
||||
var retriesLeft = MultiTracker.MaxRetryCount
|
||||
do {
|
||||
// Connect to Master and send this worker's Information
|
||||
clientSocketToMaster = new Socket(MultiTracker.MasterHostAddress, gInfo.listenPort)
|
||||
oosMaster = new ObjectOutputStream(clientSocketToMaster.getOutputStream)
|
||||
oosMaster.flush()
|
||||
oisMaster = new ObjectInputStream(clientSocketToMaster.getInputStream)
|
||||
// Connect to Driver and send this worker's Information
|
||||
clientSocketToDriver = new Socket(MultiTracker.DriverHostAddress, gInfo.listenPort)
|
||||
oosDriver = new ObjectOutputStream(clientSocketToDriver.getOutputStream)
|
||||
oosDriver.flush()
|
||||
oisDriver = new ObjectInputStream(clientSocketToDriver.getInputStream)
|
||||
|
||||
logDebug("Connected to Master's guiding object")
|
||||
logDebug("Connected to Driver's guiding object")
|
||||
|
||||
// Send local source information
|
||||
oosMaster.writeObject(SourceInfo(hostAddress, listenPort))
|
||||
oosMaster.flush()
|
||||
oosDriver.writeObject(SourceInfo(hostAddress, listenPort))
|
||||
oosDriver.flush()
|
||||
|
||||
// Receive source information from Master
|
||||
var sourceInfo = oisMaster.readObject.asInstanceOf[SourceInfo]
|
||||
// Receive source information from Driver
|
||||
var sourceInfo = oisDriver.readObject.asInstanceOf[SourceInfo]
|
||||
totalBlocks = sourceInfo.totalBlocks
|
||||
arrayOfBlocks = new Array[BroadcastBlock](totalBlocks)
|
||||
totalBlocksLock.synchronized { totalBlocksLock.notifyAll() }
|
||||
totalBytes = sourceInfo.totalBytes
|
||||
|
||||
logDebug("Received SourceInfo from Master:" + sourceInfo + " My Port: " + listenPort)
|
||||
logDebug("Received SourceInfo from Driver:" + sourceInfo + " My Port: " + listenPort)
|
||||
|
||||
val start = System.nanoTime
|
||||
val receptionSucceeded = receiveSingleTransmission(sourceInfo)
|
||||
val time = (System.nanoTime - start) / 1e9
|
||||
|
||||
// Updating some statistics in sourceInfo. Master will be using them later
|
||||
// Updating some statistics in sourceInfo. Driver will be using them later
|
||||
if (!receptionSucceeded) {
|
||||
sourceInfo.receptionFailed = true
|
||||
}
|
||||
|
||||
// Send back statistics to the Master
|
||||
oosMaster.writeObject(sourceInfo)
|
||||
// Send back statistics to the Driver
|
||||
oosDriver.writeObject(sourceInfo)
|
||||
|
||||
if (oisMaster != null) {
|
||||
oisMaster.close()
|
||||
if (oisDriver != null) {
|
||||
oisDriver.close()
|
||||
}
|
||||
if (oosMaster != null) {
|
||||
oosMaster.close()
|
||||
if (oosDriver != null) {
|
||||
oosDriver.close()
|
||||
}
|
||||
if (clientSocketToMaster != null) {
|
||||
clientSocketToMaster.close()
|
||||
if (clientSocketToDriver != null) {
|
||||
clientSocketToDriver.close()
|
||||
}
|
||||
|
||||
retriesLeft -= 1
|
||||
|
@ -552,7 +552,7 @@ extends Broadcast[T](id) with Logging with Serializable {
|
|||
}
|
||||
|
||||
private def sendObject() {
|
||||
// Wait till receiving the SourceInfo from Master
|
||||
// Wait till receiving the SourceInfo from Driver
|
||||
while (totalBlocks == -1) {
|
||||
totalBlocksLock.synchronized { totalBlocksLock.wait() }
|
||||
}
|
||||
|
@ -576,7 +576,7 @@ extends Broadcast[T](id) with Logging with Serializable {
|
|||
|
||||
private[spark] class TreeBroadcastFactory
|
||||
extends BroadcastFactory {
|
||||
def initialize(isMaster: Boolean) { MultiTracker.initialize(isMaster) }
|
||||
def initialize(isDriver: Boolean) { MultiTracker.initialize(isDriver) }
|
||||
|
||||
def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long) =
|
||||
new TreeBroadcast[T](value_, isLocal, id)
|
||||
|
|
|
@ -16,38 +16,25 @@ import scala.collection.mutable.ArrayBuffer
|
|||
* fault recovery without spinning up a lot of processes.
|
||||
*/
|
||||
private[spark]
|
||||
class LocalSparkCluster(numSlaves: Int, coresPerSlave: Int, memoryPerSlave: Int) extends Logging {
|
||||
class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: Int) extends Logging {
|
||||
|
||||
val localIpAddress = Utils.localIpAddress
|
||||
private val localIpAddress = Utils.localIpAddress
|
||||
private val masterActorSystems = ArrayBuffer[ActorSystem]()
|
||||
private val workerActorSystems = ArrayBuffer[ActorSystem]()
|
||||
|
||||
var masterActor : ActorRef = _
|
||||
var masterActorSystem : ActorSystem = _
|
||||
var masterPort : Int = _
|
||||
var masterUrl : String = _
|
||||
|
||||
val slaveActorSystems = ArrayBuffer[ActorSystem]()
|
||||
val slaveActors = ArrayBuffer[ActorRef]()
|
||||
|
||||
def start() : String = {
|
||||
logInfo("Starting a local Spark cluster with " + numSlaves + " slaves.")
|
||||
def start(): String = {
|
||||
logInfo("Starting a local Spark cluster with " + numWorkers + " workers.")
|
||||
|
||||
/* Start the Master */
|
||||
val (actorSystem, masterPort) = AkkaUtils.createActorSystem("sparkMaster", localIpAddress, 0)
|
||||
masterActorSystem = actorSystem
|
||||
masterUrl = "spark://" + localIpAddress + ":" + masterPort
|
||||
val actor = masterActorSystem.actorOf(
|
||||
Props(new Master(localIpAddress, masterPort, 0)), name = "Master")
|
||||
masterActor = actor
|
||||
val (masterSystem, masterPort) = Master.startSystemAndActor(localIpAddress, 0, 0)
|
||||
masterActorSystems += masterSystem
|
||||
val masterUrl = "spark://" + localIpAddress + ":" + masterPort
|
||||
|
||||
/* Start the Slaves */
|
||||
for (slaveNum <- 1 to numSlaves) {
|
||||
val (actorSystem, boundPort) =
|
||||
AkkaUtils.createActorSystem("sparkWorker" + slaveNum, localIpAddress, 0)
|
||||
slaveActorSystems += actorSystem
|
||||
val actor = actorSystem.actorOf(
|
||||
Props(new Worker(localIpAddress, boundPort, 0, coresPerSlave, memoryPerSlave, masterUrl)),
|
||||
name = "Worker")
|
||||
slaveActors += actor
|
||||
/* Start the Workers */
|
||||
for (workerNum <- 1 to numWorkers) {
|
||||
val (workerSystem, _) = Worker.startSystemAndActor(localIpAddress, 0, 0, coresPerWorker,
|
||||
memoryPerWorker, masterUrl, null, Some(workerNum))
|
||||
workerActorSystems += workerSystem
|
||||
}
|
||||
|
||||
return masterUrl
|
||||
|
@ -55,10 +42,10 @@ class LocalSparkCluster(numSlaves: Int, coresPerSlave: Int, memoryPerSlave: Int)
|
|||
|
||||
def stop() {
|
||||
logInfo("Shutting down local Spark cluster.")
|
||||
// Stop the slaves before the master so they don't get upset that it disconnected
|
||||
slaveActorSystems.foreach(_.shutdown())
|
||||
slaveActorSystems.foreach(_.awaitTermination())
|
||||
masterActorSystem.shutdown()
|
||||
masterActorSystem.awaitTermination()
|
||||
// Stop the workers before the master so they don't get upset that it disconnected
|
||||
workerActorSystems.foreach(_.shutdown())
|
||||
workerActorSystems.foreach(_.awaitTermination())
|
||||
masterActorSystems.foreach(_.shutdown())
|
||||
masterActorSystems.foreach(_.awaitTermination())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@ import spark.{SparkException, Logging}
|
|||
import akka.remote.RemoteClientLifeCycleEvent
|
||||
import akka.remote.RemoteClientShutdown
|
||||
import spark.deploy.RegisterJob
|
||||
import spark.deploy.master.Master
|
||||
import akka.remote.RemoteClientDisconnected
|
||||
import akka.actor.Terminated
|
||||
import akka.dispatch.Await
|
||||
|
@ -24,26 +25,18 @@ private[spark] class Client(
|
|||
listener: ClientListener)
|
||||
extends Logging {
|
||||
|
||||
val MASTER_REGEX = "spark://([^:]+):([0-9]+)".r
|
||||
|
||||
var actor: ActorRef = null
|
||||
var jobId: String = null
|
||||
|
||||
if (MASTER_REGEX.unapplySeq(masterUrl) == None) {
|
||||
throw new SparkException("Invalid master URL: " + masterUrl)
|
||||
}
|
||||
|
||||
class ClientActor extends Actor with Logging {
|
||||
var master: ActorRef = null
|
||||
var masterAddress: Address = null
|
||||
var alreadyDisconnected = false // To avoid calling listener.disconnected() multiple times
|
||||
|
||||
override def preStart() {
|
||||
val Seq(masterHost, masterPort) = MASTER_REGEX.unapplySeq(masterUrl).get
|
||||
logInfo("Connecting to master spark://" + masterHost + ":" + masterPort)
|
||||
val akkaUrl = "akka://spark@%s:%s/user/Master".format(masterHost, masterPort)
|
||||
logInfo("Connecting to master " + masterUrl)
|
||||
try {
|
||||
master = context.actorFor(akkaUrl)
|
||||
master = context.actorFor(Master.toAkkaUrl(masterUrl))
|
||||
masterAddress = master.path.address
|
||||
master ! RegisterJob(jobDescription)
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
|
|
|
@ -12,7 +12,7 @@ private[spark] trait ClientListener {
|
|||
|
||||
def disconnected(): Unit
|
||||
|
||||
def executorAdded(id: String, workerId: String, host: String, cores: Int, memory: Int): Unit
|
||||
def executorAdded(fullId: String, workerId: String, host: String, cores: Int, memory: Int): Unit
|
||||
|
||||
def executorRemoved(id: String, message: String, exitStatus: Option[Int]): Unit
|
||||
def executorRemoved(fullId: String, message: String, exitStatus: Option[Int]): Unit
|
||||
}
|
||||
|
|
|
@ -10,7 +10,7 @@ private[spark] class JobInfo(
|
|||
val id: String,
|
||||
val desc: JobDescription,
|
||||
val submitDate: Date,
|
||||
val actor: ActorRef)
|
||||
val driver: ActorRef)
|
||||
{
|
||||
var state = JobState.WAITING
|
||||
var executors = new mutable.HashMap[Int, ExecutorInfo]
|
||||
|
|
|
@ -88,7 +88,7 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor
|
|||
execOption match {
|
||||
case Some(exec) => {
|
||||
exec.state = state
|
||||
exec.job.actor ! ExecutorUpdated(execId, state, message, exitStatus)
|
||||
exec.job.driver ! ExecutorUpdated(execId, state, message, exitStatus)
|
||||
if (ExecutorState.isFinished(state)) {
|
||||
val jobInfo = idToJob(jobId)
|
||||
// Remove this executor from the worker and job
|
||||
|
@ -100,11 +100,9 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor
|
|||
if (jobInfo.incrementRetryCount < JobState.MAX_NUM_RETRY) {
|
||||
schedule()
|
||||
} else {
|
||||
val e = new SparkException("Job %s with ID %s failed %d times.".format(
|
||||
logError("Job %s with ID %s failed %d times, removing it".format(
|
||||
jobInfo.desc.name, jobInfo.id, jobInfo.retryCount))
|
||||
logError(e.getMessage, e)
|
||||
throw e
|
||||
//System.exit(1)
|
||||
removeJob(jobInfo)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -199,7 +197,7 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor
|
|||
logInfo("Launching executor " + exec.fullId + " on worker " + worker.id)
|
||||
worker.addExecutor(exec)
|
||||
worker.actor ! LaunchExecutor(exec.job.id, exec.id, exec.job.desc, exec.cores, exec.memory, sparkHome)
|
||||
exec.job.actor ! ExecutorAdded(exec.id, worker.id, worker.host, exec.cores, exec.memory)
|
||||
exec.job.driver ! ExecutorAdded(exec.id, worker.id, worker.host, exec.cores, exec.memory)
|
||||
}
|
||||
|
||||
def addWorker(id: String, host: String, port: Int, cores: Int, memory: Int, webUiPort: Int,
|
||||
|
@ -221,19 +219,19 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor
|
|||
actorToWorker -= worker.actor
|
||||
addressToWorker -= worker.actor.path.address
|
||||
for (exec <- worker.executors.values) {
|
||||
exec.job.actor ! ExecutorStateChanged(exec.job.id, exec.id, ExecutorState.LOST, None, None)
|
||||
exec.job.driver ! ExecutorStateChanged(exec.job.id, exec.id, ExecutorState.LOST, None, None)
|
||||
exec.job.executors -= exec.id
|
||||
}
|
||||
}
|
||||
|
||||
def addJob(desc: JobDescription, actor: ActorRef): JobInfo = {
|
||||
def addJob(desc: JobDescription, driver: ActorRef): JobInfo = {
|
||||
val now = System.currentTimeMillis()
|
||||
val date = new Date(now)
|
||||
val job = new JobInfo(now, newJobId(date), desc, date, actor)
|
||||
val job = new JobInfo(now, newJobId(date), desc, date, driver)
|
||||
jobs += job
|
||||
idToJob(job.id) = job
|
||||
actorToJob(sender) = job
|
||||
addressToJob(sender.path.address) = job
|
||||
actorToJob(driver) = job
|
||||
addressToJob(driver.path.address) = job
|
||||
return job
|
||||
}
|
||||
|
||||
|
@ -242,8 +240,8 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor
|
|||
logInfo("Removing job " + job.id)
|
||||
jobs -= job
|
||||
idToJob -= job.id
|
||||
actorToJob -= job.actor
|
||||
addressToWorker -= job.actor.path.address
|
||||
actorToJob -= job.driver
|
||||
addressToWorker -= job.driver.path.address
|
||||
completedJobs += job // Remember it in our history
|
||||
waitingJobs -= job
|
||||
for (exec <- job.executors.values) {
|
||||
|
@ -264,11 +262,29 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor
|
|||
}
|
||||
|
||||
private[spark] object Master {
|
||||
private val systemName = "sparkMaster"
|
||||
private val actorName = "Master"
|
||||
private val sparkUrlRegex = "spark://([^:]+):([0-9]+)".r
|
||||
|
||||
def main(argStrings: Array[String]) {
|
||||
val args = new MasterArguments(argStrings)
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", args.ip, args.port)
|
||||
val actor = actorSystem.actorOf(
|
||||
Props(new Master(args.ip, boundPort, args.webUiPort)), name = "Master")
|
||||
val (actorSystem, _) = startSystemAndActor(args.ip, args.port, args.webUiPort)
|
||||
actorSystem.awaitTermination()
|
||||
}
|
||||
|
||||
/** Returns an `akka://...` URL for the Master actor given a sparkUrl `spark://host:ip`. */
|
||||
def toAkkaUrl(sparkUrl: String): String = {
|
||||
sparkUrl match {
|
||||
case sparkUrlRegex(host, port) =>
|
||||
"akka://%s@%s:%s/user/%s".format(systemName, host, port, actorName)
|
||||
case _ =>
|
||||
throw new SparkException("Invalid master URL: " + sparkUrl)
|
||||
}
|
||||
}
|
||||
|
||||
def startSystemAndActor(host: String, port: Int, webUiPort: Int): (ActorSystem, Int) = {
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port)
|
||||
val actor = actorSystem.actorOf(Props(new Master(host, boundPort, webUiPort)), name = actorName)
|
||||
(actorSystem, boundPort)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,13 +45,9 @@ class MasterWebUI(val actorSystem: ActorSystem, master: ActorRef) extends Direct
|
|||
case (jobId, Some(js)) if (js.equalsIgnoreCase("json")) =>
|
||||
val future = master ? RequestMasterState
|
||||
val jobInfo = for (masterState <- future.mapTo[MasterState]) yield {
|
||||
masterState.activeJobs.find(_.id == jobId) match {
|
||||
case Some(job) => job
|
||||
case _ => masterState.completedJobs.find(_.id == jobId) match {
|
||||
case Some(job) => job
|
||||
case _ => null
|
||||
}
|
||||
}
|
||||
masterState.activeJobs.find(_.id == jobId).getOrElse({
|
||||
masterState.completedJobs.find(_.id == jobId).getOrElse(null)
|
||||
})
|
||||
}
|
||||
respondWithMediaType(MediaTypes.`application/json`) { ctx =>
|
||||
ctx.complete(jobInfo.mapTo[JobInfo])
|
||||
|
@ -61,14 +57,10 @@ class MasterWebUI(val actorSystem: ActorSystem, master: ActorRef) extends Direct
|
|||
val future = master ? RequestMasterState
|
||||
future.map { state =>
|
||||
val masterState = state.asInstanceOf[MasterState]
|
||||
|
||||
masterState.activeJobs.find(_.id == jobId) match {
|
||||
case Some(job) => spark.deploy.master.html.job_details.render(job)
|
||||
case _ => masterState.completedJobs.find(_.id == jobId) match {
|
||||
case Some(job) => spark.deploy.master.html.job_details.render(job)
|
||||
case _ => null
|
||||
}
|
||||
}
|
||||
val job = masterState.activeJobs.find(_.id == jobId).getOrElse({
|
||||
masterState.completedJobs.find(_.id == jobId).getOrElse(null)
|
||||
})
|
||||
spark.deploy.master.html.job_details.render(job)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -113,8 +113,7 @@ private[spark] class ExecutorRunner(
|
|||
for ((key, value) <- jobDesc.command.environment) {
|
||||
env.put(key, value)
|
||||
}
|
||||
env.put("SPARK_CORES", cores.toString)
|
||||
env.put("SPARK_MEMORY", memory.toString)
|
||||
env.put("SPARK_MEM", memory.toString + "m")
|
||||
// In case we are running this from within the Spark Shell, avoid creating a "scala"
|
||||
// parent process for the executor command
|
||||
env.put("SPARK_LAUNCH_WITH_SCALA", "0")
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package spark.deploy.worker
|
||||
|
||||
import scala.collection.mutable.{ArrayBuffer, HashMap}
|
||||
import akka.actor.{ActorRef, Props, Actor}
|
||||
import akka.actor.{ActorRef, Props, Actor, ActorSystem}
|
||||
import spark.{Logging, Utils}
|
||||
import spark.util.AkkaUtils
|
||||
import spark.deploy._
|
||||
|
@ -13,6 +13,7 @@ import akka.remote.RemoteClientDisconnected
|
|||
import spark.deploy.RegisterWorker
|
||||
import spark.deploy.LaunchExecutor
|
||||
import spark.deploy.RegisterWorkerFailed
|
||||
import spark.deploy.master.Master
|
||||
import akka.actor.Terminated
|
||||
import java.io.File
|
||||
|
||||
|
@ -27,7 +28,6 @@ private[spark] class Worker(
|
|||
extends Actor with Logging {
|
||||
|
||||
val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For worker and executor IDs
|
||||
val MASTER_REGEX = "spark://([^:]+):([0-9]+)".r
|
||||
|
||||
var master: ActorRef = null
|
||||
var masterWebUiUrl : String = ""
|
||||
|
@ -48,11 +48,7 @@ private[spark] class Worker(
|
|||
def memoryFree: Int = memory - memoryUsed
|
||||
|
||||
def createWorkDir() {
|
||||
workDir = if (workDirPath != null) {
|
||||
new File(workDirPath)
|
||||
} else {
|
||||
new File(sparkHome, "work")
|
||||
}
|
||||
workDir = Option(workDirPath).map(new File(_)).getOrElse(new File(sparkHome, "work"))
|
||||
try {
|
||||
if (!workDir.exists() && !workDir.mkdirs()) {
|
||||
logError("Failed to create work directory " + workDir)
|
||||
|
@ -68,8 +64,7 @@ private[spark] class Worker(
|
|||
override def preStart() {
|
||||
logInfo("Starting Spark worker %s:%d with %d cores, %s RAM".format(
|
||||
ip, port, cores, Utils.memoryMegabytesToString(memory)))
|
||||
val envVar = System.getenv("SPARK_HOME")
|
||||
sparkHome = new File(if (envVar == null) "." else envVar)
|
||||
sparkHome = new File(Option(System.getenv("SPARK_HOME")).getOrElse("."))
|
||||
logInfo("Spark home: " + sparkHome)
|
||||
createWorkDir()
|
||||
connectToMaster()
|
||||
|
@ -77,24 +72,15 @@ private[spark] class Worker(
|
|||
}
|
||||
|
||||
def connectToMaster() {
|
||||
masterUrl match {
|
||||
case MASTER_REGEX(masterHost, masterPort) => {
|
||||
logInfo("Connecting to master spark://" + masterHost + ":" + masterPort)
|
||||
val akkaUrl = "akka://spark@%s:%s/user/Master".format(masterHost, masterPort)
|
||||
try {
|
||||
master = context.actorFor(akkaUrl)
|
||||
master ! RegisterWorker(workerId, ip, port, cores, memory, webUiPort, publicAddress)
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
context.watch(master) // Doesn't work with remote actors, but useful for testing
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
logError("Failed to connect to master", e)
|
||||
System.exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
case _ =>
|
||||
logError("Invalid master URL: " + masterUrl)
|
||||
logInfo("Connecting to master " + masterUrl)
|
||||
try {
|
||||
master = context.actorFor(Master.toAkkaUrl(masterUrl))
|
||||
master ! RegisterWorker(workerId, ip, port, cores, memory, webUiPort, publicAddress)
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
context.watch(master) // Doesn't work with remote actors, but useful for testing
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
logError("Failed to connect to master", e)
|
||||
System.exit(1)
|
||||
}
|
||||
}
|
||||
|
@ -134,7 +120,9 @@ private[spark] class Worker(
|
|||
val fullId = jobId + "/" + execId
|
||||
if (ExecutorState.isFinished(state)) {
|
||||
val executor = executors(fullId)
|
||||
logInfo("Executor " + fullId + " finished with state " + state)
|
||||
logInfo("Executor " + fullId + " finished with state " + state +
|
||||
message.map(" message " + _).getOrElse("") +
|
||||
exitStatus.map(" exitStatus " + _).getOrElse(""))
|
||||
finishedExecutors(fullId) = executor
|
||||
executors -= fullId
|
||||
coresUsed -= executor.cores
|
||||
|
@ -143,9 +131,13 @@ private[spark] class Worker(
|
|||
|
||||
case KillExecutor(jobId, execId) =>
|
||||
val fullId = jobId + "/" + execId
|
||||
val executor = executors(fullId)
|
||||
logInfo("Asked to kill executor " + fullId)
|
||||
executor.kill()
|
||||
executors.get(fullId) match {
|
||||
case Some(executor) =>
|
||||
logInfo("Asked to kill executor " + fullId)
|
||||
executor.kill()
|
||||
case None =>
|
||||
logInfo("Asked to kill unknown executor " + fullId)
|
||||
}
|
||||
|
||||
case Terminated(_) | RemoteClientDisconnected(_, _) | RemoteClientShutdown(_, _) =>
|
||||
masterDisconnected()
|
||||
|
@ -177,11 +169,19 @@ private[spark] class Worker(
|
|||
private[spark] object Worker {
|
||||
def main(argStrings: Array[String]) {
|
||||
val args = new WorkerArguments(argStrings)
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", args.ip, args.port)
|
||||
val actor = actorSystem.actorOf(
|
||||
Props(new Worker(args.ip, boundPort, args.webUiPort, args.cores, args.memory,
|
||||
args.master, args.workDir)),
|
||||
name = "Worker")
|
||||
val (actorSystem, _) = startSystemAndActor(args.ip, args.port, args.webUiPort, args.cores,
|
||||
args.memory, args.master, args.workDir)
|
||||
actorSystem.awaitTermination()
|
||||
}
|
||||
|
||||
def startSystemAndActor(host: String, port: Int, webUiPort: Int, cores: Int, memory: Int,
|
||||
masterUrl: String, workDir: String, workerNumber: Option[Int] = None): (ActorSystem, Int) = {
|
||||
// The LocalSparkCluster runs multiple local sparkWorkerX actor systems
|
||||
val systemName = "sparkWorker" + workerNumber.map(_.toString).getOrElse("")
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port)
|
||||
val actor = actorSystem.actorOf(Props(new Worker(host, boundPort, webUiPort, cores, memory,
|
||||
masterUrl, workDir)), name = "Worker")
|
||||
(actorSystem, boundPort)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -32,7 +32,11 @@ private[spark] class MesosExecutorBackend(executor: Executor)
|
|||
logInfo("Registered with Mesos as executor ID " + executorInfo.getExecutorId.getValue)
|
||||
this.driver = driver
|
||||
val properties = Utils.deserialize[Array[(String, String)]](executorInfo.getData.toByteArray)
|
||||
executor.initialize(executorInfo.getExecutorId.getValue, slaveInfo.getHostname, properties)
|
||||
executor.initialize(
|
||||
executorInfo.getExecutorId.getValue,
|
||||
slaveInfo.getHostname,
|
||||
properties
|
||||
)
|
||||
}
|
||||
|
||||
override def launchTask(d: ExecutorDriver, taskInfo: TaskInfo) {
|
||||
|
|
|
@ -16,7 +16,7 @@ import spark.scheduler.cluster.RegisterExecutor
|
|||
|
||||
private[spark] class StandaloneExecutorBackend(
|
||||
executor: Executor,
|
||||
masterUrl: String,
|
||||
driverUrl: String,
|
||||
executorId: String,
|
||||
hostname: String,
|
||||
cores: Int)
|
||||
|
@ -24,25 +24,25 @@ private[spark] class StandaloneExecutorBackend(
|
|||
with ExecutorBackend
|
||||
with Logging {
|
||||
|
||||
var master: ActorRef = null
|
||||
var driver: ActorRef = null
|
||||
|
||||
override def preStart() {
|
||||
try {
|
||||
logInfo("Connecting to master: " + masterUrl)
|
||||
master = context.actorFor(masterUrl)
|
||||
master ! RegisterExecutor(executorId, hostname, cores)
|
||||
logInfo("Connecting to driver: " + driverUrl)
|
||||
driver = context.actorFor(driverUrl)
|
||||
driver ! RegisterExecutor(executorId, hostname, cores)
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
context.watch(master) // Doesn't work with remote actors, but useful for testing
|
||||
context.watch(driver) // Doesn't work with remote actors, but useful for testing
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
logError("Failed to connect to master", e)
|
||||
logError("Failed to connect to driver", e)
|
||||
System.exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
override def receive = {
|
||||
case RegisteredExecutor(sparkProperties) =>
|
||||
logInfo("Successfully registered with master")
|
||||
logInfo("Successfully registered with driver")
|
||||
executor.initialize(executorId, hostname, sparkProperties)
|
||||
|
||||
case RegisterExecutorFailed(message) =>
|
||||
|
@ -55,24 +55,24 @@ private[spark] class StandaloneExecutorBackend(
|
|||
}
|
||||
|
||||
override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {
|
||||
master ! StatusUpdate(executorId, taskId, state, data)
|
||||
driver ! StatusUpdate(executorId, taskId, state, data)
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] object StandaloneExecutorBackend {
|
||||
def run(masterUrl: String, executorId: String, hostname: String, cores: Int) {
|
||||
def run(driverUrl: String, executorId: String, hostname: String, cores: Int) {
|
||||
// Create a new ActorSystem to run the backend, because we can't create a SparkEnv / Executor
|
||||
// before getting started with all our system properties, etc
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("sparkExecutor", hostname, 0)
|
||||
val actor = actorSystem.actorOf(
|
||||
Props(new StandaloneExecutorBackend(new Executor, masterUrl, executorId, hostname, cores)),
|
||||
Props(new StandaloneExecutorBackend(new Executor, driverUrl, executorId, hostname, cores)),
|
||||
name = "Executor")
|
||||
actorSystem.awaitTermination()
|
||||
}
|
||||
|
||||
def main(args: Array[String]) {
|
||||
if (args.length != 4) {
|
||||
System.err.println("Usage: StandaloneExecutorBackend <master> <executorId> <hostname> <cores>")
|
||||
System.err.println("Usage: StandaloneExecutorBackend <driverUrl> <executorId> <hostname> <cores>")
|
||||
System.exit(1)
|
||||
}
|
||||
run(args(0), args(1), args(2), args(3).toInt)
|
||||
|
|
|
@ -12,7 +12,14 @@ import java.net._
|
|||
|
||||
|
||||
private[spark]
|
||||
abstract class Connection(val channel: SocketChannel, val selector: Selector) extends Logging {
|
||||
abstract class Connection(val channel: SocketChannel, val selector: Selector,
|
||||
val remoteConnectionManagerId: ConnectionManagerId) extends Logging {
|
||||
def this(channel_ : SocketChannel, selector_ : Selector) = {
|
||||
this(channel_, selector_,
|
||||
ConnectionManagerId.fromSocketAddress(
|
||||
channel_.socket.getRemoteSocketAddress().asInstanceOf[InetSocketAddress]
|
||||
))
|
||||
}
|
||||
|
||||
channel.configureBlocking(false)
|
||||
channel.socket.setTcpNoDelay(true)
|
||||
|
@ -25,7 +32,6 @@ abstract class Connection(val channel: SocketChannel, val selector: Selector) ex
|
|||
var onKeyInterestChangeCallback: (Connection, Int) => Unit = null
|
||||
|
||||
val remoteAddress = getRemoteAddress()
|
||||
val remoteConnectionManagerId = ConnectionManagerId.fromSocketAddress(remoteAddress)
|
||||
|
||||
def key() = channel.keyFor(selector)
|
||||
|
||||
|
@ -103,8 +109,9 @@ abstract class Connection(val channel: SocketChannel, val selector: Selector) ex
|
|||
}
|
||||
|
||||
|
||||
private[spark] class SendingConnection(val address: InetSocketAddress, selector_ : Selector)
|
||||
extends Connection(SocketChannel.open, selector_) {
|
||||
private[spark] class SendingConnection(val address: InetSocketAddress, selector_ : Selector,
|
||||
remoteId_ : ConnectionManagerId)
|
||||
extends Connection(SocketChannel.open, selector_, remoteId_) {
|
||||
|
||||
class Outbox(fair: Int = 0) {
|
||||
val messages = new Queue[Message]()
|
||||
|
|
|
@ -299,7 +299,8 @@ private[spark] class ConnectionManager(port: Int) extends Logging {
|
|||
private def sendMessage(connectionManagerId: ConnectionManagerId, message: Message) {
|
||||
def startNewConnection(): SendingConnection = {
|
||||
val inetSocketAddress = new InetSocketAddress(connectionManagerId.host, connectionManagerId.port)
|
||||
val newConnection = connectionRequests.getOrElseUpdate(connectionManagerId, new SendingConnection(inetSocketAddress, selector))
|
||||
val newConnection = connectionRequests.getOrElseUpdate(connectionManagerId,
|
||||
new SendingConnection(inetSocketAddress, selector, connectionManagerId))
|
||||
newConnection
|
||||
}
|
||||
val lookupKey = ConnectionManagerId.fromSocketAddress(connectionManagerId.toSocketAddress)
|
||||
|
|
|
@ -32,7 +32,7 @@ private[spark] class ApproximateActionListener[T, U, R](
|
|||
if (finishedTasks == totalTasks) {
|
||||
// If we had already returned a PartialResult, set its final value
|
||||
resultObject.foreach(r => r.setFinalValue(evaluator.currentResult()))
|
||||
// Notify any waiting thread that may have called getResult
|
||||
// Notify any waiting thread that may have called awaitResult
|
||||
this.notifyAll()
|
||||
}
|
||||
}
|
||||
|
@ -49,7 +49,7 @@ private[spark] class ApproximateActionListener[T, U, R](
|
|||
* Waits for up to timeout milliseconds since the listener was created and then returns a
|
||||
* PartialResult with the result so far. This may be complete if the whole job is done.
|
||||
*/
|
||||
def getResult(): PartialResult[R] = synchronized {
|
||||
def awaitResult(): PartialResult[R] = synchronized {
|
||||
val finishTime = startTime + timeout
|
||||
while (true) {
|
||||
val time = System.currentTimeMillis()
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package spark.rdd
|
||||
|
||||
import java.io.{ObjectOutputStream, IOException}
|
||||
import spark.{OneToOneDependency, NarrowDependency, RDD, SparkContext, Split, TaskContext}
|
||||
import spark._
|
||||
|
||||
|
||||
private[spark]
|
||||
|
@ -35,7 +35,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest](
|
|||
|
||||
val numSplitsInRdd2 = rdd2.splits.size
|
||||
|
||||
@transient var splits_ = {
|
||||
override def getSplits: Array[Split] = {
|
||||
// create the cross product split
|
||||
val array = new Array[Split](rdd1.splits.size * rdd2.splits.size)
|
||||
for (s1 <- rdd1.splits; s2 <- rdd2.splits) {
|
||||
|
@ -45,8 +45,6 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest](
|
|||
array
|
||||
}
|
||||
|
||||
override def getSplits = splits_
|
||||
|
||||
override def getPreferredLocations(split: Split) = {
|
||||
val currSplit = split.asInstanceOf[CartesianSplit]
|
||||
rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)
|
||||
|
@ -58,7 +56,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest](
|
|||
y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
|
||||
}
|
||||
|
||||
var deps_ = List(
|
||||
override def getDependencies: Seq[Dependency[_]] = List(
|
||||
new NarrowDependency(rdd1) {
|
||||
def getParents(id: Int): Seq[Int] = List(id / numSplitsInRdd2)
|
||||
},
|
||||
|
@ -67,11 +65,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest](
|
|||
}
|
||||
)
|
||||
|
||||
override def getDependencies = deps_
|
||||
|
||||
override def clearDependencies() {
|
||||
deps_ = Nil
|
||||
splits_ = null
|
||||
rdd1 = null
|
||||
rdd2 = null
|
||||
}
|
||||
|
|
|
@ -9,23 +9,26 @@ import org.apache.hadoop.fs.Path
|
|||
import java.io.{File, IOException, EOFException}
|
||||
import java.text.NumberFormat
|
||||
|
||||
private[spark] class CheckpointRDDSplit(idx: Int, val splitFile: String) extends Split {
|
||||
override val index: Int = idx
|
||||
}
|
||||
private[spark] class CheckpointRDDSplit(val index: Int) extends Split {}
|
||||
|
||||
/**
|
||||
* This RDD represents a RDD checkpoint file (similar to HadoopRDD).
|
||||
*/
|
||||
private[spark]
|
||||
class CheckpointRDD[T: ClassManifest](sc: SparkContext, checkpointPath: String)
|
||||
class CheckpointRDD[T: ClassManifest](sc: SparkContext, val checkpointPath: String)
|
||||
extends RDD[T](sc, Nil) {
|
||||
|
||||
@transient val path = new Path(checkpointPath)
|
||||
@transient val fs = path.getFileSystem(new Configuration())
|
||||
@transient val fs = new Path(checkpointPath).getFileSystem(sc.hadoopConfiguration)
|
||||
|
||||
@transient val splits_ : Array[Split] = {
|
||||
val splitFiles = fs.listStatus(path).map(_.getPath.toString).filter(_.contains("part-")).sorted
|
||||
splitFiles.zipWithIndex.map(x => new CheckpointRDDSplit(x._2, x._1)).toArray
|
||||
val dirContents = fs.listStatus(new Path(checkpointPath))
|
||||
val splitFiles = dirContents.map(_.getPath.toString).filter(_.contains("part-")).sorted
|
||||
val numSplits = splitFiles.size
|
||||
if (!splitFiles(0).endsWith(CheckpointRDD.splitIdToFile(0)) ||
|
||||
!splitFiles(numSplits-1).endsWith(CheckpointRDD.splitIdToFile(numSplits-1))) {
|
||||
throw new SparkException("Invalid checkpoint directory: " + checkpointPath)
|
||||
}
|
||||
Array.tabulate(numSplits)(i => new CheckpointRDDSplit(i))
|
||||
}
|
||||
|
||||
checkpointData = Some(new RDDCheckpointData[T](this))
|
||||
|
@ -34,36 +37,34 @@ class CheckpointRDD[T: ClassManifest](sc: SparkContext, checkpointPath: String)
|
|||
override def getSplits = splits_
|
||||
|
||||
override def getPreferredLocations(split: Split): Seq[String] = {
|
||||
val status = fs.getFileStatus(path)
|
||||
val status = fs.getFileStatus(new Path(checkpointPath))
|
||||
val locations = fs.getFileBlockLocations(status, 0, status.getLen)
|
||||
locations.firstOption.toList.flatMap(_.getHosts).filter(_ != "localhost")
|
||||
locations.headOption.toList.flatMap(_.getHosts).filter(_ != "localhost")
|
||||
}
|
||||
|
||||
override def compute(split: Split, context: TaskContext): Iterator[T] = {
|
||||
CheckpointRDD.readFromFile(split.asInstanceOf[CheckpointRDDSplit].splitFile, context)
|
||||
val file = new Path(checkpointPath, CheckpointRDD.splitIdToFile(split.index))
|
||||
CheckpointRDD.readFromFile(file, context)
|
||||
}
|
||||
|
||||
override def checkpoint() {
|
||||
// Do nothing. Hadoop RDD should not be checkpointed.
|
||||
// Do nothing. CheckpointRDD should not be checkpointed.
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] object CheckpointRDD extends Logging {
|
||||
|
||||
def splitIdToFileName(splitId: Int): String = {
|
||||
val numfmt = NumberFormat.getInstance()
|
||||
numfmt.setMinimumIntegerDigits(5)
|
||||
numfmt.setGroupingUsed(false)
|
||||
"part-" + numfmt.format(splitId)
|
||||
def splitIdToFile(splitId: Int): String = {
|
||||
"part-%05d".format(splitId)
|
||||
}
|
||||
|
||||
def writeToFile[T](path: String, blockSize: Int = -1)(context: TaskContext, iterator: Iterator[T]) {
|
||||
def writeToFile[T](path: String, blockSize: Int = -1)(ctx: TaskContext, iterator: Iterator[T]) {
|
||||
val outputDir = new Path(path)
|
||||
val fs = outputDir.getFileSystem(new Configuration())
|
||||
|
||||
val finalOutputName = splitIdToFileName(context.splitId)
|
||||
val finalOutputName = splitIdToFile(ctx.splitId)
|
||||
val finalOutputPath = new Path(outputDir, finalOutputName)
|
||||
val tempOutputPath = new Path(outputDir, "." + finalOutputName + "-attempt-" + context.attemptId)
|
||||
val tempOutputPath = new Path(outputDir, "." + finalOutputName + "-attempt-" + ctx.attemptId)
|
||||
|
||||
if (fs.exists(tempOutputPath)) {
|
||||
throw new IOException("Checkpoint failed: temporary path " +
|
||||
|
@ -83,22 +84,22 @@ private[spark] object CheckpointRDD extends Logging {
|
|||
serializeStream.close()
|
||||
|
||||
if (!fs.rename(tempOutputPath, finalOutputPath)) {
|
||||
if (!fs.delete(finalOutputPath, true)) {
|
||||
throw new IOException("Checkpoint failed: failed to delete earlier output of task "
|
||||
+ context.attemptId)
|
||||
}
|
||||
if (!fs.rename(tempOutputPath, finalOutputPath)) {
|
||||
if (!fs.exists(finalOutputPath)) {
|
||||
fs.delete(tempOutputPath, false)
|
||||
throw new IOException("Checkpoint failed: failed to save output of task: "
|
||||
+ context.attemptId)
|
||||
+ ctx.attemptId + " and final output path does not exist")
|
||||
} else {
|
||||
// Some other copy of this task must've finished before us and renamed it
|
||||
logInfo("Final output path " + finalOutputPath + " already exists; not overwriting it")
|
||||
fs.delete(tempOutputPath, false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def readFromFile[T](path: String, context: TaskContext): Iterator[T] = {
|
||||
val inputPath = new Path(path)
|
||||
val fs = inputPath.getFileSystem(new Configuration())
|
||||
def readFromFile[T](path: Path, context: TaskContext): Iterator[T] = {
|
||||
val fs = path.getFileSystem(new Configuration())
|
||||
val bufferSize = System.getProperty("spark.buffer.size", "65536").toInt
|
||||
val fileInputStream = fs.open(inputPath, bufferSize)
|
||||
val fileInputStream = fs.open(path, bufferSize)
|
||||
val serializer = SparkEnv.get.serializer.newInstance()
|
||||
val deserializeStream = serializer.deserializeStream(fileInputStream)
|
||||
|
||||
|
|
|
@ -27,11 +27,11 @@ private[spark] case class CoalescedRDDSplit(
|
|||
* or to avoid having a large number of small tasks when processing a directory with many files.
|
||||
*/
|
||||
class CoalescedRDD[T: ClassManifest](
|
||||
var prev: RDD[T],
|
||||
@transient var prev: RDD[T],
|
||||
maxPartitions: Int)
|
||||
extends RDD[T](prev.context, Nil) { // Nil, so the dependencies_ var does not refer to parent RDDs
|
||||
extends RDD[T](prev.context, Nil) { // Nil since we implement getDependencies
|
||||
|
||||
@transient var splits_ : Array[Split] = {
|
||||
override def getSplits: Array[Split] = {
|
||||
val prevSplits = prev.splits
|
||||
if (prevSplits.length < maxPartitions) {
|
||||
prevSplits.map(_.index).map{idx => new CoalescedRDDSplit(idx, prev, Array(idx)) }
|
||||
|
@ -44,26 +44,20 @@ class CoalescedRDD[T: ClassManifest](
|
|||
}
|
||||
}
|
||||
|
||||
override def getSplits = splits_
|
||||
|
||||
override def compute(split: Split, context: TaskContext): Iterator[T] = {
|
||||
split.asInstanceOf[CoalescedRDDSplit].parents.iterator.flatMap { parentSplit =>
|
||||
firstParent[T].iterator(parentSplit, context)
|
||||
}
|
||||
}
|
||||
|
||||
var deps_ : List[Dependency[_]] = List(
|
||||
override def getDependencies: Seq[Dependency[_]] = List(
|
||||
new NarrowDependency(prev) {
|
||||
def getParents(id: Int): Seq[Int] =
|
||||
splits(id).asInstanceOf[CoalescedRDDSplit].parentsIndices
|
||||
}
|
||||
)
|
||||
|
||||
override def getDependencies() = deps_
|
||||
|
||||
override def clearDependencies() {
|
||||
deps_ = Nil
|
||||
splits_ = null
|
||||
prev = null
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,13 +3,11 @@ package spark.rdd
|
|||
import spark.{RDD, Split, TaskContext}
|
||||
|
||||
private[spark]
|
||||
class MappedRDD[U: ClassManifest, T: ClassManifest](
|
||||
prev: RDD[T],
|
||||
f: T => U)
|
||||
class MappedRDD[U: ClassManifest, T: ClassManifest](prev: RDD[T], f: T => U)
|
||||
extends RDD[U](prev) {
|
||||
|
||||
override def getSplits = firstParent[T].splits
|
||||
|
||||
override def compute(split: Split, context: TaskContext) =
|
||||
firstParent[T].iterator(split, context).map(f)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,27 @@
|
|||
package spark.rdd
|
||||
|
||||
import spark.{PruneDependency, RDD, SparkEnv, Split, TaskContext}
|
||||
import spark.{NarrowDependency, RDD, SparkEnv, Split, TaskContext}
|
||||
|
||||
|
||||
class PartitionPruningRDDSplit(idx: Int, val parentSplit: Split) extends Split {
|
||||
override val index = idx
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Represents a dependency between the PartitionPruningRDD and its parent. In this
|
||||
* case, the child RDD contains a subset of partitions of the parents'.
|
||||
*/
|
||||
class PruneDependency[T](rdd: RDD[T], @transient partitionFilterFunc: Int => Boolean)
|
||||
extends NarrowDependency[T](rdd) {
|
||||
|
||||
@transient
|
||||
val partitions: Array[Split] = rdd.splits.filter(s => partitionFilterFunc(s.index))
|
||||
.zipWithIndex.map { case(split, idx) => new PartitionPruningRDDSplit(idx, split) : Split }
|
||||
|
||||
override def getParents(partitionId: Int) = List(partitions(partitionId).index)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A RDD used to prune RDD partitions/splits so we can avoid launching tasks on
|
||||
|
@ -13,17 +34,9 @@ class PartitionPruningRDD[T: ClassManifest](
|
|||
@transient partitionFilterFunc: Int => Boolean)
|
||||
extends RDD[T](prev.context, List(new PruneDependency(prev, partitionFilterFunc))) {
|
||||
|
||||
@transient
|
||||
var partitions_ : Array[Split] = dependencies_.head.asInstanceOf[PruneDependency[T]].partitions
|
||||
override def compute(split: Split, context: TaskContext) = firstParent[T].iterator(
|
||||
split.asInstanceOf[PartitionPruningRDDSplit].parentSplit, context)
|
||||
|
||||
override def compute(split: Split, context: TaskContext) = firstParent[T].iterator(split, context)
|
||||
|
||||
override protected def getSplits = partitions_
|
||||
|
||||
override val partitioner = firstParent[T].partitioner
|
||||
|
||||
override def clearDependencies() {
|
||||
super.clearDependencies()
|
||||
partitions_ = null
|
||||
}
|
||||
override protected def getSplits =
|
||||
getDependencies.head.asInstanceOf[PruneDependency[T]].partitions
|
||||
}
|
||||
|
|
|
@ -22,16 +22,10 @@ class ShuffledRDD[K, V](
|
|||
|
||||
override val partitioner = Some(part)
|
||||
|
||||
@transient var splits_ = Array.tabulate[Split](part.numPartitions)(i => new ShuffledRDDSplit(i))
|
||||
|
||||
override def getSplits = splits_
|
||||
override def getSplits = Array.tabulate[Split](part.numPartitions)(i => new ShuffledRDDSplit(i))
|
||||
|
||||
override def compute(split: Split, context: TaskContext): Iterator[(K, V)] = {
|
||||
val shuffledId = dependencies.head.asInstanceOf[ShuffleDependency[K, V]].shuffleId
|
||||
SparkEnv.get.shuffleFetcher.fetch[K, V](shuffledId, split.index)
|
||||
}
|
||||
|
||||
override def clearDependencies() {
|
||||
splits_ = null
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,9 +26,9 @@ private[spark] class UnionSplit[T: ClassManifest](idx: Int, rdd: RDD[T], splitIn
|
|||
class UnionRDD[T: ClassManifest](
|
||||
sc: SparkContext,
|
||||
@transient var rdds: Seq[RDD[T]])
|
||||
extends RDD[T](sc, Nil) { // Nil, so the dependencies_ var does not refer to parent RDDs
|
||||
extends RDD[T](sc, Nil) { // Nil since we implement getDependencies
|
||||
|
||||
@transient var splits_ : Array[Split] = {
|
||||
override def getSplits: Array[Split] = {
|
||||
val array = new Array[Split](rdds.map(_.splits.size).sum)
|
||||
var pos = 0
|
||||
for (rdd <- rdds; split <- rdd.splits) {
|
||||
|
@ -38,20 +38,16 @@ class UnionRDD[T: ClassManifest](
|
|||
array
|
||||
}
|
||||
|
||||
override def getSplits = splits_
|
||||
|
||||
@transient var deps_ = {
|
||||
override def getDependencies: Seq[Dependency[_]] = {
|
||||
val deps = new ArrayBuffer[Dependency[_]]
|
||||
var pos = 0
|
||||
for (rdd <- rdds) {
|
||||
deps += new RangeDependency(rdd, 0, pos, rdd.splits.size)
|
||||
pos += rdd.splits.size
|
||||
}
|
||||
deps.toList
|
||||
deps
|
||||
}
|
||||
|
||||
override def getDependencies = deps_
|
||||
|
||||
override def compute(s: Split, context: TaskContext): Iterator[T] =
|
||||
s.asInstanceOf[UnionSplit[T]].iterator(context)
|
||||
|
||||
|
@ -59,8 +55,6 @@ class UnionRDD[T: ClassManifest](
|
|||
s.asInstanceOf[UnionSplit[T]].preferredLocations()
|
||||
|
||||
override def clearDependencies() {
|
||||
deps_ = null
|
||||
splits_ = null
|
||||
rdds = null
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,9 +32,7 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest](
|
|||
extends RDD[(T, U)](sc, List(new OneToOneDependency(rdd1), new OneToOneDependency(rdd2)))
|
||||
with Serializable {
|
||||
|
||||
// TODO: FIX THIS.
|
||||
|
||||
@transient var splits_ : Array[Split] = {
|
||||
override def getSplits: Array[Split] = {
|
||||
if (rdd1.splits.size != rdd2.splits.size) {
|
||||
throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions")
|
||||
}
|
||||
|
@ -45,8 +43,6 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest](
|
|||
array
|
||||
}
|
||||
|
||||
override def getSplits = splits_
|
||||
|
||||
override def compute(s: Split, context: TaskContext): Iterator[(T, U)] = {
|
||||
val (split1, split2) = s.asInstanceOf[ZippedSplit[T, U]].splits
|
||||
rdd1.iterator(split1, context).zip(rdd2.iterator(split2, context))
|
||||
|
@ -58,7 +54,6 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest](
|
|||
}
|
||||
|
||||
override def clearDependencies() {
|
||||
splits_ = null
|
||||
rdd1 = null
|
||||
rdd2 = null
|
||||
}
|
||||
|
|
|
@ -24,7 +24,16 @@ import util.{MetadataCleaner, TimeStampedHashMap}
|
|||
* and to report fetch failures (the submitTasks method, and code to add CompletionEvents).
|
||||
*/
|
||||
private[spark]
|
||||
class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with Logging {
|
||||
class DAGScheduler(
|
||||
taskSched: TaskScheduler,
|
||||
mapOutputTracker: MapOutputTracker,
|
||||
blockManagerMaster: BlockManagerMaster,
|
||||
env: SparkEnv)
|
||||
extends TaskSchedulerListener with Logging {
|
||||
|
||||
def this(taskSched: TaskScheduler) {
|
||||
this(taskSched, SparkEnv.get.mapOutputTracker, SparkEnv.get.blockManager.master, SparkEnv.get)
|
||||
}
|
||||
taskSched.setListener(this)
|
||||
|
||||
// Called by TaskScheduler to report task completions or failures.
|
||||
|
@ -72,10 +81,6 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
|
||||
var cacheLocs = new HashMap[Int, Array[List[String]]]
|
||||
|
||||
val env = SparkEnv.get
|
||||
val mapOutputTracker = env.mapOutputTracker
|
||||
val blockManagerMaster = env.blockManager.master
|
||||
|
||||
// For tracking failed nodes, we use the MapOutputTracker's generation number, which is
|
||||
// sent with every task. When we detect a node failing, we note the current generation number
|
||||
// and failed executor, increment it for new tasks, and use this to ignore stray ShuffleMapTask
|
||||
|
@ -96,14 +101,16 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
val metadataCleaner = new MetadataCleaner("DAGScheduler", this.cleanup)
|
||||
|
||||
// Start a thread to run the DAGScheduler event loop
|
||||
new Thread("DAGScheduler") {
|
||||
setDaemon(true)
|
||||
override def run() {
|
||||
DAGScheduler.this.run()
|
||||
}
|
||||
}.start()
|
||||
def start() {
|
||||
new Thread("DAGScheduler") {
|
||||
setDaemon(true)
|
||||
override def run() {
|
||||
DAGScheduler.this.run()
|
||||
}
|
||||
}.start()
|
||||
}
|
||||
|
||||
def getCacheLocs(rdd: RDD[_]): Array[List[String]] = {
|
||||
private def getCacheLocs(rdd: RDD[_]): Array[List[String]] = {
|
||||
if (!cacheLocs.contains(rdd.id)) {
|
||||
val blockIds = rdd.splits.indices.map(index=> "rdd_%d_%d".format(rdd.id, index)).toArray
|
||||
cacheLocs(rdd.id) = blockManagerMaster.getLocations(blockIds).map {
|
||||
|
@ -113,7 +120,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
cacheLocs(rdd.id)
|
||||
}
|
||||
|
||||
def clearCacheLocs() {
|
||||
private def clearCacheLocs() {
|
||||
cacheLocs.clear()
|
||||
}
|
||||
|
||||
|
@ -122,7 +129,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
* The priority value passed in will be used if the stage doesn't already exist with
|
||||
* a lower priority (we assume that priorities always increase across jobs for now).
|
||||
*/
|
||||
def getShuffleMapStage(shuffleDep: ShuffleDependency[_,_], priority: Int): Stage = {
|
||||
private def getShuffleMapStage(shuffleDep: ShuffleDependency[_,_], priority: Int): Stage = {
|
||||
shuffleToMapStage.get(shuffleDep.shuffleId) match {
|
||||
case Some(stage) => stage
|
||||
case None =>
|
||||
|
@ -137,11 +144,11 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
* as a result stage for the final RDD used directly in an action. The stage will also be given
|
||||
* the provided priority.
|
||||
*/
|
||||
def newStage(rdd: RDD[_], shuffleDep: Option[ShuffleDependency[_,_]], priority: Int): Stage = {
|
||||
// Kind of ugly: need to register RDDs with the cache and map output tracker here
|
||||
// since we can't do it in the RDD constructor because # of splits is unknown
|
||||
logInfo("Registering RDD " + rdd.id + " (" + rdd.origin + ")")
|
||||
private def newStage(rdd: RDD[_], shuffleDep: Option[ShuffleDependency[_,_]], priority: Int): Stage = {
|
||||
if (shuffleDep != None) {
|
||||
// Kind of ugly: need to register RDDs with the cache and map output tracker here
|
||||
// since we can't do it in the RDD constructor because # of splits is unknown
|
||||
logInfo("Registering RDD " + rdd.id + " (" + rdd.origin + ")")
|
||||
mapOutputTracker.registerShuffle(shuffleDep.get.shuffleId, rdd.splits.size)
|
||||
}
|
||||
val id = nextStageId.getAndIncrement()
|
||||
|
@ -155,7 +162,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
* Get or create the list of parent stages for a given RDD. The stages will be assigned the
|
||||
* provided priority if they haven't already been created with a lower priority.
|
||||
*/
|
||||
def getParentStages(rdd: RDD[_], priority: Int): List[Stage] = {
|
||||
private def getParentStages(rdd: RDD[_], priority: Int): List[Stage] = {
|
||||
val parents = new HashSet[Stage]
|
||||
val visited = new HashSet[RDD[_]]
|
||||
def visit(r: RDD[_]) {
|
||||
|
@ -177,25 +184,22 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
parents.toList
|
||||
}
|
||||
|
||||
def getMissingParentStages(stage: Stage): List[Stage] = {
|
||||
private def getMissingParentStages(stage: Stage): List[Stage] = {
|
||||
val missing = new HashSet[Stage]
|
||||
val visited = new HashSet[RDD[_]]
|
||||
def visit(rdd: RDD[_]) {
|
||||
if (!visited(rdd)) {
|
||||
visited += rdd
|
||||
val locs = getCacheLocs(rdd)
|
||||
for (p <- 0 until rdd.splits.size) {
|
||||
if (locs(p) == Nil) {
|
||||
for (dep <- rdd.dependencies) {
|
||||
dep match {
|
||||
case shufDep: ShuffleDependency[_,_] =>
|
||||
val mapStage = getShuffleMapStage(shufDep, stage.priority)
|
||||
if (!mapStage.isAvailable) {
|
||||
missing += mapStage
|
||||
}
|
||||
case narrowDep: NarrowDependency[_] =>
|
||||
visit(narrowDep.rdd)
|
||||
}
|
||||
if (getCacheLocs(rdd).contains(Nil)) {
|
||||
for (dep <- rdd.dependencies) {
|
||||
dep match {
|
||||
case shufDep: ShuffleDependency[_,_] =>
|
||||
val mapStage = getShuffleMapStage(shufDep, stage.priority)
|
||||
if (!mapStage.isAvailable) {
|
||||
missing += mapStage
|
||||
}
|
||||
case narrowDep: NarrowDependency[_] =>
|
||||
visit(narrowDep.rdd)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -205,23 +209,45 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
missing.toList
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns (and does not submit) a JobSubmitted event suitable to run a given job, and a
|
||||
* JobWaiter whose getResult() method will return the result of the job when it is complete.
|
||||
*
|
||||
* The job is assumed to have at least one partition; zero partition jobs should be handled
|
||||
* without a JobSubmitted event.
|
||||
*/
|
||||
private[scheduler] def prepareJob[T, U: ClassManifest](
|
||||
finalRdd: RDD[T],
|
||||
func: (TaskContext, Iterator[T]) => U,
|
||||
partitions: Seq[Int],
|
||||
callSite: String,
|
||||
allowLocal: Boolean,
|
||||
resultHandler: (Int, U) => Unit)
|
||||
: (JobSubmitted, JobWaiter[U]) =
|
||||
{
|
||||
assert(partitions.size > 0)
|
||||
val waiter = new JobWaiter(partitions.size, resultHandler)
|
||||
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
|
||||
val toSubmit = JobSubmitted(finalRdd, func2, partitions.toArray, allowLocal, callSite, waiter)
|
||||
return (toSubmit, waiter)
|
||||
}
|
||||
|
||||
def runJob[T, U: ClassManifest](
|
||||
finalRdd: RDD[T],
|
||||
func: (TaskContext, Iterator[T]) => U,
|
||||
partitions: Seq[Int],
|
||||
callSite: String,
|
||||
allowLocal: Boolean)
|
||||
: Array[U] =
|
||||
allowLocal: Boolean,
|
||||
resultHandler: (Int, U) => Unit)
|
||||
{
|
||||
if (partitions.size == 0) {
|
||||
return new Array[U](0)
|
||||
return
|
||||
}
|
||||
val waiter = new JobWaiter(partitions.size)
|
||||
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
|
||||
eventQueue.put(JobSubmitted(finalRdd, func2, partitions.toArray, allowLocal, callSite, waiter))
|
||||
waiter.getResult() match {
|
||||
case JobSucceeded(results: Seq[_]) =>
|
||||
return results.asInstanceOf[Seq[U]].toArray
|
||||
val (toSubmit, waiter) = prepareJob(
|
||||
finalRdd, func, partitions, callSite, allowLocal, resultHandler)
|
||||
eventQueue.put(toSubmit)
|
||||
waiter.awaitResult() match {
|
||||
case JobSucceeded => {}
|
||||
case JobFailed(exception: Exception) =>
|
||||
logInfo("Failed to run " + callSite)
|
||||
throw exception
|
||||
|
@ -240,90 +266,117 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
|
||||
val partitions = (0 until rdd.splits.size).toArray
|
||||
eventQueue.put(JobSubmitted(rdd, func2, partitions, false, callSite, listener))
|
||||
return listener.getResult() // Will throw an exception if the job fails
|
||||
return listener.awaitResult() // Will throw an exception if the job fails
|
||||
}
|
||||
|
||||
/**
|
||||
* Process one event retrieved from the event queue.
|
||||
* Returns true if we should stop the event loop.
|
||||
*/
|
||||
private[scheduler] def processEvent(event: DAGSchedulerEvent): Boolean = {
|
||||
event match {
|
||||
case JobSubmitted(finalRDD, func, partitions, allowLocal, callSite, listener) =>
|
||||
val runId = nextRunId.getAndIncrement()
|
||||
val finalStage = newStage(finalRDD, None, runId)
|
||||
val job = new ActiveJob(runId, finalStage, func, partitions, callSite, listener)
|
||||
clearCacheLocs()
|
||||
logInfo("Got job " + job.runId + " (" + callSite + ") with " + partitions.length +
|
||||
" output partitions (allowLocal=" + allowLocal + ")")
|
||||
logInfo("Final stage: " + finalStage + " (" + finalStage.origin + ")")
|
||||
logInfo("Parents of final stage: " + finalStage.parents)
|
||||
logInfo("Missing parents: " + getMissingParentStages(finalStage))
|
||||
if (allowLocal && finalStage.parents.size == 0 && partitions.length == 1) {
|
||||
// Compute very short actions like first() or take() with no parent stages locally.
|
||||
runLocally(job)
|
||||
} else {
|
||||
activeJobs += job
|
||||
resultStageToJob(finalStage) = job
|
||||
submitStage(finalStage)
|
||||
}
|
||||
|
||||
case ExecutorLost(execId) =>
|
||||
handleExecutorLost(execId)
|
||||
|
||||
case completion: CompletionEvent =>
|
||||
handleTaskCompletion(completion)
|
||||
|
||||
case TaskSetFailed(taskSet, reason) =>
|
||||
abortStage(idToStage(taskSet.stageId), reason)
|
||||
|
||||
case StopDAGScheduler =>
|
||||
// Cancel any active jobs
|
||||
for (job <- activeJobs) {
|
||||
val error = new SparkException("Job cancelled because SparkContext was shut down")
|
||||
job.listener.jobFailed(error)
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
/**
|
||||
* Resubmit any failed stages. Ordinarily called after a small amount of time has passed since
|
||||
* the last fetch failure.
|
||||
*/
|
||||
private[scheduler] def resubmitFailedStages() {
|
||||
logInfo("Resubmitting failed stages")
|
||||
clearCacheLocs()
|
||||
val failed2 = failed.toArray
|
||||
failed.clear()
|
||||
for (stage <- failed2.sortBy(_.priority)) {
|
||||
submitStage(stage)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for waiting or failed stages which are now eligible for resubmission.
|
||||
* Ordinarily run on every iteration of the event loop.
|
||||
*/
|
||||
private[scheduler] def submitWaitingStages() {
|
||||
// TODO: We might want to run this less often, when we are sure that something has become
|
||||
// runnable that wasn't before.
|
||||
logTrace("Checking for newly runnable parent stages")
|
||||
logTrace("running: " + running)
|
||||
logTrace("waiting: " + waiting)
|
||||
logTrace("failed: " + failed)
|
||||
val waiting2 = waiting.toArray
|
||||
waiting.clear()
|
||||
for (stage <- waiting2.sortBy(_.priority)) {
|
||||
submitStage(stage)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The main event loop of the DAG scheduler, which waits for new-job / task-finished / failure
|
||||
* events and responds by launching tasks. This runs in a dedicated thread and receives events
|
||||
* via the eventQueue.
|
||||
*/
|
||||
def run() {
|
||||
private def run() {
|
||||
SparkEnv.set(env)
|
||||
|
||||
while (true) {
|
||||
val event = eventQueue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS)
|
||||
val time = System.currentTimeMillis() // TODO: use a pluggable clock for testability
|
||||
if (event != null) {
|
||||
logDebug("Got event of type " + event.getClass.getName)
|
||||
}
|
||||
|
||||
event match {
|
||||
case JobSubmitted(finalRDD, func, partitions, allowLocal, callSite, listener) =>
|
||||
val runId = nextRunId.getAndIncrement()
|
||||
val finalStage = newStage(finalRDD, None, runId)
|
||||
val job = new ActiveJob(runId, finalStage, func, partitions, callSite, listener)
|
||||
clearCacheLocs()
|
||||
logInfo("Got job " + job.runId + " (" + callSite + ") with " + partitions.length +
|
||||
" output partitions")
|
||||
logInfo("Final stage: " + finalStage + " (" + finalStage.origin + ")")
|
||||
logInfo("Parents of final stage: " + finalStage.parents)
|
||||
logInfo("Missing parents: " + getMissingParentStages(finalStage))
|
||||
if (allowLocal && finalStage.parents.size == 0 && partitions.length == 1) {
|
||||
// Compute very short actions like first() or take() with no parent stages locally.
|
||||
runLocally(job)
|
||||
} else {
|
||||
activeJobs += job
|
||||
resultStageToJob(finalStage) = job
|
||||
submitStage(finalStage)
|
||||
}
|
||||
|
||||
case ExecutorLost(execId) =>
|
||||
handleExecutorLost(execId)
|
||||
|
||||
case completion: CompletionEvent =>
|
||||
handleTaskCompletion(completion)
|
||||
|
||||
case TaskSetFailed(taskSet, reason) =>
|
||||
abortStage(idToStage(taskSet.stageId), reason)
|
||||
|
||||
case StopDAGScheduler =>
|
||||
// Cancel any active jobs
|
||||
for (job <- activeJobs) {
|
||||
val error = new SparkException("Job cancelled because SparkContext was shut down")
|
||||
job.listener.jobFailed(error)
|
||||
}
|
||||
if (event != null) {
|
||||
if (processEvent(event)) {
|
||||
return
|
||||
|
||||
case null =>
|
||||
// queue.poll() timed out, ignore it
|
||||
}
|
||||
}
|
||||
|
||||
val time = System.currentTimeMillis() // TODO: use a pluggable clock for testability
|
||||
// Periodically resubmit failed stages if some map output fetches have failed and we have
|
||||
// waited at least RESUBMIT_TIMEOUT. We wait for this short time because when a node fails,
|
||||
// tasks on many other nodes are bound to get a fetch failure, and they won't all get it at
|
||||
// the same time, so we want to make sure we've identified all the reduce tasks that depend
|
||||
// on the failed node.
|
||||
if (failed.size > 0 && time > lastFetchFailureTime + RESUBMIT_TIMEOUT) {
|
||||
logInfo("Resubmitting failed stages")
|
||||
clearCacheLocs()
|
||||
val failed2 = failed.toArray
|
||||
failed.clear()
|
||||
for (stage <- failed2.sortBy(_.priority)) {
|
||||
submitStage(stage)
|
||||
}
|
||||
resubmitFailedStages()
|
||||
} else {
|
||||
// TODO: We might want to run this less often, when we are sure that something has become
|
||||
// runnable that wasn't before.
|
||||
logDebug("Checking for newly runnable parent stages")
|
||||
logDebug("running: " + running)
|
||||
logDebug("waiting: " + waiting)
|
||||
logDebug("failed: " + failed)
|
||||
val waiting2 = waiting.toArray
|
||||
waiting.clear()
|
||||
for (stage <- waiting2.sortBy(_.priority)) {
|
||||
submitStage(stage)
|
||||
}
|
||||
submitWaitingStages()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -333,7 +386,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
* We run the operation in a separate thread just in case it takes a bunch of time, so that we
|
||||
* don't block the DAGScheduler event loop or other concurrent jobs.
|
||||
*/
|
||||
def runLocally(job: ActiveJob) {
|
||||
private def runLocally(job: ActiveJob) {
|
||||
logInfo("Computing the requested partition locally")
|
||||
new Thread("Local computation of job " + job.runId) {
|
||||
override def run() {
|
||||
|
@ -356,13 +409,14 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
|
|||
}.start()
|
||||
}
|
||||
|
||||
def submitStage(stage: Stage) {
|
||||
/** Submits stage, but first recursively submits any missing parents. */
|
||||
private def submitStage(stage: Stage) {
|
||||
logDebug("submitStage(" + stage + ")")
|
||||
if (!waiting(stage) && !running(stage) && !failed(stage)) {
|
||||
val missing = getMissingParentStages(stage).sortBy(_.id)
|
||||
logDebug("missing: " + missing)
|
||||
if (missing == Nil) {
|
||||
logInfo("Submitting " + stage + " (" + stage.origin + "), which has no missing parents")
|
||||
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
|
||||
submitMissingTasks(stage)
|
||||
running += stage
|
||||
} else {
|
||||
|
@ -374,7 +428,8 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
}
}

def submitMissingTasks(stage: Stage) {
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
val myPending = pendingTasks.getOrElseUpdate(stage, new HashSet)

@ -395,11 +450,14 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
}
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage)
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
myPending ++= tasks
logDebug("New pending tasks: " + myPending)
taskSched.submitTasks(
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.priority))
if (!stage.submissionTime.isDefined) {
stage.submissionTime = Some(System.currentTimeMillis())
}
} else {
logDebug("Stage " + stage + " is actually done; %b %d %d".format(
stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
@ -411,9 +469,20 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
* Responds to a task finishing. This is called inside the event loop so it assumes that it can
* modify the scheduler's internal state. Use taskEnded() to post a task end event from outside.
*/
def handleTaskCompletion(event: CompletionEvent) {
private def handleTaskCompletion(event: CompletionEvent) {
val task = event.task
val stage = idToStage(task.stageId)

def markStageAsFinished(stage: Stage) = {
val serviceTime = stage.submissionTime match {
case Some(t) => "%.03f".format((System.currentTimeMillis() - t) / 1000.0)
case _ => "Unknown"
}
logInfo("%s (%s) finished in %s s".format(stage, stage.origin, serviceTime))
val stageComp = StageCompleted(stageToInfos(stage))
sparkListeners.foreach{_.onStageCompleted(stageComp)}
running -= stage
}
event.reason match {
case Success =>
logInfo("Completed " + task)
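The markStageAsFinished helper above is where the new StageCompleted event, together with the measured service time, reaches registered listeners. As a rough, hedged sketch of consuming it (the SparkListener trait name and the stageInfo accessor are assumptions and are not shown in this part of the diff; only onStageCompleted appears above):

// Hypothetical listener sketch; `stageInfo` is an assumed field of StageCompleted.
class StageTimingListener extends SparkListener {
  override def onStageCompleted(stageCompleted: StageCompleted) {
    println("Finished stage: " + stageCompleted.stageInfo)
  }
}

A listener like this would be appended to the scheduler's sparkListeners collection before jobs run, so that every completed stage is reported through it.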
@ -429,15 +498,13 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
if (!job.finished(rt.outputId)) {
job.finished(rt.outputId) = true
job.numFinished += 1
job.listener.taskSucceeded(rt.outputId, event.result)
// If the whole job has finished, remove it
if (job.numFinished == job.numPartitions) {
activeJobs -= job
resultStageToJob -= stage
running -= stage
val stageComp = StageCompleted(stageToInfos(stage))
sparkListeners.foreach{_.onStageCompleted(stageComp)}
markStageAsFinished(stage)
}
job.listener.taskSucceeded(rt.outputId, event.result)
}
case None =>
logInfo("Ignoring result from " + rt + " because its job has finished")
@ -454,10 +521,8 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
stage.addOutputLoc(smt.partition, status)
}
if (running.contains(stage) && pendingTasks(stage).isEmpty) {
logInfo(stage + " (" + stage.origin + ") finished; looking for newly runnable stages")
running -= stage
val stageComp = StageCompleted(stageToInfos(stage))
sparkListeners.foreach{_.onStageCompleted(stageComp)}
markStageAsFinished(stage)
logInfo("looking for newly runnable stages")
logInfo("running: " + running)
logInfo("waiting: " + waiting)
logInfo("failed: " + failed)

@ -492,7 +557,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
waiting --= newlyRunnable
running ++= newlyRunnable
for (stage <- newlyRunnable.sortBy(_.id)) {
logInfo("Submitting " + stage + " (" + stage.origin + "), which is now runnable")
logInfo("Submitting " + stage + " (" + stage.rdd + "), which is now runnable")
submitMissingTasks(stage)
}
}
@ -541,12 +606,12 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
* Optionally the generation during which the failure was caught can be passed to avoid allowing
* stray fetch failures from possibly retriggering the detection of a node as lost.
*/
def handleExecutorLost(execId: String, maybeGeneration: Option[Long] = None) {
private def handleExecutorLost(execId: String, maybeGeneration: Option[Long] = None) {
val currentGeneration = maybeGeneration.getOrElse(mapOutputTracker.getGeneration)
if (!failedGeneration.contains(execId) || failedGeneration(execId) < currentGeneration) {
failedGeneration(execId) = currentGeneration
logInfo("Executor lost: %s (generation %d)".format(execId, currentGeneration))
env.blockManager.master.removeExecutor(execId)
blockManagerMaster.removeExecutor(execId)
// TODO: This will be really slow if we keep accumulating shuffle map stages
for ((shuffleId, stage) <- shuffleToMapStage) {
stage.removeOutputsOnExecutor(execId)

@ -567,7 +632,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
* Aborts all jobs depending on a particular Stage. This is called in response to a task set
* being cancelled by the TaskScheduler. Use taskSetFailed() to inject this event from outside.
*/
def abortStage(failedStage: Stage, reason: String) {
private def abortStage(failedStage: Stage, reason: String) {
val dependentStages = resultStageToJob.keys.filter(x => stageDependsOn(x, failedStage)).toSeq
for (resultStage <- dependentStages) {
val job = resultStageToJob(resultStage)

@ -583,7 +648,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
/**
* Return true if one of stage's ancestors is target.
*/
def stageDependsOn(stage: Stage, target: Stage): Boolean = {
private def stageDependsOn(stage: Stage, target: Stage): Boolean = {
if (stage == target) {
return true
}

@ -610,7 +675,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
visitedRdds.contains(target.rdd)
}

def getPreferredLocs(rdd: RDD[_], partition: Int): List[String] = {
private def getPreferredLocs(rdd: RDD[_], partition: Int): List[String] = {
// If the partition is cached, return the cache locations
val cached = getCacheLocs(rdd)(partition)
if (cached != Nil) {

@ -636,7 +701,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
return Nil
}

def cleanup(cleanupTime: Long) {
private def cleanup(cleanupTime: Long) {
var sizeBefore = idToStage.size
idToStage.clearOldValues(cleanupTime)
logInfo("idToStage " + sizeBefore + " --> " + idToStage.size)

@ -5,5 +5,5 @@ package spark.scheduler
*/
private[spark] sealed trait JobResult

private[spark] case class JobSucceeded(results: Seq[_]) extends JobResult
private[spark] case object JobSucceeded extends JobResult
private[spark] case class JobFailed(exception: Exception) extends JobResult

@ -3,10 +3,12 @@ package spark.scheduler
import scala.collection.mutable.ArrayBuffer

/**
* An object that waits for a DAGScheduler job to complete.
* An object that waits for a DAGScheduler job to complete. As tasks finish, it passes their
* results to the given handler function.
*/
private[spark] class JobWaiter(totalTasks: Int) extends JobListener {
private val taskResults = ArrayBuffer.fill[Any](totalTasks)(null)
private[spark] class JobWaiter[T](totalTasks: Int, resultHandler: (Int, T) => Unit)
extends JobListener {

private var finishedTasks = 0

private var jobFinished = false // Is the job as a whole finished (succeeded or failed)?

@ -17,11 +19,11 @@ private[spark] class JobWaiter(totalTasks: Int) extends JobListener {
if (jobFinished) {
throw new UnsupportedOperationException("taskSucceeded() called on a finished JobWaiter")
}
taskResults(index) = result
resultHandler(index, result.asInstanceOf[T])
finishedTasks += 1
if (finishedTasks == totalTasks) {
jobFinished = true
jobResult = JobSucceeded(taskResults)
jobResult = JobSucceeded
this.notifyAll()
}
}

@ -38,7 +40,7 @@ private[spark] class JobWaiter(totalTasks: Int) extends JobListener {
}
}

def getResult(): JobResult = synchronized {
def awaitResult(): JobResult = synchronized {
while (!jobFinished) {
this.wait()
}

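Taken together with the JobResult change above, callers no longer pull a Seq of results out of the waiter; they supply a handler up front and only learn success or failure from awaitResult(). A minimal, hedged sketch of driving the new API (numPartitions and the job-submission plumbing are assumed; JobWaiter, JobSucceeded and JobFailed are the shapes introduced in this diff):

// Collect per-partition results through the handler rather than from the JobResult.
val results = new Array[Int](numPartitions)
val waiter = new JobWaiter[Int](numPartitions, (index, value) => results(index) = value)
// ... submit the job with `waiter` as its JobListener ...
waiter.awaitResult() match {
  case JobSucceeded  => // `results` was filled in by the handler as each task finished
  case JobFailed(e)  => throw e
}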
@ -32,7 +32,7 @@ private[spark] object ShuffleMapTask {
return old
} else {
val out = new ByteArrayOutputStream
val ser = SparkEnv.get.closureSerializer.newInstance
val ser = SparkEnv.get.closureSerializer.newInstance()
val objOut = ser.serializeStream(new GZIPOutputStream(out))
objOut.writeObject(rdd)
objOut.writeObject(dep)

@ -48,7 +48,7 @@ private[spark] object ShuffleMapTask {
synchronized {
val loader = Thread.currentThread.getContextClassLoader
val in = new GZIPInputStream(new ByteArrayInputStream(bytes))
val ser = SparkEnv.get.closureSerializer.newInstance
val ser = SparkEnv.get.closureSerializer.newInstance()
val objIn = ser.deserializeStream(in)
val rdd = objIn.readObject().asInstanceOf[RDD[_]]
val dep = objIn.readObject().asInstanceOf[ShuffleDependency[_,_]]

@ -127,7 +127,6 @@ private[spark] class ShuffleMapTask(
val bucketId = dep.partitioner.getPartition(pair._1)
buckets(bucketId) += pair
}
val bucketIterators = buckets.map(_.iterator)

val compressedSizes = new Array[Byte](numOutputSplits)

@ -135,7 +134,7 @@ private[spark] class ShuffleMapTask(
for (i <- 0 until numOutputSplits) {
val blockId = "shuffle_" + dep.shuffleId + "_" + partition + "_" + i
// Get a Scala iterator from Java map
val iter: Iterator[(Any, Any)] = bucketIterators(i)
val iter: Iterator[(Any, Any)] = buckets(i).iterator
val size = blockManager.put(blockId, iter, StorageLevel.DISK_ONLY, false)
compressedSizes(i) = MapOutputTracker.compressSize(size)
}

@ -32,6 +32,9 @@ private[spark] class Stage(
val outputLocs = Array.fill[List[MapStatus]](numPartitions)(Nil)
var numAvailableOutputs = 0

/** When first task was submitted to scheduler. */
var submissionTime: Option[Long] = None

private var nextAttemptId = 0

def isAvailable: Boolean = {

@ -86,7 +86,7 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
}
}

def submitTasks(taskSet: TaskSet) {
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {

@ -1,5 +1,7 @@
package spark.scheduler.cluster

import spark.Utils

/**
* A backend interface for cluster scheduling systems that allows plugging in different ones under
* ClusterScheduler. We assume a Mesos-like model where the application gets resource offers as

@ -11,5 +13,15 @@ private[spark] trait SchedulerBackend {
def reviveOffers(): Unit
def defaultParallelism(): Int

// Memory used by each executor (in megabytes)
protected val executorMemory = {
// TODO: Might need to add some extra memory for the non-heap parts of the JVM
Option(System.getProperty("spark.executor.memory"))
.orElse(Option(System.getenv("SPARK_MEM")))
.map(Utils.memoryStringToMb)
.getOrElse(512)
}

// TODO: Probably want to add a killTask too
}

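This trait now centralizes the per-executor memory default that the later hunks delete from the standalone and Mesos backends. A hedged configuration sketch, where the 2g figure is purely illustrative and the property and environment variable names are the ones used above:

// Either of these selects 2 GB per executor; the system property wins when both are set,
// since it is consulted first in the Option chain above.
System.setProperty("spark.executor.memory", "2g")
// or, in the launching shell:  export SPARK_MEM=2g

With neither set, executorMemory falls back to 512 MB.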
@ -20,23 +20,14 @@ private[spark] class SparkDeploySchedulerBackend(
|
|||
|
||||
val maxCores = System.getProperty("spark.cores.max", Int.MaxValue.toString).toInt
|
||||
|
||||
// Memory used by each executor (in megabytes)
|
||||
val executorMemory = {
|
||||
if (System.getenv("SPARK_MEM") != null) {
|
||||
Utils.memoryStringToMb(System.getenv("SPARK_MEM"))
|
||||
// TODO: Might need to add some extra memory for the non-heap parts of the JVM
|
||||
} else {
|
||||
512
|
||||
}
|
||||
}
|
||||
|
||||
override def start() {
|
||||
super.start()
|
||||
|
||||
val masterUrl = "akka://spark@%s:%s/user/%s".format(
|
||||
System.getProperty("spark.master.host"), System.getProperty("spark.master.port"),
|
||||
// The endpoint for executors to talk to us
|
||||
val driverUrl = "akka://spark@%s:%s/user/%s".format(
|
||||
System.getProperty("spark.driver.host"), System.getProperty("spark.driver.port"),
|
||||
StandaloneSchedulerBackend.ACTOR_NAME)
|
||||
val args = Seq(masterUrl, "{{EXECUTOR_ID}}", "{{HOSTNAME}}", "{{CORES}}")
|
||||
val args = Seq(driverUrl, "{{EXECUTOR_ID}}", "{{HOSTNAME}}", "{{CORES}}")
|
||||
val command = Command("spark.executor.StandaloneExecutorBackend", args, sc.executorEnvs)
|
||||
val sparkHome = sc.getSparkHome().getOrElse(throw new IllegalArgumentException("must supply spark home for spark standalone"))
|
||||
val jobDesc = new JobDescription(jobName, maxCores, executorMemory, command, sparkHome)
|
||||
|
@ -54,23 +45,23 @@ private[spark] class SparkDeploySchedulerBackend(
|
|||
}
|
||||
}
|
||||
|
||||
def connected(jobId: String) {
|
||||
override def connected(jobId: String) {
|
||||
logInfo("Connected to Spark cluster with job ID " + jobId)
|
||||
}
|
||||
|
||||
def disconnected() {
|
||||
override def disconnected() {
|
||||
if (!stopping) {
|
||||
logError("Disconnected from Spark cluster!")
|
||||
scheduler.error("Disconnected from Spark cluster")
|
||||
}
|
||||
}
|
||||
|
||||
def executorAdded(id: String, workerId: String, host: String, cores: Int, memory: Int) {
|
||||
override def executorAdded(executorId: String, workerId: String, host: String, cores: Int, memory: Int) {
|
||||
logInfo("Granted executor ID %s on host %s with %d cores, %s RAM".format(
|
||||
id, host, cores, Utils.memoryMegabytesToString(memory)))
|
||||
executorId, host, cores, Utils.memoryMegabytesToString(memory)))
|
||||
}
|
||||
|
||||
def executorRemoved(executorId: String, message: String, exitStatus: Option[Int]) {
|
||||
override def executorRemoved(executorId: String, message: String, exitStatus: Option[Int]) {
|
||||
val reason: ExecutorLossReason = exitStatus match {
|
||||
case Some(code) => ExecutorExited(code)
|
||||
case None => SlaveLost(message)
|
||||
|
|
|
@ -6,7 +6,7 @@ import spark.util.SerializableBuffer
|
|||
|
||||
private[spark] sealed trait StandaloneClusterMessage extends Serializable
|
||||
|
||||
// Master to slaves
|
||||
// Driver to executors
|
||||
private[spark]
|
||||
case class LaunchTask(task: TaskDescription) extends StandaloneClusterMessage
|
||||
|
||||
|
@ -17,7 +17,7 @@ case class RegisteredExecutor(sparkProperties: Seq[(String, String)])
|
|||
private[spark]
|
||||
case class RegisterExecutorFailed(message: String) extends StandaloneClusterMessage
|
||||
|
||||
// Executors to master
|
||||
// Executors to driver
|
||||
private[spark]
|
||||
case class RegisterExecutor(executorId: String, host: String, cores: Int)
|
||||
extends StandaloneClusterMessage
|
||||
|
@ -34,6 +34,6 @@ object StatusUpdate {
|
|||
}
|
||||
}
|
||||
|
||||
// Internal messages in master
|
||||
// Internal messages in driver
|
||||
private[spark] case object ReviveOffers extends StandaloneClusterMessage
|
||||
private[spark] case object StopMaster extends StandaloneClusterMessage
|
||||
private[spark] case object StopDriver extends StandaloneClusterMessage
|
||||
|
|
|
@ -23,7 +23,7 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
|
|||
// Use an atomic variable to track total number of cores in the cluster for simplicity and speed
|
||||
var totalCoreCount = new AtomicInteger(0)
|
||||
|
||||
class MasterActor(sparkProperties: Seq[(String, String)]) extends Actor {
|
||||
class DriverActor(sparkProperties: Seq[(String, String)]) extends Actor {
|
||||
val executorActor = new HashMap[String, ActorRef]
|
||||
val executorAddress = new HashMap[String, Address]
|
||||
val executorHost = new HashMap[String, String]
|
||||
|
@ -64,7 +64,7 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
|
|||
case ReviveOffers =>
|
||||
makeOffers()
|
||||
|
||||
case StopMaster =>
|
||||
case StopDriver =>
|
||||
sender ! true
|
||||
context.stop(self)
|
||||
|
||||
|
@ -113,10 +113,10 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
|
|||
}
|
||||
}
|
||||
|
||||
var masterActor: ActorRef = null
|
||||
var driverActor: ActorRef = null
|
||||
val taskIdsOnSlave = new HashMap[String, HashSet[String]]
|
||||
|
||||
def start() {
|
||||
override def start() {
|
||||
val properties = new ArrayBuffer[(String, String)]
|
||||
val iterator = System.getProperties.entrySet.iterator
|
||||
while (iterator.hasNext) {
|
||||
|
@ -126,15 +126,15 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
|
|||
properties += ((key, value))
|
||||
}
|
||||
}
|
||||
masterActor = actorSystem.actorOf(
|
||||
Props(new MasterActor(properties)), name = StandaloneSchedulerBackend.ACTOR_NAME)
|
||||
driverActor = actorSystem.actorOf(
|
||||
Props(new DriverActor(properties)), name = StandaloneSchedulerBackend.ACTOR_NAME)
|
||||
}
|
||||
|
||||
def stop() {
|
||||
override def stop() {
|
||||
try {
|
||||
if (masterActor != null) {
|
||||
if (driverActor != null) {
|
||||
val timeout = 5.seconds
|
||||
val future = masterActor.ask(StopMaster)(timeout)
|
||||
val future = driverActor.ask(StopDriver)(timeout)
|
||||
Await.result(future, timeout)
|
||||
}
|
||||
} catch {
|
||||
|
@ -143,11 +143,11 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
|
|||
}
|
||||
}
|
||||
|
||||
def reviveOffers() {
|
||||
masterActor ! ReviveOffers
|
||||
override def reviveOffers() {
|
||||
driverActor ! ReviveOffers
|
||||
}
|
||||
|
||||
def defaultParallelism(): Int = math.max(totalCoreCount.get(), 2)
|
||||
override def defaultParallelism(): Int = math.max(totalCoreCount.get(), 2)
|
||||
}
|
||||
|
||||
private[spark] object StandaloneSchedulerBackend {
|
||||
|
|
|
@ -17,10 +17,7 @@ import java.nio.ByteBuffer
|
|||
/**
|
||||
* Schedules the tasks within a single TaskSet in the ClusterScheduler.
|
||||
*/
|
||||
private[spark] class TaskSetManager(
|
||||
sched: ClusterScheduler,
|
||||
val taskSet: TaskSet)
|
||||
extends Logging {
|
||||
private[spark] class TaskSetManager(sched: ClusterScheduler, val taskSet: TaskSet) extends Logging {
|
||||
|
||||
// Maximum time to wait to run a task in a preferred location (in ms)
|
||||
val LOCALITY_WAIT = System.getProperty("spark.locality.wait", "3000").toLong
|
||||
|
@ -100,7 +97,7 @@ private[spark] class TaskSetManager(
|
|||
}
|
||||
|
||||
// Add a task to all the pending-task lists that it should be on.
|
||||
def addPendingTask(index: Int) {
|
||||
private def addPendingTask(index: Int) {
|
||||
val locations = tasks(index).preferredLocations.toSet & sched.hostsAlive
|
||||
if (locations.size == 0) {
|
||||
pendingTasksWithNoPrefs += index
|
||||
|
@ -115,7 +112,7 @@ private[spark] class TaskSetManager(
|
|||
|
||||
// Return the pending tasks list for a given host, or an empty list if
|
||||
// there is no map entry for that host
|
||||
def getPendingTasksForHost(host: String): ArrayBuffer[Int] = {
|
||||
private def getPendingTasksForHost(host: String): ArrayBuffer[Int] = {
|
||||
pendingTasksForHost.getOrElse(host, ArrayBuffer())
|
||||
}
|
||||
|
||||
|
@ -123,7 +120,7 @@ private[spark] class TaskSetManager(
|
|||
// Return None if the list is empty.
|
||||
// This method also cleans up any tasks in the list that have already
|
||||
// been launched, since we want that to happen lazily.
|
||||
def findTaskFromList(list: ArrayBuffer[Int]): Option[Int] = {
|
||||
private def findTaskFromList(list: ArrayBuffer[Int]): Option[Int] = {
|
||||
while (!list.isEmpty) {
|
||||
val index = list.last
|
||||
list.trimEnd(1)
|
||||
|
@ -137,7 +134,7 @@ private[spark] class TaskSetManager(
|
|||
// Return a speculative task for a given host if any are available. The task should not have an
|
||||
// attempt running on this host, in case the host is slow. In addition, if localOnly is set, the
|
||||
// task must have a preference for this host (or no preferred locations at all).
|
||||
def findSpeculativeTask(host: String, localOnly: Boolean): Option[Int] = {
|
||||
private def findSpeculativeTask(host: String, localOnly: Boolean): Option[Int] = {
|
||||
val hostsAlive = sched.hostsAlive
|
||||
speculatableTasks.retain(index => !finished(index)) // Remove finished tasks from set
|
||||
val localTask = speculatableTasks.find {
|
||||
|
@ -162,7 +159,7 @@ private[spark] class TaskSetManager(
|
|||
|
||||
// Dequeue a pending task for a given node and return its index.
|
||||
// If localOnly is set to false, allow non-local tasks as well.
|
||||
def findTask(host: String, localOnly: Boolean): Option[Int] = {
|
||||
private def findTask(host: String, localOnly: Boolean): Option[Int] = {
|
||||
val localTask = findTaskFromList(getPendingTasksForHost(host))
|
||||
if (localTask != None) {
|
||||
return localTask
|
||||
|
@ -184,7 +181,7 @@ private[spark] class TaskSetManager(
|
|||
// Does a host count as a preferred location for a task? This is true if
|
||||
// either the task has preferred locations and this host is one, or it has
|
||||
// no preferred locations (in which we still count the launch as preferred).
|
||||
def isPreferredLocation(task: Task[_], host: String): Boolean = {
|
||||
private def isPreferredLocation(task: Task[_], host: String): Boolean = {
|
||||
val locs = task.preferredLocations
|
||||
return (locs.contains(host) || locs.isEmpty)
|
||||
}
|
||||
|
@ -335,7 +332,7 @@ private[spark] class TaskSetManager(
|
|||
if (numFailures(index) > MAX_TASK_FAILURES) {
|
||||
logError("Task %s:%d failed more than %d times; aborting job".format(
|
||||
taskSet.id, index, MAX_TASK_FAILURES))
|
||||
abort("Task %d failed more than %d times".format(index, MAX_TASK_FAILURES))
|
||||
abort("Task %s:%d failed more than %d times".format(taskSet.id, index, MAX_TASK_FAILURES))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -54,7 +54,7 @@ private[spark] class LocalScheduler(threads: Int, maxFailures: Int, sc: SparkCon
|
|||
}
|
||||
|
||||
def runTask(task: Task[_], idInJob: Int, attemptId: Int) {
|
||||
logInfo("Running task " + idInJob)
|
||||
logInfo("Running " + task)
|
||||
val info = new TaskInfo(attemptId, idInJob, System.currentTimeMillis(), "local", "local")
|
||||
// Set the Spark execution environment for the worker thread
|
||||
SparkEnv.set(env)
|
||||
|
@ -82,7 +82,7 @@ private[spark] class LocalScheduler(threads: Int, maxFailures: Int, sc: SparkCon
|
|||
val resultToReturn = ser.deserialize[Any](ser.serialize(result))
|
||||
val accumUpdates = ser.deserialize[collection.mutable.Map[Long, Any]](
|
||||
ser.serialize(Accumulators.values))
|
||||
logInfo("Finished task " + idInJob)
|
||||
logInfo("Finished " + task)
|
||||
info.markSuccessful()
|
||||
|
||||
// If the threadpool has not already been shutdown, notify DAGScheduler
|
||||
|
|
|
@ -35,16 +35,6 @@ private[spark] class CoarseMesosSchedulerBackend(
|
|||
|
||||
val MAX_SLAVE_FAILURES = 2 // Blacklist a slave after this many failures
|
||||
|
||||
// Memory used by each executor (in megabytes)
|
||||
val executorMemory = {
|
||||
if (System.getenv("SPARK_MEM") != null) {
|
||||
Utils.memoryStringToMb(System.getenv("SPARK_MEM"))
|
||||
// TODO: Might need to add some extra memory for the non-heap parts of the JVM
|
||||
} else {
|
||||
512
|
||||
}
|
||||
}
|
||||
|
||||
// Lock used to wait for scheduler to be registered
|
||||
var isRegistered = false
|
||||
val registeredLock = new Object()
|
||||
|
@ -104,11 +94,11 @@ private[spark] class CoarseMesosSchedulerBackend(
|
|||
|
||||
def createCommand(offer: Offer, numCores: Int): CommandInfo = {
|
||||
val runScript = new File(sparkHome, "run").getCanonicalPath
|
||||
val masterUrl = "akka://spark@%s:%s/user/%s".format(
|
||||
System.getProperty("spark.master.host"), System.getProperty("spark.master.port"),
|
||||
val driverUrl = "akka://spark@%s:%s/user/%s".format(
|
||||
System.getProperty("spark.driver.host"), System.getProperty("spark.driver.port"),
|
||||
StandaloneSchedulerBackend.ACTOR_NAME)
|
||||
val command = "\"%s\" spark.executor.StandaloneExecutorBackend %s %s %s %d".format(
|
||||
runScript, masterUrl, offer.getSlaveId.getValue, offer.getHostname, numCores)
|
||||
runScript, driverUrl, offer.getSlaveId.getValue, offer.getHostname, numCores)
|
||||
val environment = Environment.newBuilder()
|
||||
sc.executorEnvs.foreach { case (key, value) =>
|
||||
environment.addVariables(Environment.Variable.newBuilder()
|
||||
|
|
|
@ -29,16 +29,6 @@ private[spark] class MesosSchedulerBackend(
|
|||
with MScheduler
|
||||
with Logging {
|
||||
|
||||
// Memory used by each executor (in megabytes)
|
||||
val EXECUTOR_MEMORY = {
|
||||
if (System.getenv("SPARK_MEM") != null) {
|
||||
Utils.memoryStringToMb(System.getenv("SPARK_MEM"))
|
||||
// TODO: Might need to add some extra memory for the non-heap parts of the JVM
|
||||
} else {
|
||||
512
|
||||
}
|
||||
}
|
||||
|
||||
// Lock used to wait for scheduler to be registered
|
||||
var isRegistered = false
|
||||
val registeredLock = new Object()
|
||||
|
@ -51,7 +41,7 @@ private[spark] class MesosSchedulerBackend(
|
|||
val taskIdToSlaveId = new HashMap[Long, String]
|
||||
|
||||
// An ExecutorInfo for our tasks
|
||||
var executorInfo: ExecutorInfo = null
|
||||
var execArgs: Array[Byte] = null
|
||||
|
||||
override def start() {
|
||||
synchronized {
|
||||
|
@ -70,12 +60,11 @@ private[spark] class MesosSchedulerBackend(
|
|||
}
|
||||
}.start()
|
||||
|
||||
executorInfo = createExecutorInfo()
|
||||
waitForRegister()
|
||||
}
|
||||
}
|
||||
|
||||
def createExecutorInfo(): ExecutorInfo = {
|
||||
def createExecutorInfo(execId: String): ExecutorInfo = {
|
||||
val sparkHome = sc.getSparkHome().getOrElse(throw new SparkException(
|
||||
"Spark home is not set; set it through the spark.home system " +
|
||||
"property, the SPARK_HOME environment variable or the SparkContext constructor"))
|
||||
|
@ -90,14 +79,14 @@ private[spark] class MesosSchedulerBackend(
|
|||
val memory = Resource.newBuilder()
|
||||
.setName("mem")
|
||||
.setType(Value.Type.SCALAR)
|
||||
.setScalar(Value.Scalar.newBuilder().setValue(EXECUTOR_MEMORY).build())
|
||||
.setScalar(Value.Scalar.newBuilder().setValue(executorMemory).build())
|
||||
.build()
|
||||
val command = CommandInfo.newBuilder()
|
||||
.setValue(execScript)
|
||||
.setEnvironment(environment)
|
||||
.build()
|
||||
ExecutorInfo.newBuilder()
|
||||
.setExecutorId(ExecutorID.newBuilder().setValue("default").build())
|
||||
.setExecutorId(ExecutorID.newBuilder().setValue(execId).build())
|
||||
.setCommand(command)
|
||||
.setData(ByteString.copyFrom(createExecArg()))
|
||||
.addResources(memory)
|
||||
|
@ -109,17 +98,20 @@ private[spark] class MesosSchedulerBackend(
|
|||
* containing all the spark.* system properties in the form of (String, String) pairs.
|
||||
*/
|
||||
private def createExecArg(): Array[Byte] = {
|
||||
val props = new HashMap[String, String]
|
||||
val iterator = System.getProperties.entrySet.iterator
|
||||
while (iterator.hasNext) {
|
||||
val entry = iterator.next
|
||||
val (key, value) = (entry.getKey.toString, entry.getValue.toString)
|
||||
if (key.startsWith("spark.")) {
|
||||
props(key) = value
|
||||
if (execArgs == null) {
|
||||
val props = new HashMap[String, String]
|
||||
val iterator = System.getProperties.entrySet.iterator
|
||||
while (iterator.hasNext) {
|
||||
val entry = iterator.next
|
||||
val (key, value) = (entry.getKey.toString, entry.getValue.toString)
|
||||
if (key.startsWith("spark.")) {
|
||||
props(key) = value
|
||||
}
|
||||
}
|
||||
// Serialize the map as an array of (String, String) pairs
|
||||
execArgs = Utils.serialize(props.toArray)
|
||||
}
|
||||
// Serialize the map as an array of (String, String) pairs
|
||||
return Utils.serialize(props.toArray)
|
||||
return execArgs
|
||||
}
|
||||
|
||||
override def offerRescinded(d: SchedulerDriver, o: OfferID) {}
|
||||
|
@ -159,7 +151,7 @@ private[spark] class MesosSchedulerBackend(
|
|||
def enoughMemory(o: Offer) = {
|
||||
val mem = getResource(o.getResourcesList, "mem")
|
||||
val slaveId = o.getSlaveId.getValue
|
||||
mem >= EXECUTOR_MEMORY || slaveIdsWithExecutors.contains(slaveId)
|
||||
mem >= executorMemory || slaveIdsWithExecutors.contains(slaveId)
|
||||
}
|
||||
|
||||
for ((offer, index) <- offers.zipWithIndex if enoughMemory(offer)) {
|
||||
|
@ -216,7 +208,7 @@ private[spark] class MesosSchedulerBackend(
|
|||
return MesosTaskInfo.newBuilder()
|
||||
.setTaskId(taskId)
|
||||
.setSlaveId(SlaveID.newBuilder().setValue(slaveId).build())
|
||||
.setExecutor(executorInfo)
|
||||
.setExecutor(createExecutorInfo(slaveId))
|
||||
.setName(task.name)
|
||||
.addResources(cpuResource)
|
||||
.setData(ByteString.copyFrom(task.serializedTask))
|
||||
|
|
|
@ -243,7 +243,7 @@ class BlockManager(
|
|||
val startTimeMs = System.currentTimeMillis
|
||||
var managers = master.getLocations(blockId)
|
||||
val locations = managers.map(_.ip)
|
||||
logDebug("Get block locations in " + Utils.getUsedTimeMs(startTimeMs))
|
||||
logDebug("Got block locations in " + Utils.getUsedTimeMs(startTimeMs))
|
||||
return locations
|
||||
}
|
||||
|
||||
|
@ -253,7 +253,7 @@ class BlockManager(
|
|||
def getLocations(blockIds: Array[String]): Array[Seq[String]] = {
|
||||
val startTimeMs = System.currentTimeMillis
|
||||
val locations = master.getLocations(blockIds).map(_.map(_.ip).toSeq).toArray
|
||||
logDebug("Get multiple block location in " + Utils.getUsedTimeMs(startTimeMs))
|
||||
logDebug("Got multiple block location in " + Utils.getUsedTimeMs(startTimeMs))
|
||||
return locations
|
||||
}
|
||||
|
||||
|
@ -645,7 +645,7 @@ class BlockManager(
|
|||
var size = 0L
|
||||
|
||||
myInfo.synchronized {
|
||||
logDebug("Put for block " + blockId + " took " + Utils.getUsedTimeMs(startTimeMs)
|
||||
logTrace("Put for block " + blockId + " took " + Utils.getUsedTimeMs(startTimeMs)
|
||||
+ " to get into synchronized block")
|
||||
|
||||
if (level.useMemory) {
|
||||
|
@ -677,8 +677,10 @@ class BlockManager(
|
|||
}
|
||||
logDebug("Put block " + blockId + " locally took " + Utils.getUsedTimeMs(startTimeMs))
|
||||
|
||||
|
||||
// Replicate block if required
|
||||
if (level.replication > 1) {
|
||||
val remoteStartTime = System.currentTimeMillis
|
||||
// Serialize the block if not already done
|
||||
if (bytesAfterPut == null) {
|
||||
if (valuesAfterPut == null) {
|
||||
|
@ -688,12 +690,10 @@ class BlockManager(
|
|||
bytesAfterPut = dataSerialize(blockId, valuesAfterPut)
|
||||
}
|
||||
replicate(blockId, bytesAfterPut, level)
|
||||
logDebug("Put block " + blockId + " remotely took " + Utils.getUsedTimeMs(remoteStartTime))
|
||||
}
|
||||
|
||||
BlockManager.dispose(bytesAfterPut)
|
||||
|
||||
logDebug("Put block " + blockId + " took " + Utils.getUsedTimeMs(startTimeMs))
|
||||
|
||||
return size
|
||||
}
|
||||
|
||||
|
@ -950,6 +950,7 @@ class BlockManager(
|
|||
blockInfo.clear()
|
||||
memoryStore.clear()
|
||||
diskStore.clear()
|
||||
metadataCleaner.cancel()
|
||||
logInfo("BlockManager stopped")
|
||||
}
|
||||
}
|
||||
|
@ -978,7 +979,7 @@ object BlockManager extends Logging {
|
|||
*/
|
||||
def dispose(buffer: ByteBuffer) {
|
||||
if (buffer != null && buffer.isInstanceOf[MappedByteBuffer]) {
|
||||
logDebug("Unmapping " + buffer)
|
||||
logTrace("Unmapping " + buffer)
|
||||
if (buffer.asInstanceOf[DirectBuffer].cleaner() != null) {
|
||||
buffer.asInstanceOf[DirectBuffer].cleaner().clean()
|
||||
}
|
||||
|
|
|
@ -15,52 +15,49 @@ import akka.util.duration._
|
|||
|
||||
import spark.{Logging, SparkException, Utils}
|
||||
|
||||
|
||||
private[spark] class BlockManagerMaster(
|
||||
val actorSystem: ActorSystem,
|
||||
isMaster: Boolean,
|
||||
isDriver: Boolean,
|
||||
isLocal: Boolean,
|
||||
masterIp: String,
|
||||
masterPort: Int)
|
||||
driverIp: String,
|
||||
driverPort: Int)
|
||||
extends Logging {
|
||||
|
||||
val AKKA_RETRY_ATTEMPTS: Int = System.getProperty("spark.akka.num.retries", "3").toInt
|
||||
val AKKA_RETRY_INTERVAL_MS: Int = System.getProperty("spark.akka.retry.wait", "3000").toInt
|
||||
|
||||
val MASTER_AKKA_ACTOR_NAME = "BlockMasterManager"
|
||||
val SLAVE_AKKA_ACTOR_NAME = "BlockSlaveManager"
|
||||
val DEFAULT_MANAGER_IP: String = Utils.localHostName()
|
||||
val DRIVER_AKKA_ACTOR_NAME = "BlockMasterManager"
|
||||
|
||||
val timeout = 10.seconds
|
||||
var masterActor: ActorRef = {
|
||||
if (isMaster) {
|
||||
val masterActor = actorSystem.actorOf(Props(new BlockManagerMasterActor(isLocal)),
|
||||
name = MASTER_AKKA_ACTOR_NAME)
|
||||
var driverActor: ActorRef = {
|
||||
if (isDriver) {
|
||||
val driverActor = actorSystem.actorOf(Props(new BlockManagerMasterActor(isLocal)),
|
||||
name = DRIVER_AKKA_ACTOR_NAME)
|
||||
logInfo("Registered BlockManagerMaster Actor")
|
||||
masterActor
|
||||
driverActor
|
||||
} else {
|
||||
val url = "akka://spark@%s:%s/user/%s".format(masterIp, masterPort, MASTER_AKKA_ACTOR_NAME)
|
||||
val url = "akka://spark@%s:%s/user/%s".format(driverIp, driverPort, DRIVER_AKKA_ACTOR_NAME)
|
||||
logInfo("Connecting to BlockManagerMaster: " + url)
|
||||
actorSystem.actorFor(url)
|
||||
}
|
||||
}
|
||||
|
||||
/** Remove a dead executor from the master actor. This is only called on the master side. */
|
||||
/** Remove a dead executor from the driver actor. This is only called on the driver side. */
|
||||
def removeExecutor(execId: String) {
|
||||
tell(RemoveExecutor(execId))
|
||||
logInfo("Removed " + execId + " successfully in removeExecutor")
|
||||
}
|
||||
|
||||
/**
|
||||
* Send the master actor a heart beat from the slave. Returns true if everything works out,
|
||||
* false if the master does not know about the given block manager, which means the block
|
||||
* Send the driver actor a heart beat from the slave. Returns true if everything works out,
|
||||
* false if the driver does not know about the given block manager, which means the block
|
||||
* manager should re-register.
|
||||
*/
|
||||
def sendHeartBeat(blockManagerId: BlockManagerId): Boolean = {
|
||||
askMasterWithRetry[Boolean](HeartBeat(blockManagerId))
|
||||
askDriverWithReply[Boolean](HeartBeat(blockManagerId))
|
||||
}
|
||||
|
||||
/** Register the BlockManager's id with the master. */
|
||||
/** Register the BlockManager's id with the driver. */
|
||||
def registerBlockManager(
|
||||
blockManagerId: BlockManagerId, maxMemSize: Long, slaveActor: ActorRef) {
|
||||
logInfo("Trying to register BlockManager")
|
||||
|
@ -74,25 +71,25 @@ private[spark] class BlockManagerMaster(
|
|||
storageLevel: StorageLevel,
|
||||
memSize: Long,
|
||||
diskSize: Long): Boolean = {
|
||||
val res = askMasterWithRetry[Boolean](
|
||||
val res = askDriverWithReply[Boolean](
|
||||
UpdateBlockInfo(blockManagerId, blockId, storageLevel, memSize, diskSize))
|
||||
logInfo("Updated info of block " + blockId)
|
||||
res
|
||||
}
|
||||
|
||||
/** Get locations of the blockId from the master */
|
||||
/** Get locations of the blockId from the driver */
|
||||
def getLocations(blockId: String): Seq[BlockManagerId] = {
|
||||
askMasterWithRetry[Seq[BlockManagerId]](GetLocations(blockId))
|
||||
askDriverWithReply[Seq[BlockManagerId]](GetLocations(blockId))
|
||||
}
|
||||
|
||||
/** Get locations of multiple blockIds from the master */
|
||||
/** Get locations of multiple blockIds from the driver */
|
||||
def getLocations(blockIds: Array[String]): Seq[Seq[BlockManagerId]] = {
|
||||
askMasterWithRetry[Seq[Seq[BlockManagerId]]](GetLocationsMultipleBlockIds(blockIds))
|
||||
askDriverWithReply[Seq[Seq[BlockManagerId]]](GetLocationsMultipleBlockIds(blockIds))
|
||||
}
|
||||
|
||||
/** Get ids of other nodes in the cluster from the master */
|
||||
/** Get ids of other nodes in the cluster from the driver */
|
||||
def getPeers(blockManagerId: BlockManagerId, numPeers: Int): Seq[BlockManagerId] = {
|
||||
val result = askMasterWithRetry[Seq[BlockManagerId]](GetPeers(blockManagerId, numPeers))
|
||||
val result = askDriverWithReply[Seq[BlockManagerId]](GetPeers(blockManagerId, numPeers))
|
||||
if (result.length != numPeers) {
|
||||
throw new SparkException(
|
||||
"Error getting peers, only got " + result.size + " instead of " + numPeers)
|
||||
|
@ -102,10 +99,10 @@ private[spark] class BlockManagerMaster(
|
|||
|
||||
/**
|
||||
* Remove a block from the slaves that have it. This can only be used to remove
|
||||
* blocks that the master knows about.
|
||||
* blocks that the driver knows about.
|
||||
*/
|
||||
def removeBlock(blockId: String) {
|
||||
askMasterWithRetry(RemoveBlock(blockId))
|
||||
askDriverWithReply(RemoveBlock(blockId))
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -115,37 +112,37 @@ private[spark] class BlockManagerMaster(
|
|||
* amount of remaining memory.
|
||||
*/
|
||||
def getMemoryStatus: Map[BlockManagerId, (Long, Long)] = {
|
||||
askMasterWithRetry[Map[BlockManagerId, (Long, Long)]](GetMemoryStatus)
|
||||
askDriverWithReply[Map[BlockManagerId, (Long, Long)]](GetMemoryStatus)
|
||||
}
|
||||
|
||||
def getStorageStatus: Array[StorageStatus] = {
|
||||
askMasterWithRetry[ArrayBuffer[StorageStatus]](GetStorageStatus).toArray
|
||||
askDriverWithReply[ArrayBuffer[StorageStatus]](GetStorageStatus).toArray
|
||||
}
|
||||
|
||||
/** Stop the master actor, called only on the Spark master node */
|
||||
/** Stop the driver actor, called only on the Spark driver node */
|
||||
def stop() {
|
||||
if (masterActor != null) {
|
||||
if (driverActor != null) {
|
||||
tell(StopBlockManagerMaster)
|
||||
masterActor = null
|
||||
driverActor = null
|
||||
logInfo("BlockManagerMaster stopped")
|
||||
}
|
||||
}
|
||||
|
||||
/** Send a one-way message to the master actor, to which we expect it to reply with true. */
|
||||
private def tell(message: Any) {
|
||||
if (!askMasterWithRetry[Boolean](message)) {
|
||||
if (!askDriverWithReply[Boolean](message)) {
|
||||
throw new SparkException("BlockManagerMasterActor returned false, expected true.")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a message to the master actor and get its result within a default timeout, or
|
||||
* Send a message to the driver actor and get its result within a default timeout, or
|
||||
* throw a SparkException if this fails.
|
||||
*/
|
||||
private def askMasterWithRetry[T](message: Any): T = {
|
||||
private def askDriverWithReply[T](message: Any): T = {
|
||||
// TODO: Consider removing multiple attempts
|
||||
if (masterActor == null) {
|
||||
throw new SparkException("Error sending message to BlockManager as masterActor is null " +
|
||||
if (driverActor == null) {
|
||||
throw new SparkException("Error sending message to BlockManager as driverActor is null " +
|
||||
"[message = " + message + "]")
|
||||
}
|
||||
var attempts = 0
|
||||
|
@ -153,7 +150,7 @@ private[spark] class BlockManagerMaster(
|
|||
while (attempts < AKKA_RETRY_ATTEMPTS) {
|
||||
attempts += 1
|
||||
try {
|
||||
val future = masterActor.ask(message)(timeout)
|
||||
val future = driverActor.ask(message)(timeout)
|
||||
val result = Await.result(future, timeout)
|
||||
if (result == null) {
|
||||
throw new Exception("BlockManagerMaster returned null")
|
||||
|
|
|
@ -115,7 +115,7 @@ class BlockManagerMasterActor(val isLocal: Boolean) extends Actor with Logging {
|
|||
}
|
||||
|
||||
def expireDeadHosts() {
|
||||
logDebug("Checking for hosts with no recent heart beats in BlockManagerMaster.")
|
||||
logTrace("Checking for hosts with no recent heart beats in BlockManagerMaster.")
|
||||
val now = System.currentTimeMillis()
|
||||
val minSeenTime = now - slaveTimeout
|
||||
val toRemove = new HashSet[BlockManagerId]
|
||||
|
|
|
@ -45,7 +45,7 @@ class BlockManagerUI(val actorSystem: ActorSystem, blockManagerMaster: ActorRef,
|
|||
path("") {
|
||||
completeWith {
|
||||
// Request the current storage status from the Master
|
||||
val storageStatusList = sc.getSlavesStorageStatus
|
||||
val storageStatusList = sc.getExecutorStorageStatus
|
||||
// Calculate macro-level statistics
|
||||
val maxMem = storageStatusList.map(_.maxMem).reduce(_+_)
|
||||
val remainingMem = storageStatusList.map(_.memRemaining).reduce(_+_)
|
||||
|
@ -60,7 +60,7 @@ class BlockManagerUI(val actorSystem: ActorSystem, blockManagerMaster: ActorRef,
|
|||
parameter("id") { id =>
|
||||
completeWith {
|
||||
val prefix = "rdd_" + id.toString
|
||||
val storageStatusList = sc.getSlavesStorageStatus
|
||||
val storageStatusList = sc.getExecutorStorageStatus
|
||||
val filteredStorageStatusList = StorageUtils.
|
||||
filterStorageStatusByPrefix(storageStatusList, prefix)
|
||||
val rddInfo = StorageUtils.rddInfoFromStorageStatus(filteredStorageStatusList, sc).head
|
||||
|
|
|
@ -22,12 +22,11 @@ case class StorageStatus(blockManagerId: BlockManagerId, maxMem: Long,
|
|||
}
|
||||
|
||||
case class RDDInfo(id: Int, name: String, storageLevel: StorageLevel,
|
||||
numPartitions: Int, memSize: Long, diskSize: Long) {
|
||||
numCachedPartitions: Int, numPartitions: Int, memSize: Long, diskSize: Long) {
|
||||
override def toString = {
|
||||
import Utils.memoryBytesToString
|
||||
import java.lang.{Integer => JInt}
|
||||
String.format("RDD \"%s\" (%d) Storage: %s; Partitions: %d; MemorySize: %s; DiskSize: %s", name, id.asInstanceOf[JInt],
|
||||
storageLevel.toString, numPartitions.asInstanceOf[JInt], memoryBytesToString(memSize), memoryBytesToString(diskSize))
|
||||
"RDD \"%s\" (%d) Storage: %s; CachedPartitions: %d; TotalPartitions: %d; MemorySize: %s; DiskSize: %s".format(name, id,
|
||||
storageLevel.toString, numCachedPartitions, numPartitions, memoryBytesToString(memSize), memoryBytesToString(diskSize))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -44,8 +43,6 @@ object StorageUtils {
|
|||
/* Given a list of BlockStatus objets, returns information for each RDD */
|
||||
def rddInfoFromBlockStatusList(infos: Map[String, BlockStatus],
|
||||
sc: SparkContext) : Array[RDDInfo] = {
|
||||
// Find all RDD Blocks (ignore broadcast variables)
|
||||
val rddBlocks = infos.filterKeys(_.startsWith("rdd"))
|
||||
|
||||
// Group by rddId, ignore the partition name
|
||||
val groupedRddBlocks = infos.groupBy { case(k, v) =>
|
||||
|
@ -65,9 +62,8 @@ object StorageUtils {
|
|||
val rdd = sc.persistentRdds(rddId)
|
||||
val rddName = Option(rdd.name).getOrElse(rddKey)
|
||||
val rddStorageLevel = rdd.getStorageLevel
|
||||
//TODO get total number of partitions in rdd
|
||||
|
||||
RDDInfo(rddId, rddName, rddStorageLevel, rddBlocks.length, memSize, diskSize)
|
||||
RDDInfo(rddId, rddName, rddStorageLevel, rddBlocks.length, rdd.splits.size, memSize, diskSize)
|
||||
}.toArray
|
||||
}
|
||||
|
||||
|
|
|
@ -75,9 +75,9 @@ private[spark] object ThreadingTest {
|
|||
System.setProperty("spark.kryoserializer.buffer.mb", "1")
|
||||
val actorSystem = ActorSystem("test")
|
||||
val serializer = new KryoSerializer
|
||||
val masterIp: String = System.getProperty("spark.master.host", "localhost")
|
||||
val masterPort: Int = System.getProperty("spark.master.port", "7077").toInt
|
||||
val blockManagerMaster = new BlockManagerMaster(actorSystem, true, true, masterIp, masterPort)
|
||||
val driverIp: String = System.getProperty("spark.driver.host", "localhost")
|
||||
val driverPort: Int = System.getProperty("spark.driver.port", "7077").toInt
|
||||
val blockManagerMaster = new BlockManagerMaster(actorSystem, true, true, driverIp, driverPort)
|
||||
val blockManager = new BlockManager(
|
||||
"<driver>", actorSystem, blockManagerMaster, serializer, 1024 * 1024)
|
||||
val producers = (1 to numProducers).map(i => new ProducerThread(blockManager, i))
|
||||
|
|
|
@ -18,9 +18,13 @@ import java.util.concurrent.TimeoutException
* Various utility classes for working with Akka.
*/
private[spark] object AkkaUtils {

/**
* Creates an ActorSystem ready for remoting, with various Spark features. Returns both the
* ActorSystem itself and its port (which is hard to get from Akka).
*
* Note: the `name` parameter is important, as even if a client sends a message to the right
* host + port, if the system name is incorrect, Akka will drop the message.
*/
def createActorSystem(name: String, host: String, port: Int): (ActorSystem, Int) = {
val akkaThreads = System.getProperty("spark.akka.threads", "4").toInt

@ -30,6 +34,7 @@ private[spark] object AkkaUtils {
val akkaConf = ConfigFactory.parseString("""
akka.daemonic = on
akka.event-handlers = ["akka.event.slf4j.Slf4jEventHandler"]
akka.stdout-loglevel = "ERROR"
akka.actor.provider = "akka.remote.RemoteActorRefProvider"
akka.remote.transport = "akka.remote.netty.NettyRemoteTransport"
akka.remote.log-remote-lifecycle-events = on

@ -41,7 +46,7 @@ private[spark] object AkkaUtils {
akka.actor.default-dispatcher.throughput = %d
""".format(host, port, akkaTimeout, akkaFrameSize, akkaThreads, akkaBatchSize))

val actorSystem = ActorSystem("spark", akkaConf, getClass.getClassLoader)
val actorSystem = ActorSystem(name, akkaConf, getClass.getClassLoader)

// Figure out the port number we bound to, in case port was passed as 0. This is a bit of a
// hack because Akka doesn't let you figure out the port through the public API yet.

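The new name parameter matters because remote actor URLs embed the system name, as in akka://spark@host:port/user/..., so callers must pass the same name those URLs are built with. A small sketch, mirroring how the updated MapOutputTrackerSuite below wires it up:

// Bind an actor system named "spark" on an ephemeral port, then publish the chosen port
// so that URLs of the form "akka://spark@localhost:<port>/user/..." resolve to it.
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", "localhost", 0)
System.setProperty("spark.driver.port", boundPort.toString)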
@ -9,13 +9,12 @@ import spark.Logging
* Runs a timer task to periodically clean up metadata (e.g. old files or hashtable entries)
*/
class MetadataCleaner(name: String, cleanupFunc: (Long) => Unit) extends Logging {
private val delaySeconds = MetadataCleaner.getDelaySeconds
private val periodSeconds = math.max(10, delaySeconds / 10)
private val timer = new Timer(name + " cleanup timer", true)

val delaySeconds = MetadataCleaner.getDelaySeconds
val periodSeconds = math.max(10, delaySeconds / 10)
val timer = new Timer(name + " cleanup timer", true)

val task = new TimerTask {
def run() {
private val task = new TimerTask {
override def run() {
try {
cleanupFunc(System.currentTimeMillis() - (delaySeconds * 1000))
logInfo("Ran metadata cleaner for " + name)

@ -27,8 +26,8 @@ class MetadataCleaner(name: String, cleanupFunc: (Long) => Unit) extends Logging

if (delaySeconds > 0) {
logDebug(
"Starting metadata cleaner for " + name + " with delay of " + delaySeconds + " seconds and "
+ "period of " + periodSeconds + " secs")
"Starting metadata cleaner for " + name + " with delay of " + delaySeconds + " seconds " +
"and period of " + periodSeconds + " secs")
timer.schedule(task, periodSeconds * 1000, periodSeconds * 1000)
}

@ -39,7 +38,7 @@ class MetadataCleaner(name: String, cleanupFunc: (Long) => Unit) extends Logging

object MetadataCleaner {
def getDelaySeconds = (System.getProperty("spark.cleaner.delay", "-100").toDouble * 60).toInt
def setDelaySeconds(delay: Long) { System.setProperty("spark.cleaner.delay", delay.toString) }
def getDelaySeconds = System.getProperty("spark.cleaner.delay", "-1").toInt
def setDelaySeconds(delay: Int) { System.setProperty("spark.cleaner.delay", delay.toString) }
}

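With this change, spark.cleaner.delay is read as plain seconds (default -1, i.e. cleanup disabled) rather than as minutes multiplied by 60. A hedged usage sketch; the cleanup function is illustrative, modeled on the idToStage.clearOldValues call in the DAGScheduler hunk above:

// Enable periodic metadata cleanup with a 30-minute TTL, expressed in seconds under the new scheme.
MetadataCleaner.setDelaySeconds(1800)
val cleaner = new MetadataCleaner("idToStage", idToStage.clearOldValues)
// With the default of -1, the delaySeconds > 0 check above fails and the timer is never scheduled.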
@ -11,7 +11,11 @@
<strong>Storage Level:</strong>
@(rddInfo.storageLevel.description)
<li>
<strong>Partitions:</strong>
<strong>Cached Partitions:</strong>
@(rddInfo.numCachedPartitions)
</li>
<li>
<strong>Total Partitions:</strong>
@(rddInfo.numPartitions)
</li>
<li>

@ -6,7 +6,8 @@
<tr>
<th>RDD Name</th>
<th>Storage Level</th>
<th>Partitions</th>
<th>Cached Partitions</th>
<th>Fraction Partitions Cached</th>
<th>Size in Memory</th>
<th>Size on Disk</th>
</tr>

@ -21,7 +22,8 @@
</td>
<td>@(rdd.storageLevel.description)
</td>
<td>@rdd.numPartitions</td>
<td>@rdd.numCachedPartitions</td>
<td>@(rdd.numCachedPartitions / rdd.numPartitions.toDouble)</td>
<td>@{Utils.memoryBytesToString(rdd.memSize)}</td>
<td>@{Utils.memoryBytesToString(rdd.diskSize)}</td>
</tr>

@ -17,6 +17,12 @@ class AccumulatorSuite extends FunSuite with ShouldMatchers with LocalSparkConte
|
|||
val d = sc.parallelize(1 to 20)
|
||||
d.foreach{x => acc += x}
|
||||
acc.value should be (210)
|
||||
|
||||
|
||||
val longAcc = sc.accumulator(0l)
|
||||
val maxInt = Integer.MAX_VALUE.toLong
|
||||
d.foreach{x => longAcc += maxInt + x}
|
||||
longAcc.value should be (210l + maxInt * 20)
|
||||
}
|
||||
|
||||
test ("value not assignable from tasks") {
|
||||
|
|
|
@ -99,7 +99,7 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
// the parent RDD has been checkpointed and parent splits have been changed to HadoopSplits.
|
||||
// Note that this test is very specific to the current implementation of CartesianRDD.
|
||||
val ones = sc.makeRDD(1 to 100, 10).map(x => x)
|
||||
ones.checkpoint // checkpoint that MappedRDD
|
||||
ones.checkpoint() // checkpoint that MappedRDD
|
||||
val cartesian = new CartesianRDD(sc, ones, ones)
|
||||
val splitBeforeCheckpoint =
|
||||
serializeDeserialize(cartesian.splits.head.asInstanceOf[CartesianSplit])
|
||||
|
@ -125,7 +125,7 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
// the parent RDD has been checkpointed and parent splits have been changed to HadoopSplits.
|
||||
// Note that this test is very specific to the current implementation of CoalescedRDDSplits
|
||||
val ones = sc.makeRDD(1 to 100, 10).map(x => x)
|
||||
ones.checkpoint // checkpoint that MappedRDD
|
||||
ones.checkpoint() // checkpoint that MappedRDD
|
||||
val coalesced = new CoalescedRDD(ones, 2)
|
||||
val splitBeforeCheckpoint =
|
||||
serializeDeserialize(coalesced.splits.head.asInstanceOf[CoalescedRDDSplit])
|
||||
|
@ -160,7 +160,6 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
// so only the RDD will reduce in serialized size, not the splits.
|
||||
testParentCheckpointing(
|
||||
rdd => new ZippedRDD(sc, rdd, rdd.map(x => x)), true, false)
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -176,7 +175,7 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
testRDDSplitSize: Boolean = false
|
||||
) {
|
||||
// Generate the final RDD using given RDD operation
|
||||
val baseRDD = generateLongLineageRDD
|
||||
val baseRDD = generateLongLineageRDD()
|
||||
val operatedRDD = op(baseRDD)
|
||||
val parentRDD = operatedRDD.dependencies.headOption.orNull
|
||||
val rddType = operatedRDD.getClass.getSimpleName
|
||||
|
@ -245,12 +244,16 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
testRDDSplitSize: Boolean
|
||||
) {
|
||||
// Generate the final RDD using given RDD operation
|
||||
val baseRDD = generateLongLineageRDD
|
||||
val baseRDD = generateLongLineageRDD()
|
||||
val operatedRDD = op(baseRDD)
|
||||
val parentRDD = operatedRDD.dependencies.head.rdd
|
||||
val rddType = operatedRDD.getClass.getSimpleName
|
||||
val parentRDDType = parentRDD.getClass.getSimpleName
|
||||
|
||||
// Get the splits and dependencies of the parent in case they're lazily computed
|
||||
parentRDD.dependencies
|
||||
parentRDD.splits
|
||||
|
||||
// Find serialized sizes before and after the checkpoint
|
||||
val (rddSizeBeforeCheckpoint, splitSizeBeforeCheckpoint) = getSerializedSizes(operatedRDD)
|
||||
parentRDD.checkpoint() // checkpoint the parent RDD, not the generated one
|
||||
|
@ -267,7 +270,7 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
if (testRDDSize) {
|
||||
assert(
|
||||
rddSizeAfterCheckpoint < rddSizeBeforeCheckpoint,
|
||||
"Size of " + rddType + " did not reduce after parent checkpointing parent " + parentRDDType +
|
||||
"Size of " + rddType + " did not reduce after checkpointing parent " + parentRDDType +
|
||||
"[" + rddSizeBeforeCheckpoint + " --> " + rddSizeAfterCheckpoint + "]"
|
||||
)
|
||||
}
|
||||
|
@ -318,10 +321,12 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get serialized sizes of the RDD and its splits
|
||||
* Get serialized sizes of the RDD and its splits, in order to test whether the size shrinks
|
||||
* upon checkpointing. Ignores the checkpointData field, which may grow when we checkpoint.
|
||||
*/
|
||||
def getSerializedSizes(rdd: RDD[_]): (Int, Int) = {
|
||||
(Utils.serialize(rdd).size, Utils.serialize(rdd.splits).size)
|
||||
(Utils.serialize(rdd).length - Utils.serialize(rdd.checkpointData).length,
|
||||
Utils.serialize(rdd.splits).length)
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -46,7 +46,7 @@ public class JavaAPISuite implements Serializable {
|
|||
sc.stop();
|
||||
sc = null;
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port");
|
||||
System.clearProperty("spark.driver.port");
|
||||
}
|
||||
|
||||
static class ReverseIntComparator implements Comparator<Integer>, Serializable {
|
||||
|
|
|
@ -26,7 +26,7 @@ object LocalSparkContext {
|
|||
def stop(sc: SparkContext) {
|
||||
sc.stop()
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
|
||||
/** Runs `f` by passing in `sc` and ensures that `sc` is stopped. */
|
||||
|
|
|
@ -78,10 +78,9 @@ class MapOutputTrackerSuite extends FunSuite with LocalSparkContext {
|
|||
|
||||
test("remote fetch") {
|
||||
try {
|
||||
System.clearProperty("spark.master.host") // In case some previous test had set it
|
||||
val (actorSystem, boundPort) =
|
||||
AkkaUtils.createActorSystem("test", "localhost", 0)
|
||||
System.setProperty("spark.master.port", boundPort.toString)
|
||||
System.clearProperty("spark.driver.host") // In case some previous test had set it
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", "localhost", 0)
|
||||
System.setProperty("spark.driver.port", boundPort.toString)
|
||||
val masterTracker = new MapOutputTracker(actorSystem, true)
|
||||
val slaveTracker = new MapOutputTracker(actorSystem, false)
|
||||
masterTracker.registerShuffle(10, 1)
|
||||
|
@ -106,7 +105,7 @@ class MapOutputTrackerSuite extends FunSuite with LocalSparkContext {
|
|||
// failure should be cached
|
||||
intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) }
|
||||
} finally {
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,9 +12,10 @@ class RDDSuite extends FunSuite with LocalSparkContext {
|
|||
val nums = sc.makeRDD(Array(1, 2, 3, 4), 2)
|
||||
assert(nums.collect().toList === List(1, 2, 3, 4))
|
||||
val dups = sc.makeRDD(Array(1, 1, 2, 2, 3, 3, 4, 4), 2)
|
||||
assert(dups.distinct.count === 4)
|
||||
assert(dups.distinct().collect === dups.distinct.collect)
|
||||
assert(dups.distinct(2).collect === dups.distinct.collect)
|
||||
assert(dups.distinct().count() === 4)
|
||||
assert(dups.distinct.count === 4) // Can distinct and count be called without parentheses?
|
||||
assert(dups.distinct.collect === dups.distinct().collect)
|
||||
assert(dups.distinct(2).collect === dups.distinct().collect)
|
||||
assert(nums.reduce(_ + _) === 10)
|
||||
assert(nums.fold(0)(_ + _) === 10)
|
||||
assert(nums.map(_.toString).collect().toList === List("1", "2", "3", "4"))
|
||||
|
@ -31,6 +32,10 @@ class RDDSuite extends FunSuite with LocalSparkContext {
|
|||
case(split, iter) => Iterator((split, iter.reduceLeft(_ + _)))
|
||||
}
|
||||
assert(partitionSumsWithSplit.collect().toList === List((0, 3), (1, 7)))
|
||||
|
||||
intercept[UnsupportedOperationException] {
|
||||
nums.filter(_ > 5).reduce(_ + _)
|
||||
}
|
||||
}
|
||||
|
||||
test("SparkContext.union") {
|
||||
|
@ -164,7 +169,7 @@ class RDDSuite extends FunSuite with LocalSparkContext {
|
|||
// Note that split number starts from 0, so > 8 means only 10th partition left.
|
||||
val prunedRdd = new PartitionPruningRDD(data, splitNum => splitNum > 8)
|
||||
assert(prunedRdd.splits.size === 1)
|
||||
val prunedData = prunedRdd.collect
|
||||
val prunedData = prunedRdd.collect()
|
||||
assert(prunedData.size === 1)
|
||||
assert(prunedData(0) === 10)
|
||||
}
|
||||
|
|
663 core/src/test/scala/spark/scheduler/DAGSchedulerSuite.scala (new normal file)
|
@ -0,0 +1,663 @@
|
|||
package spark.scheduler
|
||||
|
||||
import scala.collection.mutable.{Map, HashMap}
|
||||
|
||||
import org.scalatest.FunSuite
|
||||
import org.scalatest.BeforeAndAfter
|
||||
import org.scalatest.concurrent.TimeLimitedTests
|
||||
import org.scalatest.mock.EasyMockSugar
|
||||
import org.scalatest.time.{Span, Seconds}
|
||||
|
||||
import org.easymock.EasyMock._
|
||||
import org.easymock.Capture
|
||||
import org.easymock.EasyMock
|
||||
import org.easymock.{IAnswer, IArgumentMatcher}
|
||||
|
||||
import akka.actor.ActorSystem
|
||||
|
||||
import spark.storage.BlockManager
|
||||
import spark.storage.BlockManagerId
|
||||
import spark.storage.BlockManagerMaster
|
||||
import spark.{Dependency, ShuffleDependency, OneToOneDependency}
|
||||
import spark.FetchFailedException
|
||||
import spark.MapOutputTracker
|
||||
import spark.RDD
|
||||
import spark.SparkContext
|
||||
import spark.SparkException
|
||||
import spark.Split
|
||||
import spark.TaskContext
|
||||
import spark.TaskEndReason
|
||||
|
||||
import spark.{FetchFailed, Success}
|
||||
|
||||
/**
|
||||
* Tests for DAGScheduler. These tests directly call the event processing functions in DAGScheduler
|
||||
* rather than spawning an event loop thread as happens in the real code. They use EasyMock
|
||||
* to mock out two classes that DAGScheduler interacts with: TaskScheduler (to which TaskSets are
|
||||
* submitted) and BlockManagerMaster (from which cache locations are retrieved and to which dead
|
||||
* host notifications are sent). In addition, tests may check for side effects on a non-mocked
|
||||
* MapOutputTracker instance.
|
||||
*
|
||||
* Tests primarily consist of running DAGScheduler#processEvent and
|
||||
* DAGScheduler#submitWaitingStages (via test utility functions like runEvent or respondToTaskSet)
|
||||
* and capturing the resulting TaskSets from the mock TaskScheduler.
|
||||
*/
|
||||
class DAGSchedulerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar with TimeLimitedTests {
|
||||
|
||||
// impose a time limit on this test in case we don't let the job finish, in which case
|
||||
// JobWaiter#getResult will hang.
|
||||
override val timeLimit = Span(5, Seconds)
|
||||
|
||||
val sc: SparkContext = new SparkContext("local", "DAGSchedulerSuite")
|
||||
var scheduler: DAGScheduler = null
|
||||
val taskScheduler = mock[TaskScheduler]
|
||||
val blockManagerMaster = mock[BlockManagerMaster]
|
||||
var mapOutputTracker: MapOutputTracker = null
|
||||
var schedulerThread: Thread = null
|
||||
var schedulerException: Throwable = null
|
||||
|
||||
/**
|
||||
* Set of EasyMock argument matchers that match a TaskSet for a given RDD.
|
||||
* We cache these so we do not create duplicate matchers for the same RDD.
|
||||
* This allows us to easily set up a sequence of expectations for task sets for
|
||||
* that RDD.
|
||||
*/
|
||||
val taskSetMatchers = new HashMap[MyRDD, IArgumentMatcher]
|
||||
|
||||
/**
|
||||
* Set of cache locations to return from our mock BlockManagerMaster.
|
||||
* Keys are (rdd ID, partition ID). Anything not present will return an empty
|
||||
* list of cache locations silently.
|
||||
*/
|
||||
val cacheLocations = new HashMap[(Int, Int), Seq[BlockManagerId]]
|
||||
|
||||
/**
|
||||
* JobWaiter for the last JobSubmitted event we pushed. Keeping it here saves tests (most
|
||||
* of which submit only one job) from having to track it explicitly.
|
||||
*/
|
||||
var lastJobWaiter: JobWaiter[Int] = null
|
||||
|
||||
/**
|
||||
* Array into which we are accumulating the results from the last job asynchronously.
|
||||
*/
|
||||
var lastJobResult: Array[Int] = null
|
||||
|
||||
/**
|
||||
* Tell EasyMockSugar what mock objects we want to be configured by expecting {...}
|
||||
* and whenExecuting {...} */
|
||||
implicit val mocks = MockObjects(taskScheduler, blockManagerMaster)
|
||||
|
||||
/**
|
||||
* Utility function to reset mocks and set expectations on them. EasyMock wants mock objects
|
||||
* to be reset after each time their expectations are set, and we tend to check mock object
|
||||
* calls over a single call to DAGScheduler.
|
||||
*
|
||||
* We also set a default expectation here that blockManagerMaster.getLocations can be called
|
||||
* and will return values from cacheLocations.
|
||||
*/
|
||||
def resetExpecting(f: => Unit) {
|
||||
reset(taskScheduler)
|
||||
reset(blockManagerMaster)
|
||||
expecting {
|
||||
expectGetLocations()
|
||||
f
|
||||
}
|
||||
}
|
||||
|
||||
before {
|
||||
taskSetMatchers.clear()
|
||||
cacheLocations.clear()
|
||||
val actorSystem = ActorSystem("test")
|
||||
mapOutputTracker = new MapOutputTracker(actorSystem, true)
|
||||
resetExpecting {
|
||||
taskScheduler.setListener(anyObject())
|
||||
}
|
||||
whenExecuting {
|
||||
scheduler = new DAGScheduler(taskScheduler, mapOutputTracker, blockManagerMaster, null)
|
||||
}
|
||||
}
|
||||
|
||||
after {
|
||||
assert(scheduler.processEvent(StopDAGScheduler))
|
||||
resetExpecting {
|
||||
taskScheduler.stop()
|
||||
}
|
||||
whenExecuting {
|
||||
scheduler.stop()
|
||||
}
|
||||
sc.stop()
|
||||
System.clearProperty("spark.master.port")
|
||||
}
|
||||
|
||||
def makeBlockManagerId(host: String): BlockManagerId =
|
||||
BlockManagerId("exec-" + host, host, 12345)
|
||||
|
||||
/**
|
||||
* Type of RDD we use for testing. Note that we should never call the real RDD compute methods.
|
||||
* This is a pair RDD type so it can always be used in ShuffleDependencies.
|
||||
*/
|
||||
type MyRDD = RDD[(Int, Int)]
|
||||
|
||||
/**
|
||||
* Create an RDD for passing to DAGScheduler. These RDDs will use the dependencies and
|
||||
* preferredLocations (if any) that are passed to them. They are deliberately not executable
|
||||
* so we can test that DAGScheduler does not try to execute RDDs locally.
|
||||
*/
|
||||
def makeRdd(
|
||||
numSplits: Int,
|
||||
dependencies: List[Dependency[_]],
|
||||
locations: Seq[Seq[String]] = Nil
|
||||
): MyRDD = {
|
||||
val maxSplit = numSplits - 1
|
||||
return new MyRDD(sc, dependencies) {
|
||||
override def compute(split: Split, context: TaskContext): Iterator[(Int, Int)] =
|
||||
throw new RuntimeException("should not be reached")
|
||||
override def getSplits() = (0 to maxSplit).map(i => new Split {
|
||||
override def index = i
|
||||
}).toArray
|
||||
override def getPreferredLocations(split: Split): Seq[String] =
|
||||
if (locations.isDefinedAt(split.index))
|
||||
locations(split.index)
|
||||
else
|
||||
Nil
|
||||
override def toString: String = "DAGSchedulerSuiteRDD " + id
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* EasyMock matcher method. For use as an argument matcher for a TaskSet whose first task
|
||||
* is from a particular RDD.
|
||||
*/
|
||||
def taskSetForRdd(rdd: MyRDD): TaskSet = {
|
||||
val matcher = taskSetMatchers.getOrElseUpdate(rdd,
|
||||
new IArgumentMatcher {
|
||||
override def matches(actual: Any): Boolean = {
|
||||
val taskSet = actual.asInstanceOf[TaskSet]
|
||||
taskSet.tasks(0) match {
|
||||
case rt: ResultTask[_, _] => rt.rdd.id == rdd.id
|
||||
case smt: ShuffleMapTask => smt.rdd.id == rdd.id
|
||||
case _ => false
|
||||
}
|
||||
}
|
||||
override def appendTo(buf: StringBuffer) {
|
||||
buf.append("taskSetForRdd(" + rdd + ")")
|
||||
}
|
||||
})
|
||||
EasyMock.reportMatcher(matcher)
|
||||
return null
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up an EasyMock expectation so that blockManagerMaster.getLocations() is answered from
|
||||
* cacheLocations.
|
||||
*/
|
||||
def expectGetLocations(): Unit = {
|
||||
EasyMock.expect(blockManagerMaster.getLocations(anyObject().asInstanceOf[Array[String]])).
|
||||
andAnswer(new IAnswer[Seq[Seq[BlockManagerId]]] {
|
||||
override def answer(): Seq[Seq[BlockManagerId]] = {
|
||||
val blocks = getCurrentArguments()(0).asInstanceOf[Array[String]]
|
||||
return blocks.map { name =>
|
||||
val pieces = name.split("_")
|
||||
if (pieces(0) == "rdd") {
|
||||
val key = pieces(1).toInt -> pieces(2).toInt
|
||||
if (cacheLocations.contains(key)) {
|
||||
cacheLocations(key)
|
||||
} else {
|
||||
Seq[BlockManagerId]()
|
||||
}
|
||||
} else {
|
||||
Seq[BlockManagerId]()
|
||||
}
|
||||
}.toSeq
|
||||
}
|
||||
}).anyTimes()
|
||||
}
|
||||
|
||||
/**
|
||||
* Process the supplied event as if it were the top of the DAGScheduler event queue, expecting
|
||||
* the scheduler not to exit.
|
||||
*
|
||||
* After processing the event, submit waiting stages as is done on most iterations of the
|
||||
* DAGScheduler event loop.
|
||||
*/
|
||||
def runEvent(event: DAGSchedulerEvent) {
|
||||
assert(!scheduler.processEvent(event))
|
||||
scheduler.submitWaitingStages()
|
||||
}
|
||||
|
||||
/**
|
||||
* Expect a TaskSet for the specified RDD to be submitted to the TaskScheduler. Should be
|
||||
* called from a resetExpecting { ... } block.
|
||||
*
|
||||
* Returns an EasyMock Capture that will contain the task set after the stage is submitted.
|
||||
* Most tests should use interceptStage() instead of this directly.
|
||||
*/
|
||||
def expectStage(rdd: MyRDD): Capture[TaskSet] = {
|
||||
val taskSetCapture = new Capture[TaskSet]
|
||||
taskScheduler.submitTasks(and(capture(taskSetCapture), taskSetForRdd(rdd)))
|
||||
return taskSetCapture
|
||||
}
|
||||
|
||||
/**
|
||||
* Expect the supplied code snippet to submit a stage for the specified RDD.
|
||||
* Return the resulting TaskSet. First marks all the tasks as belonging to the
|
||||
* current MapOutputTracker generation.
|
||||
*/
|
||||
def interceptStage(rdd: MyRDD)(f: => Unit): TaskSet = {
|
||||
var capture: Capture[TaskSet] = null
|
||||
resetExpecting {
|
||||
capture = expectStage(rdd)
|
||||
}
|
||||
whenExecuting {
|
||||
f
|
||||
}
|
||||
val taskSet = capture.getValue
|
||||
for (task <- taskSet.tasks) {
|
||||
task.generation = mapOutputTracker.getGeneration
|
||||
}
|
||||
return taskSet
|
||||
}
|
||||
|
||||
/**
|
||||
* Send the given CompletionEvent messages for the tasks in the TaskSet.
|
||||
*/
|
||||
def respondToTaskSet(taskSet: TaskSet, results: Seq[(TaskEndReason, Any)]) {
|
||||
assert(taskSet.tasks.size >= results.size)
|
||||
for ((result, i) <- results.zipWithIndex) {
|
||||
if (i < taskSet.tasks.size) {
|
||||
runEvent(CompletionEvent(taskSet.tasks(i), result._1, result._2, Map[Long, Any]()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Assert that the supplied TaskSet has exactly the given preferredLocations.
|
||||
*/
|
||||
def expectTaskSetLocations(taskSet: TaskSet, locations: Seq[Seq[String]]) {
|
||||
assert(locations.size === taskSet.tasks.size)
|
||||
for ((expectLocs, taskLocs) <-
|
||||
taskSet.tasks.map(_.preferredLocations).zip(locations)) {
|
||||
assert(expectLocs === taskLocs)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* When we submit dummy Jobs, this is the compute function we supply. Except in a local test
|
||||
* below, we do not expect this function to ever be executed; instead, we will return results
|
||||
* directly through CompletionEvents.
|
||||
*/
|
||||
def jobComputeFunc(context: TaskContext, it: Iterator[(Int, Int)]): Int =
|
||||
it.next._1.asInstanceOf[Int]
|
||||
|
||||
|
||||
/**
|
||||
* Start a job to compute the given RDD. Returns the JobWaiter that will
|
||||
* collect the result of the job via callbacks from DAGScheduler.
|
||||
*/
|
||||
def submitRdd(rdd: MyRDD, allowLocal: Boolean = false): (JobWaiter[Int], Array[Int]) = {
|
||||
val resultArray = new Array[Int](rdd.splits.size)
|
||||
val (toSubmit, waiter) = scheduler.prepareJob[(Int, Int), Int](
|
||||
rdd,
|
||||
jobComputeFunc,
|
||||
(0 to (rdd.splits.size - 1)),
|
||||
"test-site",
|
||||
allowLocal,
|
||||
(i: Int, value: Int) => resultArray(i) = value
|
||||
)
|
||||
lastJobWaiter = waiter
|
||||
lastJobResult = resultArray
|
||||
runEvent(toSubmit)
|
||||
return (waiter, resultArray)
|
||||
}
|
||||
|
||||
/**
|
||||
* Assert that a job we started has failed.
|
||||
*/
|
||||
def expectJobException(waiter: JobWaiter[Int] = lastJobWaiter) {
|
||||
waiter.awaitResult() match {
|
||||
case JobSucceeded => fail()
|
||||
case JobFailed(_) => return
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Assert that a job we started has succeeded and has the given result.
|
||||
*/
|
||||
def expectJobResult(expected: Array[Int], waiter: JobWaiter[Int] = lastJobWaiter,
|
||||
result: Array[Int] = lastJobResult) {
|
||||
waiter.awaitResult match {
|
||||
case JobSucceeded =>
|
||||
assert(expected === result)
|
||||
case JobFailed(_) =>
|
||||
fail()
|
||||
}
|
||||
}
|
||||
|
||||
def makeMapStatus(host: String, reduces: Int): MapStatus =
|
||||
new MapStatus(makeBlockManagerId(host), Array.fill[Byte](reduces)(2))
|
||||
|
||||
test("zero split job") {
|
||||
val rdd = makeRdd(0, Nil)
|
||||
var numResults = 0
|
||||
def accumulateResult(partition: Int, value: Int) {
|
||||
numResults += 1
|
||||
}
|
||||
scheduler.runJob(rdd, jobComputeFunc, Seq(), "test-site", false, accumulateResult)
|
||||
assert(numResults === 0)
|
||||
}
|
||||
|
||||
test("run trivial job") {
|
||||
val rdd = makeRdd(1, Nil)
|
||||
val taskSet = interceptStage(rdd) { submitRdd(rdd) }
|
||||
respondToTaskSet(taskSet, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("local job") {
|
||||
val rdd = new MyRDD(sc, Nil) {
|
||||
override def compute(split: Split, context: TaskContext): Iterator[(Int, Int)] =
|
||||
Array(42 -> 0).iterator
|
||||
override def getSplits() = Array( new Split { override def index = 0 } )
|
||||
override def getPreferredLocations(split: Split) = Nil
|
||||
override def toString = "DAGSchedulerSuite Local RDD"
|
||||
}
|
||||
submitRdd(rdd, true)
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("run trivial job w/ dependency") {
|
||||
val baseRdd = makeRdd(1, Nil)
|
||||
val finalRdd = makeRdd(1, List(new OneToOneDependency(baseRdd)))
|
||||
val taskSet = interceptStage(finalRdd) { submitRdd(finalRdd) }
|
||||
respondToTaskSet(taskSet, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("cache location preferences w/ dependency") {
|
||||
val baseRdd = makeRdd(1, Nil)
|
||||
val finalRdd = makeRdd(1, List(new OneToOneDependency(baseRdd)))
|
||||
cacheLocations(baseRdd.id -> 0) =
|
||||
Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))
|
||||
val taskSet = interceptStage(finalRdd) { submitRdd(finalRdd) }
|
||||
expectTaskSetLocations(taskSet, List(Seq("hostA", "hostB")))
|
||||
respondToTaskSet(taskSet, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("trivial job failure") {
|
||||
val rdd = makeRdd(1, Nil)
|
||||
val taskSet = interceptStage(rdd) { submitRdd(rdd) }
|
||||
runEvent(TaskSetFailed(taskSet, "test failure"))
|
||||
expectJobException()
|
||||
}
|
||||
|
||||
test("run trivial shuffle") {
|
||||
val shuffleMapRdd = makeRdd(2, Nil)
|
||||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
|
||||
val shuffleId = shuffleDep.shuffleId
|
||||
val reduceRdd = makeRdd(1, List(shuffleDep))
|
||||
|
||||
val firstStage = interceptStage(shuffleMapRdd) { submitRdd(reduceRdd) }
|
||||
val secondStage = interceptStage(reduceRdd) {
|
||||
respondToTaskSet(firstStage, List(
|
||||
(Success, makeMapStatus("hostA", 1)),
|
||||
(Success, makeMapStatus("hostB", 1))
|
||||
))
|
||||
}
|
||||
assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
|
||||
Array(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")))
|
||||
respondToTaskSet(secondStage, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("run trivial shuffle with fetch failure") {
|
||||
val shuffleMapRdd = makeRdd(2, Nil)
|
||||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
|
||||
val shuffleId = shuffleDep.shuffleId
|
||||
val reduceRdd = makeRdd(2, List(shuffleDep))
|
||||
|
||||
val firstStage = interceptStage(shuffleMapRdd) { submitRdd(reduceRdd) }
|
||||
val secondStage = interceptStage(reduceRdd) {
|
||||
respondToTaskSet(firstStage, List(
|
||||
(Success, makeMapStatus("hostA", 1)),
|
||||
(Success, makeMapStatus("hostB", 1))
|
||||
))
|
||||
}
|
||||
resetExpecting {
|
||||
blockManagerMaster.removeExecutor("exec-hostA")
|
||||
}
|
||||
whenExecuting {
|
||||
respondToTaskSet(secondStage, List(
|
||||
(Success, 42),
|
||||
(FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0), null)
|
||||
))
|
||||
}
|
||||
val thirdStage = interceptStage(shuffleMapRdd) {
|
||||
scheduler.resubmitFailedStages()
|
||||
}
|
||||
val fourthStage = interceptStage(reduceRdd) {
|
||||
respondToTaskSet(thirdStage, List( (Success, makeMapStatus("hostA", 1)) ))
|
||||
}
|
||||
assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
|
||||
Array(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")))
|
||||
respondToTaskSet(fourthStage, List( (Success, 43) ))
|
||||
expectJobResult(Array(42, 43))
|
||||
}
|
||||
|
||||
test("ignore late map task completions") {
|
||||
val shuffleMapRdd = makeRdd(2, Nil)
|
||||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
|
||||
val shuffleId = shuffleDep.shuffleId
|
||||
val reduceRdd = makeRdd(2, List(shuffleDep))
|
||||
|
||||
val taskSet = interceptStage(shuffleMapRdd) { submitRdd(reduceRdd) }
|
||||
val oldGeneration = mapOutputTracker.getGeneration
|
||||
resetExpecting {
|
||||
blockManagerMaster.removeExecutor("exec-hostA")
|
||||
}
|
||||
whenExecuting {
|
||||
runEvent(ExecutorLost("exec-hostA"))
|
||||
}
|
||||
val newGeneration = mapOutputTracker.getGeneration
|
||||
assert(newGeneration > oldGeneration)
|
||||
val noAccum = Map[Long, Any]()
|
||||
// We rely on the event queue being ordered. Since the generation was just increased, this completion
|
||||
// should be ignored for being too old
|
||||
runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA", 1), noAccum))
|
||||
// should work because it's a non-failed host
|
||||
runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostB", 1), noAccum))
|
||||
// should be ignored for being too old
|
||||
runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA", 1), noAccum))
|
||||
taskSet.tasks(1).generation = newGeneration
|
||||
val secondStage = interceptStage(reduceRdd) {
|
||||
runEvent(CompletionEvent(taskSet.tasks(1), Success, makeMapStatus("hostA", 1), noAccum))
|
||||
}
|
||||
assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
|
||||
Array(makeBlockManagerId("hostB"), makeBlockManagerId("hostA")))
|
||||
respondToTaskSet(secondStage, List( (Success, 42), (Success, 43) ))
|
||||
expectJobResult(Array(42, 43))
|
||||
}
|
||||
|
||||
test("run trivial shuffle with out-of-band failure and retry") {
|
||||
val shuffleMapRdd = makeRdd(2, Nil)
|
||||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
|
||||
val shuffleId = shuffleDep.shuffleId
|
||||
val reduceRdd = makeRdd(1, List(shuffleDep))
|
||||
|
||||
val firstStage = interceptStage(shuffleMapRdd) { submitRdd(reduceRdd) }
|
||||
resetExpecting {
|
||||
blockManagerMaster.removeExecutor("exec-hostA")
|
||||
}
|
||||
whenExecuting {
|
||||
runEvent(ExecutorLost("exec-hostA"))
|
||||
}
|
||||
// DAGScheduler will immediately resubmit the stage after it appears to have no pending tasks
|
||||
// rather than marking it as failed and waiting.
|
||||
val secondStage = interceptStage(shuffleMapRdd) {
|
||||
respondToTaskSet(firstStage, List(
|
||||
(Success, makeMapStatus("hostA", 1)),
|
||||
(Success, makeMapStatus("hostB", 1))
|
||||
))
|
||||
}
|
||||
val thirdStage = interceptStage(reduceRdd) {
|
||||
respondToTaskSet(secondStage, List(
|
||||
(Success, makeMapStatus("hostC", 1))
|
||||
))
|
||||
}
|
||||
assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
|
||||
Array(makeBlockManagerId("hostC"), makeBlockManagerId("hostB")))
|
||||
respondToTaskSet(thirdStage, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("recursive shuffle failures") {
|
||||
val shuffleOneRdd = makeRdd(2, Nil)
|
||||
val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null)
|
||||
val shuffleTwoRdd = makeRdd(2, List(shuffleDepOne))
|
||||
val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null)
|
||||
val finalRdd = makeRdd(1, List(shuffleDepTwo))
|
||||
|
||||
val firstStage = interceptStage(shuffleOneRdd) { submitRdd(finalRdd) }
|
||||
val secondStage = interceptStage(shuffleTwoRdd) {
|
||||
respondToTaskSet(firstStage, List(
|
||||
(Success, makeMapStatus("hostA", 2)),
|
||||
(Success, makeMapStatus("hostB", 2))
|
||||
))
|
||||
}
|
||||
val thirdStage = interceptStage(finalRdd) {
|
||||
respondToTaskSet(secondStage, List(
|
||||
(Success, makeMapStatus("hostA", 1)),
|
||||
(Success, makeMapStatus("hostC", 1))
|
||||
))
|
||||
}
|
||||
resetExpecting {
|
||||
blockManagerMaster.removeExecutor("exec-hostA")
|
||||
}
|
||||
whenExecuting {
|
||||
respondToTaskSet(thirdStage, List(
|
||||
(FetchFailed(makeBlockManagerId("hostA"), shuffleDepTwo.shuffleId, 0, 0), null)
|
||||
))
|
||||
}
|
||||
val recomputeOne = interceptStage(shuffleOneRdd) {
|
||||
scheduler.resubmitFailedStages()
|
||||
}
|
||||
val recomputeTwo = interceptStage(shuffleTwoRdd) {
|
||||
respondToTaskSet(recomputeOne, List(
|
||||
(Success, makeMapStatus("hostA", 2))
|
||||
))
|
||||
}
|
||||
val finalStage = interceptStage(finalRdd) {
|
||||
respondToTaskSet(recomputeTwo, List(
|
||||
(Success, makeMapStatus("hostA", 1))
|
||||
))
|
||||
}
|
||||
respondToTaskSet(finalStage, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("cached post-shuffle") {
|
||||
val shuffleOneRdd = makeRdd(2, Nil)
|
||||
val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null)
|
||||
val shuffleTwoRdd = makeRdd(2, List(shuffleDepOne))
|
||||
val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null)
|
||||
val finalRdd = makeRdd(1, List(shuffleDepTwo))
|
||||
|
||||
val firstShuffleStage = interceptStage(shuffleOneRdd) { submitRdd(finalRdd) }
|
||||
cacheLocations(shuffleTwoRdd.id -> 0) = Seq(makeBlockManagerId("hostD"))
|
||||
cacheLocations(shuffleTwoRdd.id -> 1) = Seq(makeBlockManagerId("hostC"))
|
||||
val secondShuffleStage = interceptStage(shuffleTwoRdd) {
|
||||
respondToTaskSet(firstShuffleStage, List(
|
||||
(Success, makeMapStatus("hostA", 2)),
|
||||
(Success, makeMapStatus("hostB", 2))
|
||||
))
|
||||
}
|
||||
val reduceStage = interceptStage(finalRdd) {
|
||||
respondToTaskSet(secondShuffleStage, List(
|
||||
(Success, makeMapStatus("hostA", 1)),
|
||||
(Success, makeMapStatus("hostB", 1))
|
||||
))
|
||||
}
|
||||
resetExpecting {
|
||||
blockManagerMaster.removeExecutor("exec-hostA")
|
||||
}
|
||||
whenExecuting {
|
||||
respondToTaskSet(reduceStage, List(
|
||||
(FetchFailed(makeBlockManagerId("hostA"), shuffleDepTwo.shuffleId, 0, 0), null)
|
||||
))
|
||||
}
|
||||
// DAGScheduler should notice the cached copy of the second shuffle and try to get it rerun.
|
||||
val recomputeTwo = interceptStage(shuffleTwoRdd) {
|
||||
scheduler.resubmitFailedStages()
|
||||
}
|
||||
expectTaskSetLocations(recomputeTwo, Seq(Seq("hostD")))
|
||||
val finalRetry = interceptStage(finalRdd) {
|
||||
respondToTaskSet(recomputeTwo, List(
|
||||
(Success, makeMapStatus("hostD", 1))
|
||||
))
|
||||
}
|
||||
respondToTaskSet(finalRetry, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
|
||||
test("cached post-shuffle but fails") {
|
||||
val shuffleOneRdd = makeRdd(2, Nil)
|
||||
val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null)
|
||||
val shuffleTwoRdd = makeRdd(2, List(shuffleDepOne))
|
||||
val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null)
|
||||
val finalRdd = makeRdd(1, List(shuffleDepTwo))
|
||||
|
||||
val firstShuffleStage = interceptStage(shuffleOneRdd) { submitRdd(finalRdd) }
|
||||
cacheLocations(shuffleTwoRdd.id -> 0) = Seq(makeBlockManagerId("hostD"))
|
||||
cacheLocations(shuffleTwoRdd.id -> 1) = Seq(makeBlockManagerId("hostC"))
|
||||
val secondShuffleStage = interceptStage(shuffleTwoRdd) {
|
||||
respondToTaskSet(firstShuffleStage, List(
|
||||
(Success, makeMapStatus("hostA", 2)),
|
||||
(Success, makeMapStatus("hostB", 2))
|
||||
))
|
||||
}
|
||||
val reduceStage = interceptStage(finalRdd) {
|
||||
respondToTaskSet(secondShuffleStage, List(
|
||||
(Success, makeMapStatus("hostA", 1)),
|
||||
(Success, makeMapStatus("hostB", 1))
|
||||
))
|
||||
}
|
||||
resetExpecting {
|
||||
blockManagerMaster.removeExecutor("exec-hostA")
|
||||
}
|
||||
whenExecuting {
|
||||
respondToTaskSet(reduceStage, List(
|
||||
(FetchFailed(makeBlockManagerId("hostA"), shuffleDepTwo.shuffleId, 0, 0), null)
|
||||
))
|
||||
}
|
||||
val recomputeTwoCached = interceptStage(shuffleTwoRdd) {
|
||||
scheduler.resubmitFailedStages()
|
||||
}
|
||||
expectTaskSetLocations(recomputeTwoCached, Seq(Seq("hostD")))
|
||||
intercept[FetchFailedException]{
|
||||
mapOutputTracker.getServerStatuses(shuffleDepOne.shuffleId, 0)
|
||||
}
|
||||
|
||||
// Simulate the shuffle input data failing to be cached.
|
||||
cacheLocations.remove(shuffleTwoRdd.id -> 0)
|
||||
respondToTaskSet(recomputeTwoCached, List(
|
||||
(FetchFailed(null, shuffleDepOne.shuffleId, 0, 0), null)
|
||||
))
|
||||
|
||||
// After the fetch failure, DAGScheduler should recheck the cache and decide to resubmit
|
||||
// everything.
|
||||
val recomputeOne = interceptStage(shuffleOneRdd) {
|
||||
scheduler.resubmitFailedStages()
|
||||
}
|
||||
// We use hostA here to make sure DAGScheduler doesn't think it's still dead.
|
||||
val recomputeTwoUncached = interceptStage(shuffleTwoRdd) {
|
||||
respondToTaskSet(recomputeOne, List( (Success, makeMapStatus("hostA", 1)) ))
|
||||
}
|
||||
expectTaskSetLocations(recomputeTwoUncached, Seq(Seq[String]()))
|
||||
val finalRetry = interceptStage(finalRdd) {
|
||||
respondToTaskSet(recomputeTwoUncached, List( (Success, makeMapStatus("hostA", 1)) ))
|
||||
|
||||
}
|
||||
respondToTaskSet(finalRetry, List( (Success, 42) ))
|
||||
expectJobResult(Array(42))
|
||||
}
|
||||
}
|
|
@ -202,7 +202,7 @@ Apart from these, the following properties are also available, and may be useful
|
|||
<td>10</td>
|
||||
<td>
|
||||
Maximum message size to allow in "control plane" communication (for serialized tasks and task
|
||||
results), in MB. Increase this if your tasks need to send back large results to the master
|
||||
results), in MB. Increase this if your tasks need to send back large results to the driver
|
||||
(e.g. using <code>collect()</code> on a large dataset).
|
||||
</td>
|
||||
</tr>
|
||||
|
@ -211,7 +211,7 @@ Apart from these, the following properties are also available, and may be useful
|
|||
<td>4</td>
|
||||
<td>
|
||||
Number of actor threads to use for communication. Can be useful to increase on large clusters
|
||||
when the master has a lot of CPU cores.
|
||||
when the driver has a lot of CPU cores.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
@ -222,17 +222,17 @@ Apart from these, the following properties are also available, and may be useful
|
|||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>spark.master.host</td>
|
||||
<td>spark.driver.host</td>
|
||||
<td>(local hostname)</td>
|
||||
<td>
|
||||
Hostname or IP address for the master to listen on.
|
||||
Hostname or IP address for the driver to listen on.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>spark.master.port</td>
|
||||
<td>spark.driver.port</td>
|
||||
<td>(random)</td>
|
||||
<td>
|
||||
Port for the master to listen on.
|
||||
Port for the driver to listen on.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
|
|
@ -67,13 +67,20 @@ The script automatically adds the `pyspark` package to the `PYTHONPATH`.
|
|||
|
||||
# Interactive Use
|
||||
|
||||
The `pyspark` script launches a Python interpreter that is configured to run PySpark jobs.
|
||||
When run without any input files, `pyspark` launches a shell that can be used explore data interactively, which is a simple way to learn the API:
|
||||
The `pyspark` script launches a Python interpreter that is configured to run PySpark jobs. To use `pyspark` interactively, first build Spark, then launch it directly from the command line without any options:
|
||||
|
||||
{% highlight bash %}
|
||||
$ sbt/sbt package
|
||||
$ ./pyspark
|
||||
{% endhighlight %}
|
||||
|
||||
The Python shell can be used to explore data interactively and is a simple way to learn the API:
|
||||
|
||||
{% highlight python %}
|
||||
>>> words = sc.textFile("/usr/share/dict/words")
|
||||
>>> words.filter(lambda w: w.startswith("spar")).take(5)
|
||||
[u'spar', u'sparable', u'sparada', u'sparadrap', u'sparagrass']
|
||||
>>> help(pyspark) # Show all pyspark functions
|
||||
{% endhighlight %}
|
||||
|
||||
By default, the `pyspark` shell creates a SparkContext that runs jobs locally.
|
||||
|
|
|
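The same filter can also be run as a standalone script by constructing the SparkContext yourself. The following is only an illustrative sketch assembled from the APIs shown in this change set; the script name and master string are made up, not part of this commit:

{% highlight python %}
# standalone_words.py -- hypothetical example, not part of this commit
from pyspark.context import SparkContext

sc = SparkContext("local[4]", "WordFilter")   # (master, jobName)
words = sc.textFile("/usr/share/dict/words")
print words.filter(lambda w: w.startswith("spar")).take(5)
sc.stop()   # shut the context down when finished
{% endhighlight %}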
@ -50,11 +50,6 @@
|
|||
<profiles>
|
||||
<profile>
|
||||
<id>hadoop1</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>!hadoopVersion</name>
|
||||
</property>
|
||||
</activation>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.spark-project</groupId>
|
||||
|
@ -88,12 +83,6 @@
|
|||
</profile>
|
||||
<profile>
|
||||
<id>hadoop2</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>hadoopVersion</name>
|
||||
<value>2</value>
|
||||
</property>
|
||||
</activation>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.spark-project</groupId>
|
||||
|
|
17 pom.xml
|
@ -273,6 +273,12 @@
|
|||
<version>1.8</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.easymock</groupId>
|
||||
<artifactId>easymock</artifactId>
|
||||
<version>3.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalacheck</groupId>
|
||||
<artifactId>scalacheck_${scala.version}</artifactId>
|
||||
|
@ -499,11 +505,6 @@
|
|||
<profiles>
|
||||
<profile>
|
||||
<id>hadoop1</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>!hadoopVersion</name>
|
||||
</property>
|
||||
</activation>
|
||||
|
||||
<properties>
|
||||
<hadoop.major.version>1</hadoop.major.version>
|
||||
|
@ -521,12 +522,6 @@
|
|||
|
||||
<profile>
|
||||
<id>hadoop2</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>hadoopVersion</name>
|
||||
<value>2</value>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<hadoop.major.version>2</hadoop.major.version>
|
||||
</properties>
|
||||
|
|
|
@ -92,7 +92,8 @@ object SparkBuild extends Build {
|
|||
"org.eclipse.jetty" % "jetty-server" % "7.5.3.v20111011",
|
||||
"org.scalatest" %% "scalatest" % "1.8" % "test",
|
||||
"org.scalacheck" %% "scalacheck" % "1.9" % "test",
|
||||
"com.novocode" % "junit-interface" % "0.8" % "test"
|
||||
"com.novocode" % "junit-interface" % "0.8" % "test",
|
||||
"org.easymock" % "easymock" % "3.1" % "test"
|
||||
),
|
||||
parallelExecution := false,
|
||||
/* Workaround for issue #206 (fixed after SBT 0.11.0) */
|
||||
|
|
|
@ -196,12 +196,3 @@ def _start_update_server():
|
|||
thread.daemon = True
|
||||
thread.start()
|
||||
return server
|
||||
|
||||
|
||||
def _test():
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_test()
|
||||
|
|
|
@ -37,12 +37,3 @@ class Broadcast(object):
|
|||
def __reduce__(self):
|
||||
self._pickle_registry.add(self)
|
||||
return (_from_id, (self.bid, ))
|
||||
|
||||
|
||||
def _test():
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_test()
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
import os
|
||||
import atexit
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from threading import Lock
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
|
@ -24,11 +22,10 @@ class SparkContext(object):
|
|||
broadcast variables on that cluster.
|
||||
"""
|
||||
|
||||
gateway = launch_gateway()
|
||||
jvm = gateway.jvm
|
||||
_readRDDFromPickleFile = jvm.PythonRDD.readRDDFromPickleFile
|
||||
_writeIteratorToPickleFile = jvm.PythonRDD.writeIteratorToPickleFile
|
||||
_takePartition = jvm.PythonRDD.takePartition
|
||||
_gateway = None
|
||||
_jvm = None
|
||||
_writeIteratorToPickleFile = None
|
||||
_takePartition = None
|
||||
_next_accum_id = 0
|
||||
_active_spark_context = None
|
||||
_lock = Lock()
|
||||
|
@ -56,6 +53,13 @@ class SparkContext(object):
|
|||
raise ValueError("Cannot run multiple SparkContexts at once")
|
||||
else:
|
||||
SparkContext._active_spark_context = self
|
||||
if not SparkContext._gateway:
|
||||
SparkContext._gateway = launch_gateway()
|
||||
SparkContext._jvm = SparkContext._gateway.jvm
|
||||
SparkContext._writeIteratorToPickleFile = \
|
||||
SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
|
||||
SparkContext._takePartition = \
|
||||
SparkContext._jvm.PythonRDD.takePartition
|
||||
self.master = master
|
||||
self.jobName = jobName
|
||||
self.sparkHome = sparkHome or None # None becomes null in Py4J
|
||||
|
@ -63,8 +67,8 @@ class SparkContext(object):
|
|||
self.batchSize = batchSize  # -1 represents an unlimited batch size
|
||||
|
||||
# Create the Java SparkContext through Py4J
|
||||
empty_string_array = self.gateway.new_array(self.jvm.String, 0)
|
||||
self._jsc = self.jvm.JavaSparkContext(master, jobName, sparkHome,
|
||||
empty_string_array = self._gateway.new_array(self._jvm.String, 0)
|
||||
self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
|
||||
empty_string_array)
|
||||
|
||||
# Create a single Accumulator in Java that we'll send all our updates through;
|
||||
|
@ -72,8 +76,8 @@ class SparkContext(object):
|
|||
self._accumulatorServer = accumulators._start_update_server()
|
||||
(host, port) = self._accumulatorServer.server_address
|
||||
self._javaAccumulator = self._jsc.accumulator(
|
||||
self.jvm.java.util.ArrayList(),
|
||||
self.jvm.PythonAccumulatorParam(host, port))
|
||||
self._jvm.java.util.ArrayList(),
|
||||
self._jvm.PythonAccumulatorParam(host, port))
|
||||
|
||||
self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
|
||||
# Broadcast's __reduce__ method stores Broadcast instances here.
|
||||
|
@ -88,6 +92,11 @@ class SparkContext(object):
|
|||
SparkFiles._sc = self
|
||||
sys.path.append(SparkFiles.getRootDirectory())
|
||||
|
||||
# Create a temporary directory inside spark.local.dir:
|
||||
local_dir = self._jvm.spark.Utils.getLocalDir()
|
||||
self._temp_dir = \
|
||||
self._jvm.spark.Utils.createTempDir(local_dir).getAbsolutePath()
|
||||
|
||||
@property
|
||||
def defaultParallelism(self):
|
||||
"""
|
||||
|
@ -120,14 +129,14 @@ class SparkContext(object):
|
|||
# Calling the Java parallelize() method with an ArrayList is too slow,
|
||||
# because it sends O(n) Py4J commands. As an alternative, serialized
|
||||
# objects are written to a file and loaded through textFile().
|
||||
tempFile = NamedTemporaryFile(delete=False)
|
||||
atexit.register(lambda: os.unlink(tempFile.name))
|
||||
tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
|
||||
if self.batchSize != 1:
|
||||
c = batched(c, self.batchSize)
|
||||
for x in c:
|
||||
write_with_length(dump_pickle(x), tempFile)
|
||||
tempFile.close()
|
||||
jrdd = self._readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
|
||||
readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
|
||||
jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
|
||||
return RDD(jrdd, self)
|
||||
|
||||
def textFile(self, name, minSplits=None):
|
||||
|
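The comment in the hunk above describes the workaround: rather than issuing one Py4J command per element, the driver pickles the whole collection into a temp file under spark.local.dir and has the JVM load that file in a single call. A rough sketch of just the write side, under the assumption that dump_pickle, write_with_length and batched are the pyspark.serializers helpers already used in this diff:

{% highlight python %}
from tempfile import NamedTemporaryFile
# Import path assumed; these are the serializer helpers referenced above.
from pyspark.serializers import dump_pickle, write_with_length, batched

def write_to_temp_file(elements, batch_size, temp_dir):
    # One bulk file write replaces O(n) Py4J round trips.
    temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
    if batch_size != 1:
        elements = batched(elements, batch_size)
    for x in elements:
        write_with_length(dump_pickle(x), temp_file)
    temp_file.close()
    return temp_file.name   # handed to the JVM's readRDDFromPickleFile
{% endhighlight %}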
@ -240,13 +249,17 @@ class SparkContext(object):
|
|||
|
||||
|
||||
def _test():
|
||||
import atexit
|
||||
import doctest
|
||||
import tempfile
|
||||
globs = globals().copy()
|
||||
globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
|
||||
globs['tempdir'] = tempfile.mkdtemp()
|
||||
atexit.register(lambda: shutil.rmtree(globs['tempdir']))
|
||||
doctest.testmod(globs=globs)
|
||||
(failure_count, test_count) = doctest.testmod(globs=globs)
|
||||
globs['sc'].stop()
|
||||
if failure_count:
|
||||
exit(-1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -35,4 +35,4 @@ class SparkFiles(object):
|
|||
return cls._root_directory
|
||||
else:
|
||||
# This will have to change if we support multiple SparkContexts:
|
||||
return cls._sc.jvm.spark.SparkFiles.getRootDirectory()
|
||||
return cls._sc._jvm.spark.SparkFiles.getRootDirectory()
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import atexit
|
||||
from base64 import standard_b64encode as b64enc
|
||||
import copy
|
||||
from collections import defaultdict
|
||||
|
@ -264,12 +263,8 @@ class RDD(object):
|
|||
# Transferring lots of data through Py4J can be slow because
|
||||
# socket.readline() is inefficient. Instead, we'll dump the data to a
|
||||
# file and read it back.
|
||||
tempFile = NamedTemporaryFile(delete=False)
|
||||
tempFile = NamedTemporaryFile(delete=False, dir=self.ctx._temp_dir)
|
||||
tempFile.close()
|
||||
def clean_up_file():
|
||||
try: os.unlink(tempFile.name)
|
||||
except: pass
|
||||
atexit.register(clean_up_file)
|
||||
self.ctx._writeIteratorToPickleFile(iterator, tempFile.name)
|
||||
# Read the data into Python and deserialize it:
|
||||
with open(tempFile.name, 'rb') as tempFile:
|
||||
|
@ -377,6 +372,10 @@ class RDD(object):
|
|||
items = []
|
||||
for partition in range(self._jrdd.splits().size()):
|
||||
iterator = self.ctx._takePartition(self._jrdd.rdd(), partition)
|
||||
# Each item in the iterator is a string, a Python object, or a batch of
|
||||
# Python objects. Regardless, it is sufficient to take `num`
|
||||
# of these objects in order to collect `num` Python objects:
|
||||
iterator = iterator.take(num)
|
||||
items.extend(self._collect_iterator_through_file(iterator))
|
||||
if len(items) >= num:
|
||||
break
|
||||
|
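Stripped of the Py4J plumbing, the control flow of the loop above is simply: pull one partition at a time and stop as soon as enough objects have been gathered. A sketch under assumed names (fetch_partition stands in for the _takePartition/_collect_iterator_through_file pair, and is not part of this commit):

{% highlight python %}
def take(num, num_partitions, fetch_partition):
    # fetch_partition(i) is assumed to yield the deserialized Python
    # objects of partition i (at most `num` of them are needed).
    items = []
    for i in range(num_partitions):
        items.extend(fetch_partition(i))
        if len(items) >= num:
            break
    return items[:num]
{% endhighlight %}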
@ -407,7 +406,7 @@ class RDD(object):
|
|||
return (str(x).encode("utf-8") for x in iterator)
|
||||
keyed = PipelinedRDD(self, func)
|
||||
keyed._bypass_serializer = True
|
||||
keyed._jrdd.map(self.ctx.jvm.BytesToString()).saveAsTextFile(path)
|
||||
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
|
||||
|
||||
# Pair functions
|
||||
|
||||
|
@ -550,8 +549,8 @@ class RDD(object):
|
|||
yield dump_pickle(Batch(items))
|
||||
keyed = PipelinedRDD(self, add_shuffle_key)
|
||||
keyed._bypass_serializer = True
|
||||
pairRDD = self.ctx.jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
|
||||
partitioner = self.ctx.jvm.PythonPartitioner(numSplits,
|
||||
pairRDD = self.ctx._jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
|
||||
partitioner = self.ctx._jvm.PythonPartitioner(numSplits,
|
||||
id(partitionFunc))
|
||||
jrdd = pairRDD.partitionBy(partitioner).values()
|
||||
rdd = RDD(jrdd, self.ctx)
|
||||
|
@ -730,13 +729,13 @@ class PipelinedRDD(RDD):
|
|||
pipe_command = ' '.join(b64enc(cloudpickle.dumps(f)) for f in cmds)
|
||||
broadcast_vars = ListConverter().convert(
|
||||
[x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
|
||||
self.ctx.gateway._gateway_client)
|
||||
self.ctx._gateway._gateway_client)
|
||||
self.ctx._pickled_broadcast_vars.clear()
|
||||
class_manifest = self._prev_jrdd.classManifest()
|
||||
env = copy.copy(self.ctx.environment)
|
||||
env['PYTHONPATH'] = os.environ.get("PYTHONPATH", "")
|
||||
env = MapConverter().convert(env, self.ctx.gateway._gateway_client)
|
||||
python_rdd = self.ctx.jvm.PythonRDD(self._prev_jrdd.rdd(),
|
||||
env = MapConverter().convert(env, self.ctx._gateway._gateway_client)
|
||||
python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
|
||||
pipe_command, env, self.preservesPartitioning, self.ctx.pythonExec,
|
||||
broadcast_vars, self.ctx._javaAccumulator, class_manifest)
|
||||
self._jrdd_val = python_rdd.asJavaRDD()
|
||||
|
@ -753,8 +752,10 @@ def _test():
|
|||
# The small batch size here ensures that we see multiple batches,
|
||||
# even in these small test examples:
|
||||
globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
|
||||
doctest.testmod(globs=globs)
|
||||
(failure_count, test_count) = doctest.testmod(globs=globs)
|
||||
globs['sc'].stop()
|
||||
if failure_count:
|
||||
exit(-1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -4,6 +4,7 @@ An interactive shell.
|
|||
This file is designed to be launched as a PYTHONSTARTUP script.
|
||||
"""
|
||||
import os
|
||||
import pyspark
|
||||
from pyspark.context import SparkContext
|
||||
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ class PySparkTestCase(unittest.TestCase):
|
|||
sys.path = self._old_sys_path
|
||||
# To avoid Akka rebinding to the same port, since it doesn't unbind
|
||||
# immediately on shutdown
|
||||
self.sc.jvm.System.clearProperty("spark.master.port")
|
||||
self.sc._jvm.System.clearProperty("spark.driver.port")
|
||||
|
||||
|
||||
class TestCheckpoint(PySparkTestCase):
|
||||
|
@ -108,5 +108,14 @@ class TestAddFile(PySparkTestCase):
|
|||
self.assertEqual("Hello World!", UserClass().hello())
|
||||
|
||||
|
||||
class TestIO(PySparkTestCase):
|
||||
|
||||
def test_stdout_redirection(self):
|
||||
import subprocess
|
||||
def func(x):
|
||||
subprocess.check_call('ls', shell=True)
|
||||
self.sc.parallelize([1]).foreach(func)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
"""
|
||||
Worker that receives input from Piped RDD.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from base64 import standard_b64decode
|
||||
# CloudPickler needs to be imported so that depicklers are registered using the
|
||||
# copy_reg module.
|
||||
|
@ -14,8 +16,8 @@ from pyspark.serializers import write_with_length, read_with_length, write_int,
|
|||
|
||||
|
||||
# Redirect stdout to stderr so that users must return values from functions.
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = sys.stderr
|
||||
old_stdout = os.fdopen(os.dup(1), 'w')
|
||||
os.dup2(2, 1)
|
||||
|
||||
|
||||
def load_obj():
|
||||
|
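The two-line replacement above moves the redirection from the Python level (reassigning sys.stdout) down to the file-descriptor level, so output written by child processes spawned from user code is redirected to stderr as well, while the worker keeps a private handle on the real stdout for its framed protocol. A minimal sketch of the same pattern in isolation (the strings here are illustrative):

{% highlight python %}
import os

# Keep a Python file object wrapping the original stdout (fd 1)...
real_stdout = os.fdopen(os.dup(1), 'w')
# ...then make fd 1 an alias of fd 2, so anything written to stdout --
# including by forked subprocesses -- lands on stderr instead.
os.dup2(2, 1)

print "user-visible diagnostics"              # goes to stderr now
real_stdout.write("framed protocol bytes\n")  # still reaches the real stdout
real_stdout.flush()
{% endhighlight %}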
@ -40,8 +42,13 @@ def main():
|
|||
else:
|
||||
dumps = dump_pickle
|
||||
iterator = read_from_pickle_file(sys.stdin)
|
||||
for obj in func(split_index, iterator):
|
||||
write_with_length(dumps(obj), old_stdout)
|
||||
try:
|
||||
for obj in func(split_index, iterator):
|
||||
write_with_length(dumps(obj), old_stdout)
|
||||
except Exception as e:
|
||||
write_int(-2, old_stdout)
|
||||
write_with_length(traceback.format_exc(), old_stdout)
|
||||
sys.exit(-1)
|
||||
# Mark the beginning of the accumulators section of the output
|
||||
write_int(-1, old_stdout)
|
||||
for aid, accum in _accumulatorRegistry.items():
|
||||
|
|
|
@ -70,11 +70,6 @@
|
|||
<profiles>
|
||||
<profile>
|
||||
<id>hadoop1</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>!hadoopVersion</name>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<classifier>hadoop1</classifier>
|
||||
</properties>
|
||||
|
@ -115,12 +110,6 @@
|
|||
</profile>
|
||||
<profile>
|
||||
<id>hadoop2</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>hadoopVersion</name>
|
||||
<value>2</value>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<classifier>hadoop2</classifier>
|
||||
</properties>
|
||||
|
|
11 repl/pom.xml
|
@ -72,11 +72,6 @@
|
|||
<profiles>
|
||||
<profile>
|
||||
<id>hadoop1</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>!hadoopVersion</name>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<classifier>hadoop1</classifier>
|
||||
</properties>
|
||||
|
@ -128,12 +123,6 @@
|
|||
</profile>
|
||||
<profile>
|
||||
<id>hadoop2</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>hadoopVersion</name>
|
||||
<value>2</value>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<classifier>hadoop2</classifier>
|
||||
</properties>
|
||||
|
|
|
@ -31,7 +31,7 @@ class ReplSuite extends FunSuite {
|
|||
if (interp.sparkContext != null)
|
||||
interp.sparkContext.stop()
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
return out.toString
|
||||
}
|
||||
|
||||
|
|
|
@ -83,11 +83,6 @@
|
|||
<profiles>
|
||||
<profile>
|
||||
<id>hadoop1</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>!hadoopVersion</name>
|
||||
</property>
|
||||
</activation>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.spark-project</groupId>
|
||||
|
@ -115,12 +110,6 @@
|
|||
</profile>
|
||||
<profile>
|
||||
<id>hadoop2</id>
|
||||
<activation>
|
||||
<property>
|
||||
<name>hadoopVersion</name>
|
||||
<value>2</value>
|
||||
</property>
|
||||
</activation>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.spark-project</groupId>
|
||||
|
|
|
@ -198,10 +198,10 @@ abstract class DStream[T: ClassManifest] (
|
|||
metadataCleanerDelay < 0 || rememberDuration.milliseconds < metadataCleanerDelay * 1000,
|
||||
"It seems you are doing some DStream window operation or setting a checkpoint interval " +
|
||||
"which requires " + this.getClass.getSimpleName + " to remember generated RDDs for more " +
|
||||
"than " + rememberDuration.milliseconds + " milliseconds. But the Spark's metadata cleanup" +
|
||||
"delay is set to " + (metadataCleanerDelay / 60.0) + " minutes, which is not sufficient. Please set " +
|
||||
"the Java property 'spark.cleaner.delay' to more than " +
|
||||
math.ceil(rememberDuration.milliseconds.toDouble / 60000.0).toInt + " minutes."
|
||||
"than " + rememberDuration.milliseconds / 1000 + " seconds. But Spark's metadata cleanup" +
|
||||
"delay is set to " + metadataCleanerDelay + " seconds, which is not sufficient. Please " +
|
||||
"set the Java property 'spark.cleaner.delay' to more than " +
|
||||
math.ceil(rememberDuration.milliseconds / 1000.0).toInt + " seconds."
|
||||
)
|
||||
|
||||
dependencies.foreach(_.validate())
|
||||
|
|
|
@ -389,7 +389,7 @@ object StreamingContext {
|
|||
// Set the default cleaner delay to an hour if not already set.
|
||||
// This should be sufficient for even 1 second interval.
|
||||
if (MetadataCleaner.getDelaySeconds < 0) {
|
||||
MetadataCleaner.setDelaySeconds(60)
|
||||
MetadataCleaner.setDelaySeconds(3600)
|
||||
}
|
||||
new SparkContext(master, frameworkName)
|
||||
}
|
||||
|
|
|
@ -153,8 +153,8 @@ abstract class NetworkReceiver[T: ClassManifest]() extends Serializable with Log
|
|||
/** A helper actor that communicates with the NetworkInputTracker */
|
||||
private class NetworkReceiverActor extends Actor {
|
||||
logInfo("Attempting to register with tracker")
|
||||
val ip = System.getProperty("spark.master.host", "localhost")
|
||||
val port = System.getProperty("spark.master.port", "7077").toInt
|
||||
val ip = System.getProperty("spark.driver.host", "localhost")
|
||||
val port = System.getProperty("spark.driver.port", "7077").toInt
|
||||
val url = "akka://spark@%s:%s/user/NetworkInputTracker".format(ip, port)
|
||||
val tracker = env.actorSystem.actorFor(url)
|
||||
val timeout = 5.seconds
|
||||
|
|
|
@ -43,7 +43,7 @@ public class JavaAPISuite implements Serializable {
|
|||
ssc = null;
|
||||
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port");
|
||||
System.clearProperty("spark.driver.port");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -10,7 +10,7 @@ class BasicOperationsSuite extends TestSuiteBase {
|
|||
|
||||
after {
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
|
||||
test("map") {
|
||||
|
|
|
@ -19,7 +19,7 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
|
|||
FileUtils.deleteDirectory(new File(checkpointDir))
|
||||
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
|
||||
var ssc: StreamingContext = null
|
||||
|
|
|
@ -24,7 +24,7 @@ class FailureSuite extends TestSuiteBase with BeforeAndAfter {
|
|||
FileUtils.deleteDirectory(new File(checkpointDir))
|
||||
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
|
||||
override def framework = "CheckpointSuite"
|
||||
|
|
|
@ -42,7 +42,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
|
|||
}
|
||||
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
|
||||
test("network input stream") {
|
||||
|
|
|
@ -13,7 +13,7 @@ class WindowOperationsSuite extends TestSuiteBase {
|
|||
|
||||
after {
|
||||
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
|
||||
System.clearProperty("spark.master.port")
|
||||
System.clearProperty("spark.driver.port")
|
||||
}
|
||||
|
||||
val largerSlideInput = Seq(
|
||||
|
|