2012-12-05 01:10:25 -05:00
|
|
|
package spark
|
|
|
|
|
|
|
|
import org.apache.hadoop.fs.Path
|
2012-12-20 14:52:23 -05:00
|
|
|
import rdd.{CheckpointRDD, CoalescedRDD}
|
2012-12-11 02:36:37 -05:00
|
|
|
import scheduler.{ResultTask, ShuffleMapTask}
|
2012-12-05 01:10:25 -05:00
|
|
|
|
2012-12-11 02:36:37 -05:00
|
|
|
/**
 * Enumeration tracking the lifecycle of an RDD as it moves through checkpointing.
 *
 * Legal transitions:
 * [ Initialized --> marked for checkpointing --> checkpointing in progress --> checkpointed ]
 */
private[spark] object CheckpointState extends Enumeration {
  type CheckpointState = Value

  // Declared in transition order, so the Enumeration ids reflect the state machine.
  val Initialized = Value
  val MarkedForCheckpoint = Value
  val CheckpointingInProgress = Value
  val Checkpointed = Value
}
|
2012-12-05 01:10:25 -05:00
|
|
|
|
2012-12-17 21:52:43 -05:00
|
|
|
/**
 * This class contains all the information related to RDD checkpointing. Each instance of this
 * class is associated with an RDD. It manages the process of checkpointing the associated RDD,
 * and afterwards serves the updated splits, iterator and preferred locations from the
 * checkpointed data.
 */
private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T])
  extends Logging with Serializable {

  import CheckpointState._

  // Current position of the associated RDD in the checkpoint state machine.
  var cpState = Initialized

  // Path of the checkpoint file once one has been written (not serialized with the RDD).
  @transient var cpFile: Option[String] = None

  // The CheckpointRDD reconstructed from the checkpoint file; becomes the RDD's new parent.
  var cpRDD: Option[RDD[T]] = None

  // Request that the associated RDD be checkpointed.
  // Only a freshly Initialized RDD can be marked; any later state is left untouched.
  def markForCheckpoint(): Unit = RDDCheckpointData.synchronized {
    if (cpState == Initialized) {
      cpState = MarkedForCheckpoint
    }
  }

  // Whether checkpointing has fully completed for the associated RDD.
  def isCheckpointed: Boolean =
    RDDCheckpointData.synchronized { cpState == Checkpointed }

  // Path of the file this RDD was checkpointed to, if any.
  def getCheckpointFile: Option[String] =
    RDDCheckpointData.synchronized { cpFile }

  // Perform the checkpointing of the RDD. Called after the first job using that RDD is over.
  def doCheckpoint(): Unit = {
    // Claim the work: only the caller that flips the state from MarkedForCheckpoint to
    // CheckpointingInProgress actually performs the checkpoint; everyone else is a no-op.
    val claimed = RDDCheckpointData.synchronized {
      if (cpState == MarkedForCheckpoint) {
        cpState = CheckpointingInProgress
        true
      } else {
        false
      }
    }

    if (claimed) {
      // Save the RDD's contents to a file, then reload that file as a new RDD.
      val path = new Path(rdd.context.checkpointDir.get, "rdd-" + rdd.id).toString
      rdd.context.runJob(rdd, CheckpointRDD.writeToFile(path) _)
      val newRDD = new CheckpointRDD[T](rdd.context, path)

      // Swap the RDD over to its checkpointed parent and publish the final state.
      RDDCheckpointData.synchronized {
        cpFile = Some(path)
        cpRDD = Some(newRDD)
        rdd.markCheckpointed(newRDD)   // Update the RDD's dependencies and splits
        cpState = Checkpointed
        RDDCheckpointData.clearTaskCaches()
        logInfo("Done checkpointing RDD " + rdd.id + ", new parent is RDD " + newRDD.id)
      }
    }
  }

  // Preferred locations of a split, served from the checkpointed parent.
  // NOTE(review): assumes checkpointing already completed — cpRDD.get throws otherwise.
  def getPreferredLocations(split: Split): Seq[String] =
    RDDCheckpointData.synchronized { cpRDD.get.preferredLocations(split) }

  // Splits of the checkpointed parent RDD (same precondition as getPreferredLocations).
  def getSplits: Array[Split] =
    RDDCheckpointData.synchronized { cpRDD.get.splits }

  // The CheckpointRDD standing in as the RDD's parent, if checkpointing has happened.
  def checkpointRDD: Option[RDD[T]] =
    RDDCheckpointData.synchronized { cpRDD }
}
|
|
|
|
|
|
|
|
private[spark] object RDDCheckpointData {
  // Drop the cached serialized task closures so that subsequently launched tasks are
  // built against the RDD's new checkpointed parent rather than a stale lineage.
  def clearTaskCaches(): Unit = {
    ShuffleMapTask.clearCache()
    ResultTask.clearCache()
  }
}
|