spark-instrumented-optimizer/core/src/main/scala/spark/RDDCheckpointData.scala

package spark

import org.apache.hadoop.fs.Path
import rdd.{CheckpointRDD, CoalescedRDD}
import scheduler.{ResultTask, ShuffleMapTask}

/**
 * Enumeration to manage state transitions of an RDD through checkpointing
 *
 * [ Initialized --> marked for checkpointing --> checkpointing in progress --> checkpointed ]
 */
private[spark] object CheckpointState extends Enumeration {
  type CheckpointState = Value
  val Initialized, MarkedForCheckpoint, CheckpointingInProgress, Checkpointed = Value
}
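
// The transitions above are driven by RDDCheckpointData below: markForCheckpoint()
// moves Initialized -> MarkedForCheckpoint, and doCheckpoint() moves
// MarkedForCheckpoint -> CheckpointingInProgress -> Checkpointed once the RDD
// has been written out and reloaded.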

/**
 * This class contains all the information related to RDD checkpointing.
 */
private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T])
  extends Logging with Serializable {

  import CheckpointState._

  // The checkpoint state of the associated RDD
  var cpState = Initialized

  // The file to which the associated RDD has been checkpointed, if any
  @transient var cpFile: Option[String] = None

  // The CheckpointRDD created from the checkpoint file, i.e. the new parent of the associated RDD
  @transient var cpRDD: Option[RDD[T]] = None
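
  // Note: all state reads and writes below synchronize on the RDDCheckpointData
  // companion object, so checkpoint state transitions are atomic. Since that
  // object is a singleton, the lock is shared process-wide across all RDDs
  // being checkpointed.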

  // Mark the RDD for checkpointing
  def markForCheckpoint() {
    RDDCheckpointData.synchronized {
      if (cpState == Initialized) cpState = MarkedForCheckpoint
    }
  }

  // Is the RDD already checkpointed?
  def isCheckpointed(): Boolean = {
    RDDCheckpointData.synchronized { cpState == Checkpointed }
  }

  // Get the file to which this RDD was checkpointed, if any, as an Option
  def getCheckpointFile(): Option[String] = {
    RDDCheckpointData.synchronized { cpFile }
  }

  // Do the checkpointing of the RDD. Called after the first job using this RDD is done.
  def doCheckpoint() {
    // If it is marked for checkpointing AND checkpointing is not already in progress,
    // then set it to be in progress, else return
    RDDCheckpointData.synchronized {
      if (cpState == MarkedForCheckpoint) {
        cpState = CheckpointingInProgress
      } else {
        return
      }
    }

    // Save to file, and reload it as an RDD
    val path = new Path(rdd.context.checkpointDir, "rdd-" + rdd.id).toString
    rdd.context.runJob(rdd, CheckpointRDD.writeToFile(path) _)
    val newRDD = new CheckpointRDD[T](rdd.context, path)

    // Change the dependencies and splits of the RDD, so that its lineage now starts
    // from the checkpoint file rather than from the original parents
    RDDCheckpointData.synchronized {
      cpFile = Some(path)
      cpRDD = Some(newRDD)
      rdd.changeDependencies(newRDD)
      cpState = Checkpointed
      RDDCheckpointData.clearTaskCaches()
      logInfo("Done checkpointing RDD " + rdd.id + ", new parent is RDD " + newRDD.id)
    }
  }

  // Get the preferred locations of a split after checkpointing
  def getPreferredLocations(split: Split) = {
    RDDCheckpointData.synchronized {
      cpRDD.get.preferredLocations(split)
    }
  }

  // Get the splits of the RDD after checkpointing
  def getSplits: Array[Split] = {
    RDDCheckpointData.synchronized {
      cpRDD.get.splits
    }
  }

  // Get the iterator for a split. This is called on the worker nodes.
  def iterator(split: Split, context: TaskContext): Iterator[T] = {
    rdd.firstParent[T].iterator(split, context)
  }
}

private[spark] object RDDCheckpointData {
  // Clear the cached serialized tasks, since they may hold references to the
  // RDD's old dependencies, which have just been replaced by the CheckpointRDD
  def clearTaskCaches() {
    ShuffleMapTask.clearCache()
    ResultTask.clearCache()
  }
}
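
// A minimal usage sketch (illustrative, not part of this file): from user code,
// checkpointing proceeds roughly as follows, assuming a SparkContext `sc`:
//
//   sc.setCheckpointDir("/tmp/checkpoints")  // becomes rdd.context.checkpointDir
//   val rdd = sc.parallelize(1 to 1000).map(_ * 2)
//   rdd.checkpoint()       // marks the RDD (markForCheckpoint() above)
//   rdd.count()            // runs the first job; doCheckpoint() is invoked after it
//   rdd.isCheckpointed()   // now true; the lineage is truncated at the CheckpointRDD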