// spark-instrumented-optimizer/core/src/main/scala/spark/RDDCheckpointData.scala

package spark

import org.apache.hadoop.fs.Path

import rdd.CoalescedRDD
import scheduler.{ResultTask, ShuffleMapTask}

/**
 * Enumeration to manage state transitions of an RDD through checkpointing
 *
 * [ Initialized --> marked for checkpointing --> checkpointing in progress --> checkpointed ]
 */
private[spark] object CheckpointState extends Enumeration {
  type CheckpointState = Value
  val Initialized, MarkedForCheckpoint, CheckpointingInProgress, Checkpointed = Value
}
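
// Every state read and write below synchronizes on the RDDCheckpointData companion
// object, so these transitions only ever move left to right.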

/**
 * This class contains all the information related to RDD checkpointing.
 */
private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T])
  extends Logging with Serializable {

  import CheckpointState._

  var cpState = Initialized
  @transient var cpFile: Option[String] = None
  @transient var cpRDD: Option[RDD[T]] = None

  // Mark the RDD for checkpointing
  def markForCheckpoint() {
    RDDCheckpointData.synchronized {
      if (cpState == Initialized) cpState = MarkedForCheckpoint
    }
  }
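
  // Usage sketch (assumed caller behavior, not defined in this file): RDD.checkpoint()
  // is what invokes markForCheckpoint(), and doCheckpoint() runs once the first job
  // using this RDD finishes. The names below (sc, data) are hypothetical:
  //
  //   sc.setCheckpointDir("/tmp/checkpoints")          // where "rdd-<id>" files go
  //   val data = sc.parallelize(1 to 1000).map(_ * 2)
  //   data.checkpoint()   // Initialized --> MarkedForCheckpoint
  //   data.count()        // job runs, then doCheckpoint() saves and reloads the RDD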

  // Is the RDD already checkpointed
  def isCheckpointed(): Boolean = {
    RDDCheckpointData.synchronized { cpState == Checkpointed }
  }

  // Get the file to which this RDD was checkpointed, as an Option
  def getCheckpointFile(): Option[String] = {
    RDDCheckpointData.synchronized { cpFile }
  }

  // Do the checkpointing of the RDD. Called after the first job using that RDD is over.
  def doCheckpoint() {
    // If it is marked for checkpointing AND checkpointing is not already in progress,
    // then set it to be in progress, else return
    RDDCheckpointData.synchronized {
      if (cpState == MarkedForCheckpoint) {
        cpState = CheckpointingInProgress
      } else {
        return
      }
    }
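
    // Past this point this thread has exclusive responsibility for the checkpoint:
    // any concurrent caller saw a state other than MarkedForCheckpoint and returned.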

    // Save to file, and reload it as an RDD
    val file = new Path(rdd.context.checkpointDir, "rdd-" + rdd.id).toString
    rdd.saveAsObjectFile(file)

    val newRDD = {
      val hadoopRDD = rdd.context.objectFile[T](file, rdd.splits.size)

      val oldSplits = rdd.splits.size
      val newSplits = hadoopRDD.splits.size

      logDebug("RDD splits = " + oldSplits + " --> " + newSplits)
      if (newSplits < oldSplits) {
        throw new Exception("# splits after checkpointing is less than before " +
          "[" + oldSplits + " --> " + newSplits + "]")
      } else if (newSplits > oldSplits) {
        new CoalescedRDD(hadoopRDD, rdd.splits.size)
      } else {
        hadoopRDD
      }
    }
logDebug("New RDD has " + newRDD.splits.size + " splits")
// Change the dependencies and splits of the RDD
RDDCheckpointData.synchronized {
cpFile = Some(file)
cpRDD = Some(newRDD)
rdd.changeDependencies(newRDD)
cpState = Checkpointed
RDDCheckpointData.checkpointCompleted()
logInfo("Done checkpointing RDD " + rdd.id + ", new parent is RDD " + newRDD.id)
}
}
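
  // Why the split-count checks in doCheckpoint: the saved object file is reloaded
  // through Hadoop's block-based input splitting, so the reloaded RDD may come back
  // with more splits than the original; CoalescedRDD shrinks it back to the original
  // count (an assumption about the reload path, sketched with hypothetical sizes):
  //
  //   oldSplits = 4, newSplits = 6  -->  new CoalescedRDD(hadoopRDD, 4)
  //   oldSplits = 4, newSplits = 4  -->  hadoopRDD used as-is
  //   oldSplits = 4, newSplits = 2  -->  Exception (splits were lost)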

  // Get preferred location of a split after checkpointing
  def getPreferredLocations(split: Split) = {
    RDDCheckpointData.synchronized {
      cpRDD.get.preferredLocations(split)
    }
  }

  def getSplits: Array[Split] = {
    RDDCheckpointData.synchronized {
      cpRDD.get.splits
    }
  }

  // Get iterator. This is called at the worker nodes.
  def iterator(split: Split): Iterator[T] = {
    rdd.firstParent[T].iterator(split)
  }
}
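
// End-to-end sketch of the effect on lineage (assuming changeDependencies makes the
// reloaded RDD the first parent, so iterator() above reads the saved data instead of
// recomputing the original chain; the input path is hypothetical):
//
//   val lengths = sc.textFile("hdfs://host/input").map(_.length)
//   lengths.checkpoint()
//   lengths.count()            // triggers doCheckpoint()
//   lengths.isCheckpointed()   // true; lineage now starts at the saved file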

private[spark] object RDDCheckpointData {
  // Called once checkpointing is done: clear the cached serialized task state so
  // that tasks launched from now on are serialized with the truncated lineage.
  def checkpointCompleted() {
    ShuffleMapTask.clearCache()
    ResultTask.clearCache()
  }
}