Fixed bug in UnionRDD and CoGroupedRDD
parent 746afc2e65
commit fa28f25619
@@ -15,8 +15,11 @@ import spark.Split
 import java.io.{ObjectOutputStream, IOException}
 
 private[spark] sealed trait CoGroupSplitDep extends Serializable
-private[spark] case class NarrowCoGroupSplitDep(rdd: RDD[_], splitIndex: Int, var split: Split = null)
-  extends CoGroupSplitDep {
+private[spark] case class NarrowCoGroupSplitDep(
+    rdd: RDD[_],
+    splitIndex: Int,
+    var split: Split
+  ) extends CoGroupSplitDep {
 
   @throws(classOf[IOException])
   private def writeObject(oos: ObjectOutputStream) {
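
Note: the old signature gave split a default of null, so a NarrowCoGroupSplitDep could be constructed, and even serialized, without a parent split ever being assigned; removing the default forces every call site to supply one. A minimal standalone sketch of that hazard, assuming nothing beyond plain Java serialization (Split, DummySplit, LazyDep, EagerDep and roundTrip are illustrative names, not the Spark classes):

    import java.io._

    trait Split extends Serializable { def index: Int }
    case class DummySplit(index: Int) extends Split

    // Before the fix: the split field defaults to null and must be patched later.
    case class LazyDep(splitIndex: Int, var split: Split = null)

    // After the fix: the caller has to supply the split up front.
    case class EagerDep(splitIndex: Int, split: Split)

    object DepDemo extends App {
      // Serialize and deserialize an object, as happens when a task ships to a worker.
      def roundTrip[T](obj: T): T = {
        val buf = new ByteArrayOutputStream()
        val oos = new ObjectOutputStream(buf)
        oos.writeObject(obj)
        oos.close()
        new ObjectInputStream(new ByteArrayInputStream(buf.toByteArray))
          .readObject().asInstanceOf[T]
      }
      println(roundTrip(LazyDep(0)).split)                  // null: nobody assigned it
      println(roundTrip(EagerDep(0, DummySplit(0))).split)  // DummySplit(0)
    }
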
@@ -75,7 +78,7 @@ CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner)
         case s: ShuffleDependency[_, _] =>
           new ShuffleCoGroupSplitDep(s.shuffleId): CoGroupSplitDep
         case _ =>
-          new NarrowCoGroupSplitDep(r, i): CoGroupSplitDep
+          new NarrowCoGroupSplitDep(r, i, r.splits(i)): CoGroupSplitDep
       }
     }.toList)
   }
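
With the default removed, the two-argument call new NarrowCoGroupSplitDep(r, i) above no longer compiles, which is how this hunk guarantees the parent split r.splits(i) is captured when the CoGroupedRDD builds its splits rather than left null.
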
@@ -5,13 +5,9 @@ import scala.collection.mutable.ArrayBuffer
 import spark._
 import java.io.{ObjectOutputStream, IOException}
 
-private[spark] class UnionSplit[T: ClassManifest](
-    idx: Int,
-    rdd: RDD[T],
-    splitIndex: Int,
-    var split: Split = null)
-  extends Split
-  with Serializable {
+private[spark] class UnionSplit[T: ClassManifest](idx: Int, rdd: RDD[T], splitIndex: Int)
+  extends Split {
+  var split: Split = rdd.splits(splitIndex)
 
   def iterator() = rdd.iterator(split)
   def preferredLocations() = rdd.preferredLocations(split)
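
UnionSplit gets the same fix in class form: split is now a field initializer evaluated against the parent RDD at construction time instead of a null default. A runnable sketch of the pattern with a stand-in RDD (FakeRDD, SeqSplit and UnionSplitSketch are assumed names for illustration, not the Spark API):

    trait Split extends Serializable { def index: Int }
    case class SeqSplit(index: Int) extends Split

    // A stand-in for an RDD: a sequence of partitions, each a sequence of records.
    class FakeRDD[T](val data: Seq[Seq[T]]) extends Serializable {
      def splits: Array[Split] = Array.tabulate[Split](data.length)(i => SeqSplit(i))
      def iterator(s: Split): Iterator[T] = data(s.index).iterator
    }

    // Mirrors the fixed UnionSplit: the field initializer runs when the object is
    // constructed, so the parent split is captured immediately, never left null.
    class UnionSplitSketch[T](idx: Int, rdd: FakeRDD[T], splitIndex: Int)
      extends Split {
      var split: Split = rdd.splits(splitIndex)
      def index: Int = idx
      def iterator(): Iterator[T] = rdd.iterator(split)
    }

    object UnionDemo extends App {
      val rdd = new FakeRDD(Seq(Seq(1, 2), Seq(3, 4)))
      val us = new UnionSplitSketch(0, rdd, 1)
      println(us.iterator().toList)  // List(3, 4)
    }
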
@@ -329,114 +329,10 @@ class CheckpointSuite extends FunSuite with BeforeAndAfter with Logging {
     val bytes = Utils.serialize(obj)
     Utils.deserialize[T](bytes)
   }
-  /*
-  test("Consistency check for ResultTask") {
-    // Time ----------------------->
-    // Core 1: |<- count in thread 1, task 1 ->|    |<-- checkpoint, task 1 ---->|         |<- count in thread 2, task 2 ->|
-    // Core 2: |<- count in thread 1, task 2 ->|    |<--- checkpoint, task 2 ---------->|  |<- count in thread 2, task 1 ->|
-    //                                                                               |
-    //                                                              checkpoint completed
-    sc.stop(); sc = null
-    System.clearProperty("spark.master.port")
-
-    val dir = File.createTempFile("temp_", "")
-    dir.delete()
-    val ctxt = new SparkContext("local[2]", "ResultTask")
-    ctxt.setCheckpointDir(dir.toString)
-
-    try {
-      val rdd = ctxt.makeRDD(1 to 2, 2).map(x => {
-        val state = CheckpointSuite.incrementState()
-        println("State = " + state)
-        if (state <= 3) {
-          // If executing the two tasks for the job computing rdd.count
-          // of thread 1, or the first task for the recomputation due
-          // to checkpointing (saving to HDFS), then do nothing
-        } else if (state == 4) {
-          // If executing the second task for the recomputation due to
-          // checkpointing, then prolong this task, to allow rdd.count
-          // of thread 2 to start before checkpoint of this RDD is completed
-
-          Thread.sleep(1000)
-          println("State = " + state + " wake up")
-        } else {
-          // Else executing the tasks from thread 2
-          Thread.sleep(1000)
-          println("State = " + state + " wake up")
-        }
-
-        (x, 1)
-      })
-      rdd.checkpoint()
-      val env = SparkEnv.get
-
-      val thread1 = new Thread() {
-        override def run() {
-          try {
-            SparkEnv.set(env)
-            rdd.count()
-          } catch {
-            case e: Exception => CheckpointSuite.failed("Exception in thread 1", e)
-          }
-        }
-      }
-      thread1.start()
-
-      val thread2 = new Thread() {
-        override def run() {
-          try {
-            SparkEnv.set(env)
-            CheckpointSuite.waitTillState(3)
-            println("\n\n\n\n")
-            rdd.count()
-          } catch {
-            case e: Exception => CheckpointSuite.failed("Exception in thread 2", e)
-          }
-        }
-      }
-      thread2.start()
-
-      thread1.join()
-      thread2.join()
-    } finally {
-      dir.delete()
-    }
-
-    assert(!CheckpointSuite.failed, CheckpointSuite.failureMessage)
-
-    ctxt.stop()
-
-  }
-  */
 }
 
 
 object CheckpointSuite {
-  /*
-  var state = 0
-  var failed = false
-  var failureMessage = ""
-
-  def incrementState(): Int = {
-    this.synchronized { state += 1; this.notifyAll(); state }
-  }
-
-  def getState(): Int = {
-    this.synchronized( state )
-  }
-
-  def waitTillState(s: Int) {
-    while(state < s) {
-      this.synchronized { this.wait() }
-    }
-  }
-
-  def failed(msg: String, ex: Exception) {
-    failed = true
-    failureMessage += msg + "\n" + ex + "\n\n"
-  }
-  */
-
   // This is a custom cogroup function that does not use mapValues like
   // the PairRDDFunctions.cogroup()
   def cogroup[K, V](first: RDD[(K, V)], second: RDD[(K, V)], part: Partitioner) = {
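
The surviving cogroup helper above is cut off by the diff, but its comment states the goal: a cogroup built directly on CoGroupedRDD, skipping the mapValues wrapping that PairRDDFunctions.cogroup() applies. For reference, the semantics any cogroup must reproduce, sketched with plain Scala collections (cogroupLocal is an illustrative name, not Spark code):

    // Group both datasets by key and pair up the value lists for every key
    // that appears in either input.
    def cogroupLocal[K, V](first: Seq[(K, V)], second: Seq[(K, V)]): Map[K, (Seq[V], Seq[V])] = {
      val keys = (first ++ second).map(_._1).distinct
      keys.map { k =>
        k -> (first.collect { case (`k`, v) => v }, second.collect { case (`k`, v) => v })
      }.toMap
    }

    // cogroupLocal(Seq(1 -> "a", 2 -> "b"), Seq(1 -> "x"))
    // => Map(1 -> (List("a"), List("x")), 2 -> (List("b"), List()))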