Fixed bug in UnionRDD and CoGroupedRDD
parent 746afc2e65
commit fa28f25619
@@ -15,8 +15,11 @@ import spark.Split
 import java.io.{ObjectOutputStream, IOException}
 
 private[spark] sealed trait CoGroupSplitDep extends Serializable
-private[spark] case class NarrowCoGroupSplitDep(rdd: RDD[_], splitIndex: Int, var split: Split = null)
-  extends CoGroupSplitDep {
+private[spark] case class NarrowCoGroupSplitDep(
+    rdd: RDD[_],
+    splitIndex: Int,
+    var split: Split
+  ) extends CoGroupSplitDep {
 
   @throws(classOf[IOException])
   private def writeObject(oos: ObjectOutputStream) {
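
Note: the old signature gave split a default of null, so a NarrowCoGroupSplitDep could be constructed, and even serialized, without a parent split ever being assigned; removing the default forces every call site to supply one. A minimal standalone sketch of that hazard, assuming nothing beyond plain Java serialization (Split, DummySplit, LazyDep, EagerDep and roundTrip are illustrative names, not the Spark classes):

    import java.io._

    trait Split extends Serializable { def index: Int }
    case class DummySplit(index: Int) extends Split

    // Before the fix: the split field defaults to null and must be patched later.
    case class LazyDep(splitIndex: Int, var split: Split = null)

    // After the fix: the caller has to supply the split up front.
    case class EagerDep(splitIndex: Int, split: Split)

    object DepDemo extends App {
      // Serialize and deserialize an object, as happens when a task ships to a worker.
      def roundTrip[T](obj: T): T = {
        val buf = new ByteArrayOutputStream()
        val oos = new ObjectOutputStream(buf)
        oos.writeObject(obj)
        oos.close()
        new ObjectInputStream(new ByteArrayInputStream(buf.toByteArray))
          .readObject().asInstanceOf[T]
      }
      println(roundTrip(LazyDep(0)).split)                  // null: nobody assigned it
      println(roundTrip(EagerDep(0, DummySplit(0))).split)  // DummySplit(0)
    }
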
@@ -75,7 +78,7 @@ CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner)
         case s: ShuffleDependency[_, _] =>
           new ShuffleCoGroupSplitDep(s.shuffleId): CoGroupSplitDep
         case _ =>
-          new NarrowCoGroupSplitDep(r, i): CoGroupSplitDep
+          new NarrowCoGroupSplitDep(r, i, r.splits(i)): CoGroupSplitDep
       }
     }.toList)
   }
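
With the default removed, the two-argument call new NarrowCoGroupSplitDep(r, i) above no longer compiles, which is how this hunk guarantees the parent split r.splits(i) is captured when the CoGroupedRDD builds its splits rather than left null.
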
@@ -5,13 +5,9 @@ import scala.collection.mutable.ArrayBuffer
 import spark._
 import java.io.{ObjectOutputStream, IOException}
 
-private[spark] class UnionSplit[T: ClassManifest](
-    idx: Int,
-    rdd: RDD[T],
-    splitIndex: Int,
-    var split: Split = null)
-  extends Split
-  with Serializable {
+private[spark] class UnionSplit[T: ClassManifest](idx: Int, rdd: RDD[T], splitIndex: Int)
+  extends Split {
+  var split: Split = rdd.splits(splitIndex)
 
   def iterator() = rdd.iterator(split)
   def preferredLocations() = rdd.preferredLocations(split)
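
UnionSplit gets the same fix in class form: split is now a field initializer evaluated against the parent RDD at construction time instead of a null default. A runnable sketch of the pattern with a stand-in RDD (FakeRDD, SeqSplit and UnionSplitSketch are assumed names for illustration, not the Spark API):

    trait Split extends Serializable { def index: Int }
    case class SeqSplit(index: Int) extends Split

    // A stand-in for an RDD: a sequence of partitions, each a sequence of records.
    class FakeRDD[T](val data: Seq[Seq[T]]) extends Serializable {
      def splits: Array[Split] = Array.tabulate[Split](data.length)(i => SeqSplit(i))
      def iterator(s: Split): Iterator[T] = data(s.index).iterator
    }

    // Mirrors the fixed UnionSplit: the field initializer runs when the object is
    // constructed, so the parent split is captured immediately, never left null.
    class UnionSplitSketch[T](idx: Int, rdd: FakeRDD[T], splitIndex: Int)
      extends Split {
      var split: Split = rdd.splits(splitIndex)
      def index: Int = idx
      def iterator(): Iterator[T] = rdd.iterator(split)
    }

    object UnionDemo extends App {
      val rdd = new FakeRDD(Seq(Seq(1, 2), Seq(3, 4)))
      val us = new UnionSplitSketch(0, rdd, 1)
      println(us.iterator().toList)  // List(3, 4)
    }
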
@@ -329,114 +329,10 @@ class CheckpointSuite extends FunSuite with BeforeAndAfter with Logging {
     val bytes = Utils.serialize(obj)
     Utils.deserialize[T](bytes)
   }
-  /*
-  test("Consistency check for ResultTask") {
-    // Time ----------------------->
-    // Core 1: |<- count in thread 1, task 1 ->|    |<-- checkpoint, task 1 ---->|         |<- count in thread 2, task 2 ->|
-    // Core 2: |<- count in thread 1, task 2 ->|    |<--- checkpoint, task 2 ---------->|  |<- count in thread 2, task 1 ->|
-    //                                                                               |
-    //                                                              checkpoint completed
-    sc.stop(); sc = null
-    System.clearProperty("spark.master.port")
-
-    val dir = File.createTempFile("temp_", "")
-    dir.delete()
-    val ctxt = new SparkContext("local[2]", "ResultTask")
-    ctxt.setCheckpointDir(dir.toString)
-
-    try {
-      val rdd = ctxt.makeRDD(1 to 2, 2).map(x => {
-        val state = CheckpointSuite.incrementState()
-        println("State = " + state)
-        if (state <= 3) {
-          // If executing the two tasks for the job computing rdd.count
-          // of thread 1, or the first task for the recomputation due
-          // to checkpointing (saving to HDFS), then do nothing
-        } else if (state == 4) {
-          // If executing the second task for the recomputation due to
-          // checkpointing, then prolong this task, to allow rdd.count
-          // of thread 2 to start before checkpoint of this RDD is completed
-
-          Thread.sleep(1000)
-          println("State = " + state + " wake up")
-        } else {
-          // Else executing the tasks from thread 2
-          Thread.sleep(1000)
-          println("State = " + state + " wake up")
-        }
-
-        (x, 1)
-      })
-      rdd.checkpoint()
-      val env = SparkEnv.get
-
-      val thread1 = new Thread() {
-        override def run() {
-          try {
-            SparkEnv.set(env)
-            rdd.count()
-          } catch {
-            case e: Exception => CheckpointSuite.failed("Exception in thread 1", e)
-          }
-        }
-      }
-      thread1.start()
-
-      val thread2 = new Thread() {
-        override def run() {
-          try {
-            SparkEnv.set(env)
-            CheckpointSuite.waitTillState(3)
-            println("\n\n\n\n")
-            rdd.count()
-          } catch {
-            case e: Exception => CheckpointSuite.failed("Exception in thread 2", e)
-          }
-        }
-      }
-      thread2.start()
-
-      thread1.join()
-      thread2.join()
-    } finally {
-      dir.delete()
-    }
-
-    assert(!CheckpointSuite.failed, CheckpointSuite.failureMessage)
-
-    ctxt.stop()
-
-  }
-  */
 }
 
 
 object CheckpointSuite {
-  /*
-  var state = 0
-  var failed = false
-  var failureMessage = ""
-
-  def incrementState(): Int = {
-    this.synchronized { state += 1; this.notifyAll(); state }
-  }
-
-  def getState(): Int = {
-    this.synchronized( state )
-  }
-
-  def waitTillState(s: Int) {
-    while(state < s) {
-      this.synchronized { this.wait() }
-    }
-  }
-
-  def failed(msg: String, ex: Exception) {
-    failed = true
-    failureMessage += msg + "\n" + ex + "\n\n"
-  }
-  */
-
   // This is a custom cogroup function that does not use mapValues like
   // the PairRDDFunctions.cogroup()
   def cogroup[K, V](first: RDD[(K, V)], second: RDD[(K, V)], part: Partitioner) = {
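
The surviving cogroup helper above is cut off by the diff, but its comment states the goal: a cogroup built directly on CoGroupedRDD, skipping the mapValues wrapping that PairRDDFunctions.cogroup() applies. For reference, the semantics any cogroup must reproduce, sketched with plain Scala collections (cogroupLocal is an illustrative name, not Spark code):

    // Group both datasets by key and pair up the value lists for every key
    // that appears in either input.
    def cogroupLocal[K, V](first: Seq[(K, V)], second: Seq[(K, V)]): Map[K, (Seq[V], Seq[V])] = {
      val keys = (first ++ second).map(_._1).distinct
      keys.map { k =>
        k -> (first.collect { case (`k`, v) => v }, second.collect { case (`k`, v) => v })
      }.toMap
    }

    // cogroupLocal(Seq(1 -> "a", 2 -> "b"), Seq(1 -> "x"))
    // => Map(1 -> (List("a"), List("x")), 2 -> (List("b"), List()))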