Merge pull request #225 from pwendell/dev

Log message which records RDD origin
This commit is contained in:
Matei Zaharia 2012-09-28 16:28:07 -07:00
commit 9f6efbf06a
2 changed files with 39 additions and 1 deletions

View file

@ -61,6 +61,9 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
// Produce the elements of the given split as an iterator. Declared without a body here.
def compute(split: Split): Iterator[T]
// The dependencies of this RDD. Declared without a value here and marked @transient.
@transient val dependencies: List[Dependency[_]]
// Record the Spark method and user call site that generated this RDD. Evaluated once while the
// RDD is being constructed, because getOriginDescription inspects the current thread's stack.
val origin = getOriginDescription
// Optionally overridden by subclasses to specify how they are partitioned; None by default.
val partitioner: Option[Partitioner] = None
@ -124,6 +127,38 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
}
}
/**
 * Describe which Spark method and which user code location generated this RDD.
 *
 * The current thread's stack trace is walked from the innermost frame outwards. The leading
 * run of frames that belong to the `spark` package (excluding `spark.examples`) is treated as
 * Spark-internal; the outermost frame of that run names the Spark method the user invoked
 * (an RDD transformation, a SparkContext function such as parallelize, or anything else that
 * leads to instantiation of an RDD). The first frame after that run is taken as the user
 * call site.
 *
 * Only meaningful when called from the constructor, while the frames that created the RDD
 * are still on the stack.
 *
 * @return a string of the form "<spark method> at: <user method> (<file>:<line>)"; any part
 *         that could not be determined is "<not_found>" (or -1 for the line number).
 */
def getOriginDescription : String = {
  // Discard the frame(s) belonging to getStackTrace itself.
  val trace = Thread.currentThread().getStackTrace().filter(el =>
    !el.getMethodName().contains("getStackTrace"))

  // A frame is Spark-internal only if its class lives in the `spark` package. Matching on the
  // package prefix (rather than any occurrence of "spark" in the name) keeps user classes such
  // as com.acme.sparkjobs.Main from being mistaken for Spark code.
  def isSparkFrame(el: StackTraceElement): Boolean = {
    val className = el.getClassName()
    className.startsWith("spark.") && !className.startsWith("spark.examples")
  }

  // sparkFrames: the contiguous Spark frames at the top of the stack (deepest first).
  // callerFrames: everything from the first user frame outwards.
  val (sparkFrames, callerFrames) = trace.span(isSparkFrame)

  // The last (shallowest) contiguous Spark method is the one the user actually called.
  val lastSparkMethod = sparkFrames.lastOption.map(_.getMethodName()).getOrElse("<not_found>")

  // The first (deepest) user frame supplies the method, file and line of the call site.
  val firstUserFrame = callerFrames.headOption
  val firstUserMethod = firstUserFrame.map(_.getMethodName()).getOrElse("<not_found>")
  val firstUserFile = firstUserFrame.map(_.getFileName()).getOrElse("<not_found>")
  val firstUserLine = firstUserFrame.map(_.getLineNumber()).getOrElse(-1)

  "%s at: %s (%s:%s)".format(lastSparkMethod, firstUserMethod, firstUserFile, firstUserLine)
}
// Transformations (return a new RDD)
/**
 * Return a new RDD built by wrapping this RDD in a MappedRDD together with the function `f`.
 * The function is passed through `sc.clean` before being handed to the MappedRDD.
 */
def map[U: ClassManifest](f: T => U): RDD[U] = {
  val cleanedFn = sc.clean(f)
  new MappedRDD(this, cleanedFn)
}

View file

@ -337,7 +337,8 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
if (missing == Nil) {
logInfo("Submitting " + stage + ", which has no missing parents")
logInfo("Submitting " + stage + " from " + stage.rdd.origin +
", which has no missing parents")
submitMissingTasks(stage)
running += stage
} else {
@ -452,6 +453,8 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with
waiting --= newlyRunnable
running ++= newlyRunnable
for (stage <- newlyRunnable.sortBy(_.id)) {
logInfo("Submitting " + stage + " from " + stage.rdd.origin +
" which is now runnable")
submitMissingTasks(stage)
}
}