Allow SparkContext.submitJob to submit a job for only a subset of the partitions.
This commit is contained in:
parent
37d8f37a8e
commit
bf515688e7
|
@ -816,6 +816,7 @@ class SparkContext(
|
|||
def submitJob[T, U, R](
|
||||
rdd: RDD[T],
|
||||
processPartition: Iterator[T] => U,
|
||||
partitions: Seq[Int],
|
||||
partitionResultHandler: (Int, U) => Unit,
|
||||
resultFunc: () => R): Future[R] =
|
||||
{
|
||||
|
@ -823,7 +824,7 @@ class SparkContext(
|
|||
val waiter = dagScheduler.submitJob(
|
||||
rdd,
|
||||
(context: TaskContext, iter: Iterator[T]) => processPartition(iter),
|
||||
0 until rdd.partitions.size,
|
||||
partitions,
|
||||
callSite,
|
||||
allowLocal = false,
|
||||
partitionResultHandler,
|
||||
|
|
|
@ -568,7 +568,7 @@ abstract class RDD[T: ClassManifest](
|
|||
def collectAsync(): Future[Seq[T]] = {
|
||||
val results = new ArrayBuffer[T]
|
||||
sc.submitJob[T, Array[T], Seq[T]](
|
||||
this, _.toArray, (index, data) => results ++= data, () => results)
|
||||
this, _.toArray, Range(0, partitions.size), (index, data) => results ++= data, () => results)
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -42,11 +42,11 @@ import org.apache.spark.util.{MetadataCleaner, TimeStampedHashMap}
|
|||
* locations to run each task on, based on the current cache status, and passes these to the
|
||||
* low-level TaskScheduler. Furthermore, it handles failures due to shuffle output files being
|
||||
* lost, in which case old stages may need to be resubmitted. Failures *within* a stage that are
|
||||
* not caused by shuffie file loss are handled by the TaskScheduler, which will retry each task
|
||||
* not caused by shuffle file loss are handled by the TaskScheduler, which will retry each task
|
||||
* a small number of times before cancelling the whole stage.
|
||||
*
|
||||
* THREADING: This class runs all its logic in a single thread executing the run() method, to which
|
||||
* events are submitted using a synchonized queue (eventQueue). The public API methods, such as
|
||||
* events are submitted using a synchronized queue (eventQueue). The public API methods, such as
|
||||
* runJob, taskEnded and executorLost, post events asynchronously to this queue. All other methods
|
||||
* should be private.
|
||||
*/
|
||||
|
|
Loading…
Reference in a new issue