package spark

import java.util.concurrent.atomic.AtomicLong
import java.util.HashSet
import java.util.Random

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.Map
import scala.collection.mutable.HashMap

import SparkContext._

import mesos._

@serializable
abstract class Dependency[T](val rdd: RDD[T], val isShuffle: Boolean)

abstract class NarrowDependency[T](rdd: RDD[T])
extends Dependency(rdd, false) {
  def getParents(outputPartition: Int): Seq[Int]
}

class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
  override def getParents(partitionId: Int) = List(partitionId)
}
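
// Illustrative note (not in the original source): a NarrowDependency maps each
// output partition to the parent partitions it is computed from. For a
// OneToOneDependency over a four-split parent, getParents(2) == List(2), i.e.
// split 2 of the child depends only on split 2 of the parent.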

class ShuffleDependency[K, V, C](
  rdd: RDD[(K, V)],
  val spec: ShuffleSpec[K, V, C]
) extends Dependency(rdd, true)

@serializable
class ShuffleSpec[K, V, C](
  val createCombiner: V => C,
  val mergeValue: (C, V) => C,
  val mergeCombiners: (C, C) => C,
  val partitioner: Partitioner[K]
)
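
// Hypothetical usage sketch (the names below are assumptions, not part of this
// file): a ShuffleSpec that sums Int counts per String key, using the
// HashPartitioner sketched after the Partitioner class below.
//
//   val spec = new ShuffleSpec[String, Int, Int](
//     (v: Int) => v,                   // createCombiner: first value seen
//     (c: Int, v: Int) => c + v,       // mergeValue: fold another value in
//     (c1: Int, c2: Int) => c1 + c2,   // mergeCombiners: merge across tasks
//     new HashPartitioner[String](8))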

@serializable
abstract class Partitioner[K] {
  def numPartitions: Int
  def getPartition(key: K): Int
}
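
// Illustrative only (an assumption, not part of the original file): a minimal
// hash-based Partitioner, as used by the hypothetical ShuffleSpec example
// above.
class HashPartitioner[K](val numPartitions: Int) extends Partitioner[K] {
  def getPartition(key: K): Int = {
    val mod = key.hashCode % numPartitions
    if (mod < 0) mod + numPartitions else mod // keep the result non-negative
  }
}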

@serializable
abstract class RDD[T: ClassManifest](@transient sc: SparkContext) {
  // Methods that must be implemented by subclasses
  def splits: Array[Split]
  def iterator(split: Split): Iterator[T]
  def preferredLocations(split: Split): Seq[String]

  // Optional overrides
  val dependencies: List[Dependency[_]] = Nil
  val partitioner: Option[Partitioner[_]] = None

  def taskStarted(split: Split, slot: SlaveOffer) {}

  def sparkContext = sc

  def map[U: ClassManifest](f: T => U): RDD[U] = new MappedRDD(this, sc.clean(f))

  def filter(f: T => Boolean): RDD[T] = new FilteredRDD(this, sc.clean(f))

  def cache() = new CachedRDD(this)

  def sample(withReplacement: Boolean, frac: Double, seed: Int): RDD[T] =
    new SampledRDD(this, withReplacement, frac, seed)

  def flatMap[U: ClassManifest](f: T => Traversable[U]): RDD[U] =
    new FlatMappedRDD(this, sc.clean(f))

  def foreach(f: T => Unit) {
    val cleanF = sc.clean(f)
    val tasks = splits.map(s => new ForeachTask(this, s, cleanF)).toArray
    sc.runTaskObjects(tasks)
  }

  def collect(): Array[T] = {
    val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
    Array.concat(results: _*)
  }

  def toArray(): Array[T] = collect()

  def reduce(f: (T, T) => T): T = {
    val cleanF = sc.clean(f)
    val reducePartition: Iterator[T] => Option[T] = iter => {
      if (iter.hasNext)
        Some(iter.reduceLeft(cleanF)) // use the cleaned closure, not the raw one
      else
        None
    }
    val options = sc.runJob(this, reducePartition)
    val results = new ArrayBuffer[T]
    for (opt <- options; elem <- opt)
      results += elem
    if (results.size == 0)
      throw new UnsupportedOperationException("empty collection")
    else
      return results.reduceLeft(cleanF)
  }
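
  // Usage sketch (assuming an existing RDD[Int] named nums):
  //   val sum = nums.reduce(_ + _)
  // reduce runs one job that folds each split locally, then folds the
  // per-split results on the driver; it throws on an empty collection, which
  // count() below relies on.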

  def take(num: Int): Array[T] = {
    if (num == 0)
      return new Array[T](0)
    val buf = new ArrayBuffer[T]
    for (split <- splits; elem <- iterator(split)) {
      buf += elem
      if (buf.length == num)
        return buf.toArray
    }
    return buf.toArray
  }

  def first: T = take(1) match {
    case Array(t) => t
    case _ => throw new UnsupportedOperationException("empty collection")
  }

  def count(): Long = {
    try {
      map(x => 1L).reduce(_ + _)
    } catch {
      case e: UnsupportedOperationException => 0L // No elements in RDD
    }
  }

  def union(other: RDD[T]): RDD[T] = new UnionRDD(sc, Array(this, other))

  def ++(other: RDD[T]): RDD[T] = this.union(other)

  def splitRdd(): RDD[Array[T]] = new SplitRDD(this)

  def cartesian[U: ClassManifest](other: RDD[U]): RDD[(T, U)] =
    new CartesianRDD(sc, this, other)

  def groupBy[K](func: T => K, numSplits: Int): RDD[(K, Seq[T])] =
    this.map(t => (func(t), t)).groupByKey(numSplits)

  def groupBy[K](func: T => K): RDD[(K, Seq[T])] =
    groupBy[K](func, sc.numCores)
}
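
// Usage sketch of the RDD API above (hypothetical; assumes some RDD[String]
// named lines already exists):
//   val words = lines.flatMap(_.split(" "))
//   val longWords = words.filter(_.length > 3)
//   val byInitial = words.groupBy(w => w.charAt(0)) // shuffles into sc.numCores splits
// Transformations are lazy: each call merely wraps the parent in a new RDD
// (MappedRDD, FilteredRDD, ...); nothing executes until an action such as
// collect(), reduce() or foreach() runs a job.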

@serializable
abstract class RDDTask[U: ClassManifest, T: ClassManifest](
  val rdd: RDD[T], val split: Split)
extends Task[U] {
  override def preferredLocations() = rdd.preferredLocations(split)
  override def markStarted(slot: SlaveOffer) { rdd.taskStarted(split, slot) }
}

class ForeachTask[T: ClassManifest](
  rdd: RDD[T], split: Split, func: T => Unit)
extends RDDTask[Unit, T](rdd, split) with Logging {
  override def run() {
    logInfo("Processing " + split)
    rdd.iterator(split).foreach(func)
  }
}

class CollectTask[T](
  rdd: RDD[T], split: Split)(implicit m: ClassManifest[T])
extends RDDTask[Array[T], T](rdd, split) with Logging {
  override def run(): Array[T] = {
    logInfo("Processing " + split)
    rdd.iterator(split).toArray(m)
  }
}

class ReduceTask[T: ClassManifest](
  rdd: RDD[T], split: Split, f: (T, T) => T)
extends RDDTask[Option[T], T](rdd, split) with Logging {
  override def run(): Option[T] = {
    logInfo("Processing " + split)
    val iter = rdd.iterator(split)
    if (iter.hasNext)
      Some(iter.reduceLeft(f))
    else
      None
  }
}

class MappedRDD[U: ClassManifest, T: ClassManifest](
  prev: RDD[T], f: T => U)
extends RDD[U](prev.sparkContext) {
  override def splits = prev.splits
  override def preferredLocations(split: Split) = prev.preferredLocations(split)
  override def iterator(split: Split) = prev.iterator(split).map(f)
  override def taskStarted(split: Split, slot: SlaveOffer) = prev.taskStarted(split, slot)
  override val dependencies = List(new OneToOneDependency(prev))
}

class FilteredRDD[T: ClassManifest](
  prev: RDD[T], f: T => Boolean)
extends RDD[T](prev.sparkContext) {
  override def splits = prev.splits
  override def preferredLocations(split: Split) = prev.preferredLocations(split)
  override def iterator(split: Split) = prev.iterator(split).filter(f)
  override def taskStarted(split: Split, slot: SlaveOffer) = prev.taskStarted(split, slot)
}

class FlatMappedRDD[U: ClassManifest, T: ClassManifest](
  prev: RDD[T], f: T => Traversable[U])
extends RDD[U](prev.sparkContext) {
  override def splits = prev.splits
  override def preferredLocations(split: Split) = prev.preferredLocations(split)
  override def iterator(split: Split) =
    prev.iterator(split).toStream.flatMap(f).iterator
  override def taskStarted(split: Split, slot: SlaveOffer) = prev.taskStarted(split, slot)
}

class SplitRDD[T: ClassManifest](prev: RDD[T])
extends RDD[Array[T]](prev.sparkContext) {
  override def splits = prev.splits
  override def preferredLocations(split: Split) = prev.preferredLocations(split)
  override def iterator(split: Split) = Iterator.fromArray(Array(prev.iterator(split).toArray))
  override def taskStarted(split: Split, slot: SlaveOffer) = prev.taskStarted(split, slot)
}

@serializable class SeededSplit(val prev: Split, val seed: Int) extends Split {
  override def getId() =
    "SeededSplit(" + prev.getId() + ", seed " + seed + ")"
}

class SampledRDD[T: ClassManifest](
  prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int)
extends RDD[T](prev.sparkContext) {

  @transient val splits_ = { val rg = new Random(seed); prev.splits.map(x => new SeededSplit(x, rg.nextInt)) }

  override def splits = splits_.asInstanceOf[Array[Split]]

  override def preferredLocations(split: Split) = prev.preferredLocations(split.asInstanceOf[SeededSplit].prev)

  override def iterator(splitIn: Split) = {
    val split = splitIn.asInstanceOf[SeededSplit]
    val rg = new Random(split.seed)
    // Sampling with replacement (TODO: use reservoir sampling to make this more efficient?)
    if (withReplacement) {
      val oldData = prev.iterator(split.prev).toArray
      val sampleSize = (oldData.size * frac).ceil.toInt
      // All of oldData's indices are candidates, even if sampleSize < oldData.size
      val sampledData = for (i <- 1 to sampleSize) yield oldData(rg.nextInt(oldData.size))
      sampledData.iterator
    }
    // Sampling without replacement
    else {
      prev.iterator(split.prev).filter(x => (rg.nextDouble <= frac))
    }
  }

  override def taskStarted(split: Split, slot: SlaveOffer) = prev.taskStarted(split.asInstanceOf[SeededSplit].prev, slot)
}
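
// Usage sketch (assuming an RDD[Double] named nums):
//   val tenth = nums.sample(false, 0.1, 42)
// Without replacement each element is kept independently with probability
// frac, so the sample size is only approximately frac * size. With
// replacement, exactly ceil(frac * size) elements are drawn per split,
// uniformly over that split's data.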

class CachedRDD[T](
  prev: RDD[T])(implicit m: ClassManifest[T])
extends RDD[T](prev.sparkContext) with Logging {
  val id = CachedRDD.newId()
  @transient val cacheLocs = Map[Split, List[String]]()

  override def splits = prev.splits

  override def preferredLocations(split: Split) = {
    if (cacheLocs.contains(split))
      cacheLocs(split)
    else
      prev.preferredLocations(split)
  }

  override def iterator(split: Split): Iterator[T] = {
    val key = id + "::" + split.getId()
    logInfo("CachedRDD split key is " + key)
    val cache = CachedRDD.cache
    val loading = CachedRDD.loading
    val cachedVal = cache.get(key)
    if (cachedVal != null) {
      // Split is in cache, so just return its values
      return Iterator.fromArray(cachedVal.asInstanceOf[Array[T]])
    } else {
      // Mark the split as loading (unless someone else marks it first)
      loading.synchronized {
        if (loading.contains(key)) {
          while (loading.contains(key)) {
            try { loading.wait() } catch { case _ => }
          }
          return Iterator.fromArray(cache.get(key).asInstanceOf[Array[T]])
        } else {
          loading.add(key)
        }
      }
      // If we got here, we have to load the split
      logInfo("Loading and caching " + split)
      val array = prev.iterator(split).toArray(m)
      cache.put(key, array)
      loading.synchronized {
        loading.remove(key)
        loading.notifyAll()
      }
      return Iterator.fromArray(array)
    }
  }

  override def taskStarted(split: Split, slot: SlaveOffer) {
    val oldList = cacheLocs.getOrElse(split, Nil)
    val host = slot.getHost
    if (!oldList.contains(host))
      cacheLocs(split) = host :: oldList
  }
}

private object CachedRDD {
  val nextId = new AtomicLong(0) // Generates IDs for cached RDDs (on master)
  def newId() = nextId.getAndIncrement()

  // Stores map results for various splits locally (on workers)
  val cache = Cache.newKeySpace()

  // Remembers which splits are currently being loaded (on workers)
  val loading = new HashSet[String]
}
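
// Usage sketch (assuming an RDD named words):
//   val cached = words.cache()
//   cached.count() // first action computes each split and stores it in the key space
//   cached.count() // later actions read the cached arrays back
// The loading set plus wait()/notifyAll() in CachedRDD.iterator is what stops
// two tasks on one worker from computing the same split twice: the second
// task blocks until the first finishes loading, then reads from the cache.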

@serializable
class UnionSplit[T: ClassManifest](rdd: RDD[T], split: Split)
extends Split {
  def iterator() = rdd.iterator(split)
  def preferredLocations() = rdd.preferredLocations(split)
  override def getId() = "UnionSplit(" + split.getId() + ")"
}

@serializable
class UnionRDD[T: ClassManifest](sc: SparkContext, rdds: Seq[RDD[T]])
extends RDD[T](sc) {
  @transient val splits_ : Array[Split] = {
    val splits: Seq[Split] =
      for (rdd <- rdds; split <- rdd.splits)
        yield new UnionSplit(rdd, split)
    splits.toArray
  }

  override def splits = splits_

  override def iterator(s: Split): Iterator[T] =
    s.asInstanceOf[UnionSplit[T]].iterator()

  override def preferredLocations(s: Split): Seq[String] =
    s.asInstanceOf[UnionSplit[T]].preferredLocations()
}

@serializable class CartesianSplit(val s1: Split, val s2: Split) extends Split {
  override def getId() =
    "CartesianSplit(" + s1.getId() + ", " + s2.getId() + ")"
}

@serializable
class CartesianRDD[T: ClassManifest, U: ClassManifest](
  sc: SparkContext, rdd1: RDD[T], rdd2: RDD[U])
extends RDD[Pair[T, U]](sc) {
  @transient val splits_ = {
    // Create the cross product split
    rdd2.splits.map(y => rdd1.splits.map(x => new CartesianSplit(x, y))).flatten
  }

  override def splits = splits_.asInstanceOf[Array[Split]]

  override def preferredLocations(split: Split) = {
    val currSplit = split.asInstanceOf[CartesianSplit]
    rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)
  }

  override def iterator(split: Split) = {
    val currSplit = split.asInstanceOf[CartesianSplit]
    for (x <- rdd1.iterator(currSplit.s1); y <- rdd2.iterator(currSplit.s2)) yield (x, y)
  }

  override def taskStarted(split: Split, slot: SlaveOffer) = {
    val currSplit = split.asInstanceOf[CartesianSplit]
    rdd1.taskStarted(currSplit.s1, slot)
    rdd2.taskStarted(currSplit.s2, slot)
  }
}
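
// Usage sketch (assuming RDDs a: RDD[Int] and b: RDD[String]):
//   val pairs: RDD[(Int, String)] = a.cartesian(b)
// One CartesianSplit is created per (split of a, split of b) pair, so an
// m-split RDD crossed with an n-split RDD yields m * n splits.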

@serializable class PairRDDExtras[K, V](self: RDD[(K, V)]) {
  def reduceByKeyToDriver(func: (V, V) => V): Map[K, V] = {
    def mergeMaps(m1: HashMap[K, V], m2: HashMap[K, V]): HashMap[K, V] = {
      for ((k, v) <- m2) {
        m1.get(k) match {
          case None => m1(k) = v
          case Some(w) => m1(k) = func(w, v)
        }
      }
      return m1
    }
    self.map(pair => HashMap(pair)).reduce(mergeMaps)
  }
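
  // Usage sketch (assuming an RDD[(String, Int)] named counts):
  //   val totals: Map[String, Int] = counts.reduceByKeyToDriver(_ + _)
  // Unlike reduceByKey below, this does no shuffle: every pair becomes a
  // one-entry HashMap and the maps are merged on the driver via RDD.reduce.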

  def combineByKey[C](createCombiner: V => C,
                      mergeValue: (C, V) => C,
                      mergeCombiners: (C, C) => C,
                      numSplits: Int)
  : RDD[(K, C)] = {
    val shufClass = Class.forName(System.getProperty(
      "spark.shuffle.class", "spark.LocalFileShuffle"))
    val shuf = shufClass.newInstance().asInstanceOf[Shuffle[K, V, C]]
    shuf.compute(self, numSplits, createCombiner, mergeValue, mergeCombiners)
  }

  def reduceByKey(func: (V, V) => V, numSplits: Int): RDD[(K, V)] = {
    combineByKey[V]((v: V) => v, func, func, numSplits)
  }

  def groupByKey(numSplits: Int): RDD[(K, Seq[V])] = {
    def createCombiner(v: V) = ArrayBuffer(v)
    def mergeValue(buf: ArrayBuffer[V], v: V) = buf += v
    def mergeCombiners(b1: ArrayBuffer[V], b2: ArrayBuffer[V]) = b1 ++= b2
    val bufs = combineByKey[ArrayBuffer[V]](
      createCombiner _, mergeValue _, mergeCombiners _, numSplits)
    bufs.asInstanceOf[RDD[(K, Seq[V])]]
  }
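
  // Usage sketch (assuming an RDD[(String, Int)] named counts):
  //   val sums: RDD[(String, Int)] = counts.reduceByKey(_ + _, 4)
  //   val groups: RDD[(String, Seq[Int])] = counts.groupByKey(4)
  // Both are thin wrappers over combineByKey: reduceByKey uses an identity
  // createCombiner with func for both merge steps, while groupByKey
  // accumulates values into ArrayBuffers.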

  def join[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (V, W))] = {
    val vs: RDD[(K, Either[V, W])] = self.map { case (k, v) => (k, Left(v)) }
    val ws: RDD[(K, Either[V, W])] = other.map { case (k, w) => (k, Right(w)) }
    (vs ++ ws).groupByKey(numSplits).flatMap {
      case (k, seq) => {
        val vbuf = new ArrayBuffer[V]
        val wbuf = new ArrayBuffer[W]
        seq.foreach(_ match {
          case Left(v) => vbuf += v
          case Right(w) => wbuf += w
        })
        for (v <- vbuf; w <- wbuf) yield (k, (v, w))
      }
    }
  }
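
  // Usage sketch (assuming users: RDD[(Int, String)] and orders: RDD[(Int, Double)]):
  //   val joined: RDD[(Int, (String, Double))] = users.join(orders, 4)
  // This is an inner join: a key present on only one side leaves vbuf or
  // wbuf empty, so the for-comprehension above yields nothing for it.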

  def combineByKey[C](createCombiner: V => C,
                      mergeValue: (C, V) => C,
                      mergeCombiners: (C, C) => C)
  : RDD[(K, C)] = {
    combineByKey(createCombiner, mergeValue, mergeCombiners, numCores)
  }

  def reduceByKey(func: (V, V) => V): RDD[(K, V)] = {
    reduceByKey(func, numCores)
  }

  def groupByKey(): RDD[(K, Seq[V])] = {
    groupByKey(numCores)
  }

  def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))] = {
    join(other, numCores)
  }

  def numCores = self.sparkContext.numCores

  def collectAsMap(): Map[K, V] = HashMap(self.collect(): _*)
}
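
// End-to-end usage sketch (hypothetical word count; assumes an RDD[String]
// named lines, and that SparkContext._, imported at the top of this file,
// provides the implicit conversion from RDD[(K, V)] to PairRDDExtras):
//   val counts = lines
//     .flatMap(_.split(" "))
//     .map(word => (word, 1))
//     .reduceByKey(_ + _)
//   val asMap = counts.collectAsMap()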