Optimization for count()

This commit is contained in:
Matei Zaharia 2011-05-13 10:41:34 -07:00
parent 4b1f0f1ce4
commit 16c886a581

View file

@ -107,7 +107,14 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) {
}
def count(): Long = {
sc.runJob(this, (iter: Iterator[T]) => iter.size.toLong).sum
sc.runJob(this, (iter: Iterator[T]) => {
var result = 0L
while (iter.hasNext) {
result += 1L
iter.next
}
result
}).sum
}
def toArray(): Array[T] = collect()