[SPARK-2887] Fix countApproxDistinct() bug when the RDD has more than one partition

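Fix a bug in countApproxDistinct() when the RDD has more than one partition: the function that merges two HyperLogLogPlus sketches called h1.addAll(h2) but then returned h2 instead of the merged h1. It now returns h1, and the test builds its RDD with 10 partitions so the merge path is exercised.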

Author: Davies Liu <davies.liu@gmail.com>

Closes #1812 from davies/approx and squashes the following commits:

bf757ce [Davies Liu] Fix countApproxDistinct() bug when the RDD has more than one partition
This commit is contained in:
Davies Liu 2014-08-06 21:22:13 -07:00 committed by Patrick Wendell
parent a263a7e9f0
commit ffd1f59a62
2 changed files with 6 additions and 6 deletions

View file

@ -1004,7 +1004,7 @@ abstract class RDD[T: ClassTag](
},
(h1: HyperLogLogPlus, h2: HyperLogLogPlus) => {
h1.addAll(h2)
h2
h1
}).cardinality()
}

View file

@ -81,11 +81,11 @@ class RDDSuite extends FunSuite with SharedSparkContext {
// Relative error of an estimate: |est - size| as a fraction of the true size.
def error(est: Long, size: Long) = {
  val deviation = math.abs(est - size)
  deviation / size.toDouble
}
val size = 100
val uniformDistro = for (i <- 1 to 100000) yield i % size
val simpleRdd = sc.makeRDD(uniformDistro)
assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1)
val size = 1000
val uniformDistro = for (i <- 1 to 5000) yield i % size
val simpleRdd = sc.makeRDD(uniformDistro, 10)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
}
test("SparkContext.union") {