[SPARK-2887] Fix bug in countApproxDistinct() when the RDD has more than one partition
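Fix a bug in countApproxDistinct() when the RDD has more than one partition: the combiner that merges per-partition HyperLogLogPlus sketches called h1.addAll(h2) but then returned h2 instead of the merged h1, so the other partitions' contributions were dropped. Author: Davies Liu <davies.liu@gmail.com>. Closes #1812 from davies/approx and squashes the following commits: bf757ce [Davies Liu] fix bug of countApproxDistinct() when have more than one partition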
This commit is contained in:
parent
a263a7e9f0
commit
ffd1f59a62
|
@ -1004,7 +1004,7 @@ abstract class RDD[T: ClassTag](
|
|||
},
|
||||
(h1: HyperLogLogPlus, h2: HyperLogLogPlus) => {
|
||||
h1.addAll(h2)
|
||||
h2
|
||||
h1
|
||||
}).cardinality()
|
||||
}
|
||||
|
||||
|
|
|
@ -81,11 +81,11 @@ class RDDSuite extends FunSuite with SharedSparkContext {
|
|||
|
||||
def error(est: Long, size: Long) = math.abs(est - size) / size.toDouble
|
||||
|
||||
val size = 100
|
||||
val uniformDistro = for (i <- 1 to 100000) yield i % size
|
||||
val simpleRdd = sc.makeRDD(uniformDistro)
|
||||
assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4)
|
||||
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1)
|
||||
val size = 1000
|
||||
val uniformDistro = for (i <- 1 to 5000) yield i % size
|
||||
val simpleRdd = sc.makeRDD(uniformDistro, 10)
|
||||
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
|
||||
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
|
||||
}
|
||||
|
||||
test("SparkContext.union") {
|
||||
|
|
Loading…
Reference in a new issue