[SPARK-3081][MLLIB] rename RandomRDDGenerators to RandomRDDs
`RandomRDDGenerators` means a factory for `RandomRDDGenerator`. However, its methods return RDDs, not RDD generators, so a more proper (and shorter) name would be `RandomRDDs`. cc: dorx, brkyvz. Author: Xiangrui Meng <meng@databricks.com>. Closes #1979 from mengxr/randomrdds and squashes the following commits: b161a2d [Xiangrui Meng] rename RandomRDDGenerators to RandomRDDs
This commit is contained in:
parent
7e70708a99
commit
ac6411c6e7
|
@ -27,7 +27,7 @@ import org.apache.spark.mllib.classification._
|
|||
import org.apache.spark.mllib.clustering._
|
||||
import org.apache.spark.mllib.optimization._
|
||||
import org.apache.spark.mllib.linalg.{Matrix, SparseVector, Vector, Vectors}
|
||||
import org.apache.spark.mllib.random.{RandomRDDGenerators => RG}
|
||||
import org.apache.spark.mllib.random.{RandomRDDs => RG}
|
||||
import org.apache.spark.mllib.recommendation._
|
||||
import org.apache.spark.mllib.regression._
|
||||
import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
|
||||
package org.apache.spark.mllib.random
|
||||
|
||||
import scala.reflect.ClassTag
|
||||
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.annotation.Experimental
|
||||
import org.apache.spark.mllib.linalg.Vector
|
||||
|
@ -24,14 +26,12 @@ import org.apache.spark.mllib.rdd.{RandomVectorRDD, RandomRDD}
|
|||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
import scala.reflect.ClassTag
|
||||
|
||||
/**
|
||||
* :: Experimental ::
|
||||
* Generator methods for creating RDDs comprised of i.i.d. samples from some distribution.
|
||||
*/
|
||||
@Experimental
|
||||
object RandomRDDGenerators {
|
||||
object RandomRDDs {
|
||||
|
||||
/**
|
||||
* :: Experimental ::
|
|
@ -34,7 +34,7 @@ import org.apache.spark.util.StatCounter
|
|||
*
|
||||
* TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
|
||||
*/
|
||||
class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Serializable {
|
||||
class RandomRDDsSuite extends FunSuite with LocalSparkContext with Serializable {
|
||||
|
||||
def testGeneratedRDD(rdd: RDD[Double],
|
||||
expectedSize: Long,
|
||||
|
@ -113,18 +113,18 @@ class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Seri
|
|||
val poissonMean = 100.0
|
||||
|
||||
for (seed <- 0 until 5) {
|
||||
val uniform = RandomRDDGenerators.uniformRDD(sc, size, numPartitions, seed)
|
||||
val uniform = RandomRDDs.uniformRDD(sc, size, numPartitions, seed)
|
||||
testGeneratedRDD(uniform, size, numPartitions, 0.5, 1 / math.sqrt(12))
|
||||
|
||||
val normal = RandomRDDGenerators.normalRDD(sc, size, numPartitions, seed)
|
||||
val normal = RandomRDDs.normalRDD(sc, size, numPartitions, seed)
|
||||
testGeneratedRDD(normal, size, numPartitions, 0.0, 1.0)
|
||||
|
||||
val poisson = RandomRDDGenerators.poissonRDD(sc, poissonMean, size, numPartitions, seed)
|
||||
val poisson = RandomRDDs.poissonRDD(sc, poissonMean, size, numPartitions, seed)
|
||||
testGeneratedRDD(poisson, size, numPartitions, poissonMean, math.sqrt(poissonMean), 0.1)
|
||||
}
|
||||
|
||||
// mock distribution to check that partitions have unique seeds
|
||||
val random = RandomRDDGenerators.randomRDD(sc, new MockDistro(), 1000L, 1000, 0L)
|
||||
val random = RandomRDDs.randomRDD(sc, new MockDistro(), 1000L, 1000, 0L)
|
||||
assert(random.collect.size === random.collect.distinct.size)
|
||||
}
|
||||
|
||||
|
@ -135,13 +135,13 @@ class RandomRDDGeneratorsSuite extends FunSuite with LocalSparkContext with Seri
|
|||
val poissonMean = 100.0
|
||||
|
||||
for (seed <- 0 until 5) {
|
||||
val uniform = RandomRDDGenerators.uniformVectorRDD(sc, rows, cols, parts, seed)
|
||||
val uniform = RandomRDDs.uniformVectorRDD(sc, rows, cols, parts, seed)
|
||||
testGeneratedVectorRDD(uniform, rows, cols, parts, 0.5, 1 / math.sqrt(12))
|
||||
|
||||
val normal = RandomRDDGenerators.normalVectorRDD(sc, rows, cols, parts, seed)
|
||||
val normal = RandomRDDs.normalVectorRDD(sc, rows, cols, parts, seed)
|
||||
testGeneratedVectorRDD(normal, rows, cols, parts, 0.0, 1.0)
|
||||
|
||||
val poisson = RandomRDDGenerators.poissonVectorRDD(sc, poissonMean, rows, cols, parts, seed)
|
||||
val poisson = RandomRDDs.poissonVectorRDD(sc, poissonMean, rows, cols, parts, seed)
|
||||
testGeneratedVectorRDD(poisson, rows, cols, parts, poissonMean, math.sqrt(poissonMean), 0.1)
|
||||
}
|
||||
}
|
|
@ -25,8 +25,7 @@ from pyspark.mllib._common import _deserialize_double, _deserialize_double_vecto
|
|||
from pyspark.serializers import NoOpSerializer
|
||||
|
||||
|
||||
class RandomRDDGenerators:
|
||||
|
||||
class RandomRDDs:
|
||||
"""
|
||||
Generator methods for creating RDDs comprised of i.i.d samples from
|
||||
some distribution.
|
||||
|
@ -40,17 +39,17 @@ class RandomRDDGenerators:
|
|||
|
||||
To transform the distribution in the generated RDD from U[0.0, 1.0]
|
||||
to U[a, b], use
|
||||
C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
|
||||
C{RandomRDDs.uniformRDD(sc, n, p, seed)\
|
||||
.map(lambda v: a + (b - a) * v)}
|
||||
|
||||
>>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
|
||||
>>> x = RandomRDDs.uniformRDD(sc, 100).collect()
|
||||
>>> len(x)
|
||||
100
|
||||
>>> max(x) <= 1.0 and min(x) >= 0.0
|
||||
True
|
||||
>>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
|
||||
>>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
|
||||
4
|
||||
>>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
|
||||
>>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
|
||||
>>> parts == sc.defaultParallelism
|
||||
True
|
||||
"""
|
||||
|
@ -66,10 +65,10 @@ class RandomRDDGenerators:
|
|||
|
||||
To transform the distribution in the generated RDD from standard normal
|
||||
to some other normal N(mean, sigma), use
|
||||
C{RandomRDDGenerators.normal(sc, n, p, seed)\
|
||||
C{RandomRDDs.normal(sc, n, p, seed)\
|
||||
.map(lambda v: mean + sigma * v)}
|
||||
|
||||
>>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
|
||||
>>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
|
||||
>>> stats = x.stats()
|
||||
>>> stats.count()
|
||||
1000L
|
||||
|
@ -89,7 +88,7 @@ class RandomRDDGenerators:
|
|||
distribution with the input mean.
|
||||
|
||||
>>> mean = 100.0
|
||||
>>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
|
||||
>>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1L)
|
||||
>>> stats = x.stats()
|
||||
>>> stats.count()
|
||||
1000L
|
||||
|
@ -110,12 +109,12 @@ class RandomRDDGenerators:
|
|||
from the uniform distribution on [0.0 1.0].
|
||||
|
||||
>>> import numpy as np
|
||||
>>> mat = np.matrix(RandomRDDGenerators.uniformVectorRDD(sc, 10, 10).collect())
|
||||
>>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
|
||||
>>> mat.shape
|
||||
(10, 10)
|
||||
>>> mat.max() <= 1.0 and mat.min() >= 0.0
|
||||
True
|
||||
>>> RandomRDDGenerators.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
|
||||
>>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
|
||||
4
|
||||
"""
|
||||
jrdd = sc._jvm.PythonMLLibAPI() \
|
||||
|
@ -130,7 +129,7 @@ class RandomRDDGenerators:
|
|||
from the standard normal distribution.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> mat = np.matrix(RandomRDDGenerators.normalVectorRDD(sc, 100, 100, seed=1L).collect())
|
||||
>>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
|
||||
>>> mat.shape
|
||||
(100, 100)
|
||||
>>> abs(mat.mean() - 0.0) < 0.1
|
||||
|
@ -151,7 +150,7 @@ class RandomRDDGenerators:
|
|||
|
||||
>>> import numpy as np
|
||||
>>> mean = 100.0
|
||||
>>> rdd = RandomRDDGenerators.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
|
||||
>>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
|
||||
>>> mat = np.mat(rdd.collect())
|
||||
>>> mat.shape
|
||||
(100, 100)
|
||||
|
|
Loading…
Reference in a new issue