[SPARK-3850] Trim trailing spaces for MLlib.

Author: Reynold Xin <rxin@databricks.com>

Closes #6534 from rxin/whitespace-mllib and squashes the following commits:

38926e3 [Reynold Xin] [SPARK-3850] Trim trailing spaces for MLlib.
parent d1d2def2f5
commit e1067d0ad1
@@ -35,13 +35,13 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with

  /**
   * Centers the data with mean before scaling.
   * It will build a dense output, so this does not work on sparse input
   * and will raise an exception.
   * Default: false
   * @group param
   */
  val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean")

  /**
   * Scales the data to unit standard deviation.
   * Default: true
@@ -68,13 +68,13 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM

  /** @group setParam */
  def setOutputCol(value: String): this.type = set(outputCol, value)

  /** @group setParam */
  def setWithMean(value: Boolean): this.type = set(withMean, value)

  /** @group setParam */
  def setWithStd(value: Boolean): this.type = set(withStd, value)

  override def fit(dataset: DataFrame): StandardScalerModel = {
    transformSchema(dataset.schema, logging = true)
    val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
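For orientation, a minimal usage sketch of the params and setters above; the spark.ml API of this era is assumed, and the DataFrame `df` with a Vector column named "features" is illustrative, not part of the diff:

  import org.apache.spark.ml.feature.StandardScaler

  val scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures")
    .setWithMean(false) // default; centering densifies sparse input
    .setWithStd(true)   // default
  val scalerModel = scaler.fit(df)       // invokes the fit shown above
  val scaled = scalerModel.transform(df)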
@@ -321,7 +321,7 @@ private class LeastSquaresAggregator(
    }
    (weightsArray, -sum + labelMean / labelStd, weightsArray.length)
  }

  private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)

  private val gradientSumArray = Array.ofDim[Double](dim)
@@ -399,7 +399,7 @@ private[python] class PythonMLLibAPI extends Serializable {
      val sigma = si.map(_.asInstanceOf[DenseMatrix])
      val gaussians = Array.tabulate(weight.length){
        i => new MultivariateGaussian(mean(i), sigma(i))
      }
      val model = new GaussianMixtureModel(weight, gaussians)
      model.predictSoft(data).map(Vectors.dense)
    }
@@ -494,7 +494,7 @@ private[python] class PythonMLLibAPI extends Serializable {
  def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
    new Normalizer(p).transform(rdd)
  }

  /**
   * Java stub for StandardScaler.fit(). This stub returns a
   * handle to the Java object instead of the content of the Java object.
@@ -36,11 +36,11 @@ import org.apache.spark.util.Utils
 * independent Gaussian distributions with associated "mixing" weights
 * specifying each's contribution to the composite.
 *
 * Given a set of sample points, this class will maximize the log-likelihood
 * for a mixture of k Gaussians, iterating until the log-likelihood changes by
 * less than convergenceTol, or until it has reached the max number of iterations.
 * While this process is generally guaranteed to converge, it is not guaranteed
 * to find a global optimum.
 *
 * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
 * This is due to high-dimensional data (a) making it difficult to cluster at all (based
@@ -53,24 +53,24 @@ import org.apache.spark.util.Utils
 */
@Experimental
class GaussianMixture private (
    private var k: Int,
    private var convergenceTol: Double,
    private var maxIterations: Int,
    private var seed: Long) extends Serializable {

  /**
   * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
   * maxIterations: 100, seed: random}.
   */
  def this() = this(2, 0.01, 100, Utils.random.nextLong())

  // number of samples per cluster to use when initializing Gaussians
  private val nSamples = 5

  // an initializing GMM can be provided rather than using the
  // default random starting point
  private var initialModel: Option[GaussianMixtureModel] = None

  /** Set the initial GMM starting point, bypassing the random initialization.
   * You must call setK() prior to calling this method, and the condition
   * (model.k == this.k) must be met; failure will result in an IllegalArgumentException
@@ -83,37 +83,37 @@ class GaussianMixture private (
    }
    this
  }

  /** Return the user supplied initial GMM, if supplied */
  def getInitialModel: Option[GaussianMixtureModel] = initialModel

  /** Set the number of Gaussians in the mixture model. Default: 2 */
  def setK(k: Int): this.type = {
    this.k = k
    this
  }

  /** Return the number of Gaussians in the mixture model */
  def getK: Int = k

  /** Set the maximum number of iterations to run. Default: 100 */
  def setMaxIterations(maxIterations: Int): this.type = {
    this.maxIterations = maxIterations
    this
  }

  /** Return the maximum number of iterations to run */
  def getMaxIterations: Int = maxIterations

  /**
   * Set the largest change in log-likelihood at which convergence is
   * considered to have occurred.
   */
  def setConvergenceTol(convergenceTol: Double): this.type = {
    this.convergenceTol = convergenceTol
    this
  }

  /**
   * Return the largest change in log-likelihood at which convergence is
   * considered to have occurred.
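Taken together, these fluent setters configure the estimator before run(). A sketch with illustrative values (defaults per the constructor doc above are k: 2, convergenceTol: 0.01, maxIterations: 100):

  import org.apache.spark.mllib.clustering.GaussianMixture

  val gmm = new GaussianMixture()
    .setK(3)
    .setConvergenceTol(0.001)
    .setMaxIterations(200)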
@@ -132,41 +132,41 @@ class GaussianMixture private (
  /** Perform expectation maximization */
  def run(data: RDD[Vector]): GaussianMixtureModel = {
    val sc = data.sparkContext

    // we will operate on the data as breeze data
    val breezeData = data.map(_.toBreeze).cache()

    // Get length of the input vectors
    val d = breezeData.first().length

    // Determine initial weights and corresponding Gaussians.
    // If the user supplied an initial GMM, we use those values, otherwise
    // we start with uniform weights, a random mean from the data, and
    // diagonal covariance matrices using component variances
    // derived from the samples
    val (weights, gaussians) = initialModel match {
      case Some(gmm) => (gmm.weights, gmm.gaussians)

      case None => {
        val samples = breezeData.takeSample(withReplacement = true, k * nSamples, seed)
        (Array.fill(k)(1.0 / k), Array.tabulate(k) { i =>
          val slice = samples.view(i * nSamples, (i + 1) * nSamples)
          new MultivariateGaussian(vectorMean(slice), initCovariance(slice))
        })
      }
    }

    var llh = Double.MinValue // current log-likelihood
    var llhp = 0.0 // previous log-likelihood

    var iter = 0
    while (iter < maxIterations && math.abs(llh-llhp) > convergenceTol) {
      // create and broadcast curried cluster contribution function
      val compute = sc.broadcast(ExpectationSum.add(weights, gaussians)_)

      // aggregate the cluster contribution for all sample points
      val sums = breezeData.aggregate(ExpectationSum.zero(k, d))(compute.value, _ += _)

      // Create new distributions based on the partial assignments
      // (often referred to as the "M" step in literature)
      val sumWeights = sums.weights.sum
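The loop above is the standard EM recursion for Gaussian mixtures. For reference, the updates it realizes can be written as (textbook EM, not quoted from the diff):

  \gamma_{ik} = \frac{w_k \,\mathcal{N}(x_i \mid \mu_k, \Sigma_k)}{\sum_j w_j \,\mathcal{N}(x_i \mid \mu_j, \Sigma_j)}, \qquad
  w_k \leftarrow \frac{1}{n} \sum_i \gamma_{ik}, \qquad
  \mu_k \leftarrow \frac{\sum_i \gamma_{ik}\, x_i}{\sum_i \gamma_{ik}}, \qquad
  \Sigma_k \leftarrow \frac{\sum_i \gamma_{ik}\, (x_i - \mu_k)(x_i - \mu_k)^\top}{\sum_i \gamma_{ik}}

ExpectationSum accumulates the per-point terms, and the M step divides by sums.weights(i), as shown in the next hunk.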
@@ -179,22 +179,22 @@ class GaussianMixture private (
        gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i))
        i = i + 1
      }

      llhp = llh // current becomes previous
      llh = sums.logLikelihood // this is the freshly computed log-likelihood
      iter += 1
    }

    new GaussianMixtureModel(weights, gaussians)
  }

  /** Average of dense breeze vectors */
  private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = {
    val v = BDV.zeros[Double](x(0).length)
    x.foreach(xi => v += xi)
    v / x.length.toDouble
  }

  /**
   * Construct matrix where diagonal entries are element-wise
   * variance of input vectors (computes biased variance)
@@ -210,14 +210,14 @@ class GaussianMixture private (
// companion class to provide zero constructor for ExpectationSum
private object ExpectationSum {
  def zero(k: Int, d: Int): ExpectationSum = {
    new ExpectationSum(0.0, Array.fill(k)(0.0),
      Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d, d)))
  }

  // compute cluster contributions for each input point
  // (U, T) => U for aggregation
  def add(
      weights: Array[Double],
      dists: Array[MultivariateGaussian])
      (sums: ExpectationSum, x: BV[Double]): ExpectationSum = {
    val p = weights.zip(dists).map {
@@ -235,7 +235,7 @@ private object ExpectationSum {
      i = i + 1
    }
    sums
  }
}

// Aggregation class for partial expectation results
@@ -244,9 +244,9 @@ private class ExpectationSum(
    val weights: Array[Double],
    val means: Array[BDV[Double]],
    val sigmas: Array[BreezeMatrix[Double]]) extends Serializable {

  val k = weights.length

  def +=(x: ExpectationSum): ExpectationSum = {
    var i = 0
    while (i < k) {
@@ -257,5 +257,5 @@ private class ExpectationSum(
    }
    logLikelihood += x.logLikelihood
    this
  }
}
@@ -34,10 +34,10 @@ import org.apache.spark.sql.{SQLContext, Row}
/**
 * :: Experimental ::
 *
 * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
 * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are
 * the respective mean and covariance for each Gaussian distribution i=1..k.
 *
 * @param weights Weights for each Gaussian distribution in the mixture, where weights(i) is
 *                the weight for Gaussian i, and weights.sum == 1
 * @param gaussians Array of MultivariateGaussian where gaussians(i) represents
@@ -45,9 +45,9 @@ import org.apache.spark.sql.{SQLContext, Row}
 */
@Experimental
class GaussianMixtureModel(
  val weights: Array[Double],
  val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {

  require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")

  override protected def formatVersion = "1.0"
@@ -64,20 +64,20 @@ class GaussianMixtureModel(
    val responsibilityMatrix = predictSoft(points)
    responsibilityMatrix.map(r => r.indexOf(r.max))
  }

  /**
   * Given the input vectors, return the membership value of each vector
   * to all mixture components.
   */
  def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = {
    val sc = points.sparkContext
    val bcDists = sc.broadcast(gaussians)
    val bcWeights = sc.broadcast(weights)
    points.map { x =>
      computeSoftAssignments(x.toBreeze.toDenseVector, bcDists.value, bcWeights.value, k)
    }
  }

  /**
   * Compute the partial assignments for each vector
   */
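A minimal sketch of both prediction paths (variable names are illustrative):

  val model = new GaussianMixture().setK(2).run(data) // data: RDD[Vector]
  val hard = model.predict(data)      // RDD[Int]: index of the most likely component
  val soft = model.predictSoft(data)  // RDD[Array[Double]]: per-component memberships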
@@ -89,7 +89,7 @@ class GaussianMixtureModel(
    val p = weights.zip(dists).map {
      case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
    }
    val pSum = p.sum
    for (i <- 0 until k) {
      p(i) /= pSum
    }
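Spelled out, the soft assignment computed above for component k is

  \gamma_k(x) = \frac{\epsilon + w_k\, p_k(x)}{\sum_j \left(\epsilon + w_j\, p_j(x)\right)}

with \epsilon = MLUtils.EPSILON guarding against all densities underflowing to zero.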
@@ -121,7 +121,7 @@ class PowerIterationClustering private[clustering] (
  import org.apache.spark.mllib.clustering.PowerIterationClustering._

  /** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100,
   * initMode: "random"}.
   */
  def this() = this(k = 2, maxIterations = 100, initMode = "random")
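For context, a sketch of configuring and running PIC against this API (the similarity triplets RDD is an assumption):

  import org.apache.spark.mllib.clustering.PowerIterationClustering

  // similarities: RDD[(Long, Long, Double)] of (srcId, dstId, similarity) entries
  val picModel = new PowerIterationClustering()
    .setK(2)
    .setMaxIterations(20)
    .setInitializationMode("degree") // default: "random", per the constructor above
    .run(similarities)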
@@ -243,7 +243,7 @@ object PowerIterationClustering extends Logging {

  /**
   * Generates random vertex properties (v0) to start power iteration.
   *
   * @param g a graph representing the normalized affinity matrix (W)
   * @return a graph with edges representing W and vertices representing a random vector
   *         with unit 1-norm
@@ -266,7 +266,7 @@ object PowerIterationClustering extends Logging {
   * Generates the degree vector as the vertex properties (v0) to start power iteration.
   * It is not exactly the node degrees but just the normalized sum similarities. Call it
   * as degree vector because it is used in the PIC paper.
   *
   * @param g a graph representing the normalized affinity matrix (W)
   * @return a graph with edges representing W and vertices representing the degree vector
   */
@@ -276,7 +276,7 @@ object PowerIterationClustering extends Logging {
    val v0 = g.vertices.mapValues(_ / sum)
    GraphImpl.fromExistingRDDs(VertexRDD(v0), g.edges)
  }

  /**
   * Runs power iteration.
   * @param g input graph with edges representing the normalized affinity matrix (W) and vertices
@@ -42,7 +42,7 @@ import org.apache.spark.util.random.XORShiftRandom
import org.apache.spark.sql.{SQLContext, Row}

/**
 * Entry in vocabulary
 */
private case class VocabWord(
  var word: String,
@@ -56,18 +56,18 @@ private case class VocabWord(
 * :: Experimental ::
 * Word2Vec creates vector representation of words in a text corpus.
 * The algorithm first constructs a vocabulary from the corpus
 * and then learns vector representation of words in the vocabulary.
 * The vector representation can be used as features in
 * natural language processing and machine learning algorithms.
 *
 * We used skip-gram model in our implementation and hierarchical softmax
 * method to train the model. The variable names in the implementation
 * matches the original C implementation.
 *
 * For original C implementation, see https://code.google.com/p/word2vec/
 * For research papers, see
 * Efficient Estimation of Word Representations in Vector Space
 * and
 * Distributed Representations of Words and Phrases and their Compositionality.
 */
@Experimental
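A minimal end-to-end sketch of this API (the corpus construction and the query word are assumptions):

  import org.apache.spark.mllib.feature.Word2Vec

  // corpus: RDD[Seq[String]] of tokenized sentences
  val w2vModel = new Word2Vec()
    .setVectorSize(100) // default: 100
    .setMinCount(5)     // default: 5
    .fit(corpus)
  val vector = w2vModel.transform("spark")          // Vector for one word
  val synonyms = w2vModel.findSynonyms("spark", 5)  // Array[(String, Double)]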
@@ -79,7 +79,7 @@ class Word2Vec extends Serializable with Logging {
  private var numIterations = 1
  private var seed = Utils.random.nextLong()
  private var minCount = 5

  /**
   * Sets vector size (default: 100).
   */
@@ -122,15 +122,15 @@ class Word2Vec extends Serializable with Logging {
    this
  }

  /**
   * Sets minCount, the minimum number of times a token must appear to be included in the word2vec
   * model's vocabulary (default: 5).
   */
  def setMinCount(minCount: Int): this.type = {
    this.minCount = minCount
    this
  }

  private val EXP_TABLE_SIZE = 1000
  private val MAX_EXP = 6
  private val MAX_CODE_LENGTH = 40
@@ -150,13 +150,13 @@ class Word2Vec extends Serializable with Logging {
      .map(x => VocabWord(
        x._1,
        x._2,
        new Array[Int](MAX_CODE_LENGTH),
        new Array[Int](MAX_CODE_LENGTH),
        0))
      .filter(_.cn >= minCount)
      .collect()
      .sortWith((a, b) => a.cn > b.cn)

    vocabSize = vocab.length
    require(vocabSize > 0, "The vocabulary size should be > 0. You may need to check " +
      "the setting of minCount, which could be large enough to remove all your words in sentences.")
@@ -198,8 +198,8 @@ class Word2Vec extends Serializable with Logging {
    }
    var pos1 = vocabSize - 1
    var pos2 = vocabSize

    var min1i = 0
    var min2i = 0

    a = 0
@@ -268,15 +268,15 @@ class Word2Vec extends Serializable with Logging {
    val words = dataset.flatMap(x => x)

    learnVocab(words)

    createBinaryTree()

    val sc = dataset.context

    val expTable = sc.broadcast(createExpTable())
    val bcVocab = sc.broadcast(vocab)
    val bcVocabHash = sc.broadcast(vocabHash)

    val sentences: RDD[Array[Int]] = words.mapPartitions { iter =>
      new Iterator[Array[Int]] {
        def hasNext: Boolean = iter.hasNext
@@ -297,7 +297,7 @@ class Word2Vec extends Serializable with Logging {
        }
      }
    }

    val newSentences = sentences.repartition(numPartitions).cache()
    val initRandom = new XORShiftRandom(seed)
@@ -402,7 +402,7 @@ class Word2Vec extends Serializable with Logging {
      }
    }
    newSentences.unpersist()

    val word2VecMap = mutable.HashMap.empty[String, Array[Float]]
    var i = 0
    while (i < vocabSize) {
@@ -480,7 +480,7 @@ class Word2VecModel private[mllib] (

  /**
   * Transforms a word to its vector representation
   * @param word a word
   * @return vector representation of word
   */
  def transform(word: String): Vector = {
@@ -495,7 +495,7 @@ class Word2VecModel private[mllib] (
  /**
   * Find synonyms of a word
   * @param word a word
   * @param num number of synonyms to find
   * @return array of (word, cosineSimilarity)
   */
  def findSynonyms(word: String, num: Int): Array[(String, Double)] = {
@@ -506,7 +506,7 @@ class Word2VecModel private[mllib] (
  /**
   * Find synonyms of the vector representation of a word
   * @param vector vector representation of a word
   * @param num number of synonyms to find
   * @return array of (word, cosineSimilarity)
   */
  def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
@@ -228,7 +228,7 @@ private[spark] object BLAS extends Serializable with Logging {
    }
    _nativeBLAS
  }

  /**
   * A := alpha * x * x^T^ + A
   * @param alpha a real scalar that will be multiplied to x * x^T^.
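Element-wise, this rank-1 symmetric update (BLAS syr) is simply

  A_{ij} \leftarrow A_{ij} + \alpha\, x_i\, x_j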
@@ -264,7 +264,7 @@ private[spark] object BLAS extends Serializable with Logging {
        j += 1
      }
      i += 1
    }
  }

  private def syr(alpha: Double, x: SparseVector, A: DenseMatrix) {
@@ -505,7 +505,7 @@ private[spark] object BLAS extends Serializable with Logging {
      nativeBLAS.dgemv(tStrA, mA, nA, alpha, A.values, mA, x.values, 1, beta,
        y.values, 1)
    }

  /**
   * y := alpha * A * x + beta * y
   * For `DenseMatrix` A and `SparseVector` x.
@@ -557,7 +557,7 @@ private[spark] object BLAS extends Serializable with Logging {
      }
    }
  }

  /**
   * y := alpha * A * x + beta * y
   * For `SparseMatrix` A and `SparseVector` x.
@@ -81,7 +81,7 @@ private[mllib] object EigenValueDecomposition {

    require(n * ncv.toLong <= Integer.MAX_VALUE && ncv * (ncv.toLong + 8) <= Integer.MAX_VALUE,
      s"k = $k and/or n = $n are too large to compute an eigendecomposition")

    var ido = new intW(0)
    var info = new intW(0)
    var resid = new Array[Double](n)
@@ -27,10 +27,10 @@ import org.apache.spark.mllib.regression.GeneralizedLinearModel
 * PMML Model Export for GeneralizedLinearModel class with binary ClassificationModel
 */
private[mllib] class BinaryClassificationPMMLModelExport(
    model : GeneralizedLinearModel,
    description : String,
    normalizationMethod : RegressionNormalizationMethodType,
    threshold: Double)
  extends PMMLModelExport {

  populateBinaryClassificationPMML()
@@ -72,7 +72,7 @@ private[mllib] class BinaryClassificationPMMLModelExport(
        .withUsageType(FieldUsageType.ACTIVE))
      regressionTableYES.withNumericPredictors(new NumericPredictor(fields(i), model.weights(i)))
    }

    // add target field
    val targetField = FieldName.create("target")
    dataDictionary
@@ -80,9 +80,9 @@ private[mllib] class BinaryClassificationPMMLModelExport(
    miningSchema
      .withMiningFields(new MiningField(targetField)
      .withUsageType(FieldUsageType.TARGET))

    dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)

    pmml.setDataDictionary(dataDictionary)
    pmml.withModels(regressionModel)
  }
@@ -25,7 +25,7 @@ import scala.beans.BeanProperty
import org.dmg.pmml.{Application, Header, PMML, Timestamp}

private[mllib] trait PMMLModelExport {

  /**
   * Holder of the exported model in PMML format
   */
@@ -33,7 +33,7 @@ private[mllib] trait PMMLModelExport {
  val pmml: PMML = new PMML

  setHeader(pmml)

  private def setHeader(pmml: PMML): Unit = {
    val version = getClass.getPackage.getImplementationVersion
    val app = new Application().withName("Apache Spark MLlib").withVersion(version)
@@ -27,9 +27,9 @@ import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.RidgeRegressionModel

private[mllib] object PMMLModelExportFactory {

  /**
   * Factory object to help creating the necessary PMMLModelExport implementation
   * taking as input the machine learning model (for example KMeansModel).
   */
  def createPMMLModelExport(model: Any): PMMLModelExport = {
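From user code this factory is reached indirectly; in the Spark 1.4 API, exportable models expose a toPMML family of helpers (this is my reading of the surrounding API, worth verifying against the source):

  // kmeansModel: a trained org.apache.spark.mllib.clustering.KMeansModel
  val pmmlXml: String = kmeansModel.toPMML()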
@@ -44,7 +44,7 @@ private[mllib] object PMMLModelExportFactory {
        new GeneralizedLinearPMMLModelExport(lasso, "lasso regression")
      case svm: SVMModel =>
        new BinaryClassificationPMMLModelExport(
          svm, "linear SVM", RegressionNormalizationMethodType.NONE,
          svm.getThreshold.getOrElse(0.0))
      case logistic: LogisticRegressionModel =>
        if (logistic.numClasses == 2) {
@@ -60,5 +60,5 @@ private[mllib] object PMMLModelExportFactory {
          "PMML Export not supported for model: " + model.getClass.getName)
    }
  }

}
@@ -234,7 +234,7 @@ object RandomRDDs {
   *
   * @param sc SparkContext used to create the RDD.
   * @param shape shape parameter (> 0) for the gamma distribution
   * @param scale scale parameter (> 0) for the gamma distribution
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
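A sketch matching these parameters (the values are illustrative):

  import org.apache.spark.mllib.random.RandomRDDs

  // one million draws from Gamma(shape = 2.0, scale = 1.5) in 4 partitions, fixed seed
  val gammaData = RandomRDDs.gammaRDD(sc, 2.0, 1.5, 1000000L, 4, seed = 11L)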
@@ -293,7 +293,7 @@ object RandomRDDs {
   *
   * @param sc SparkContext used to create the RDD.
   * @param mean mean for the log normal distribution
   * @param std standard deviation for the log normal distribution
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
@@ -671,7 +671,7 @@ object RandomRDDs {
   *
   * @param sc SparkContext used to create the RDD.
   * @param shape shape parameter (> 0) for the gamma distribution.
   * @param scale scale parameter (> 0) for the gamma distribution.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
@@ -175,7 +175,7 @@ class ALS private (
  /**
   * :: DeveloperApi ::
   * Sets storage level for final RDDs (user/product used in MatrixFactorizationModel). The default
   * value is `MEMORY_AND_DISK`. Users can change it to a serialized storage, e.g.
   * `MEMORY_AND_DISK_SER` and set `spark.rdd.compress` to `true` to reduce the space requirement,
   * at the cost of speed.
   */
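A sketch of the trade-off this doc comment describes; the setter name setFinalRDDStorageLevel is my reading of this DeveloperApi and should be checked against the source:

  import org.apache.spark.mllib.recommendation.ALS
  import org.apache.spark.storage.StorageLevel

  // note: spark.rdd.compress must be set on the SparkConf before the context is created
  val als = new ALS().setRank(10).setIterations(10)
  als.setFinalRDDStorageLevel(StorageLevel.MEMORY_AND_DISK_SER) // serialized: less space, slower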
@@ -170,15 +170,15 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
    case class Data(boundary: Double, prediction: Double)

    def save(
        sc: SparkContext,
        path: String,
        boundaries: Array[Double],
        predictions: Array[Double],
        isotonic: Boolean): Unit = {
      val sqlContext = new SQLContext(sc)

      val metadata = compact(render(
        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
          ("isotonic" -> isotonic)))
      sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path))
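From user code, this private save helper sits behind the public save/load pair (the path is a placeholder):

  // model: a trained org.apache.spark.mllib.regression.IsotonicRegressionModel
  model.save(sc, "hdfs:///models/isotonic")
  val restored = IsotonicRegressionModel.load(sc, "hdfs:///models/isotonic")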
@@ -29,102 +29,102 @@ import org.apache.spark.mllib.util.MLUtils
 * the event that the covariance matrix is singular, the density will be computed in a
 * reduced dimensional subspace under which the distribution is supported.
 * (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]])
 *
 * @param mu The mean vector of the distribution
 * @param sigma The covariance matrix of the distribution
 */
@DeveloperApi
class MultivariateGaussian (
    val mu: Vector,
    val sigma: Matrix) extends Serializable {

  require(sigma.numCols == sigma.numRows, "Covariance matrix must be square")
  require(mu.size == sigma.numCols, "Mean vector length must match covariance matrix size")

  private val breezeMu = mu.toBreeze.toDenseVector

  /**
   * private[mllib] constructor
   *
   * @param mu The mean vector of the distribution
   * @param sigma The covariance matrix of the distribution
   */
  private[mllib] def this(mu: DBV[Double], sigma: DBM[Double]) = {
    this(Vectors.fromBreeze(mu), Matrices.fromBreeze(sigma))
  }

  /**
   * Compute distribution dependent constants:
   *    rootSigmaInv = D^(-1/2)^ * U, where sigma = U * D * U.t
   *    u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
   */
  private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants

  /** Returns density of this multivariate Gaussian at given point, x */
  def pdf(x: Vector): Double = {
    pdf(x.toBreeze)
  }

  /** Returns the log-density of this multivariate Gaussian at given point, x */
  def logpdf(x: Vector): Double = {
    logpdf(x.toBreeze)
  }

  /** Returns density of this multivariate Gaussian at given point, x */
  private[mllib] def pdf(x: BV[Double]): Double = {
    math.exp(logpdf(x))
  }

  /** Returns the log-density of this multivariate Gaussian at given point, x */
  private[mllib] def logpdf(x: BV[Double]): Double = {
    val delta = x - breezeMu
    val v = rootSigmaInv * delta
    u + v.t * v * -0.5
  }

  /**
   * Calculate distribution dependent components used for the density function:
   *    pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu))
   * where k is length of the mean vector.
   *
   * We here compute distribution-fixed parts
   *    log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
   * and
   *    D^(-1/2)^ * U, where sigma = U * D * U.t
   *
   * Both the determinant and the inverse can be computed from the singular value decomposition
   * of sigma. Noting that covariance matrices are always symmetric and positive semi-definite,
   * we can use the eigendecomposition. We also do not compute the inverse directly; noting
   * that
   *
   *    sigma = U * D * U.t
   *    inv(Sigma) = U * inv(D) * U.t
   *               = (D^{-1/2}^ * U).t * (D^{-1/2}^ * U)
   *
   * and thus
   *
   *    -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2}^ * U * (x-mu))^2^
   *
   * To guard against singular covariance matrices, this method computes both the
   * pseudo-determinant and the pseudo-inverse (Moore-Penrose). Singular values are considered
   * to be non-zero only if they exceed a tolerance based on machine precision, matrix size, and
   * relation to the maximum singular value (same tolerance used by, e.g., Octave).
   */
  private def calculateCovarianceConstants: (DBM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(sigma.toBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = MLUtils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
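A self-contained check of pdf, consistent with this change's own test suite further below (a standard bivariate normal has density 1/(2π) ≈ 0.15915 at the origin); the public class path is assumed:

  import org.apache.spark.mllib.linalg.{Matrices, Vectors}
  import org.apache.spark.mllib.stat.distribution.MultivariateGaussian

  val g = new MultivariateGaussian(
    Vectors.dense(0.0, 0.0),
    Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
  g.pdf(Vectors.dense(0.0, 0.0)) // ≈ 0.15915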
@@ -270,7 +270,7 @@ object GradientBoostedTrees extends Logging {
    logInfo(s"$timer")

    if (persistedInput) input.unpersist()

    if (validate) {
      new GradientBoostedTreesModel(
        boostingStrategy.treeStrategy.algo,
@@ -474,7 +474,7 @@ object RandomForest extends Serializable with Logging {
      val (treeIndex, node) = nodeQueue.head
      // Choose subset of features for node (if subsampling).
      val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
        Some(SamplingUtils.reservoirSampleAndCount(Range(0,
          metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong)._1)
      } else {
        None
@@ -265,7 +265,7 @@ object MLUtils {
    }
    Vectors.fromBreeze(vector1)
  }

  /**
   * Returns the squared Euclidean distance between two vectors. The following formula will be used
   * if it does not introduce too much numerical error:
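The formula that comment refers to is the standard expansion

  \|a - b\|_2^2 = \|a\|_2^2 + \|b\|_2^2 - 2\, a^\top b

which avoids materializing a − b but loses precision through cancellation when the two vectors are nearly equal, hence the error guard.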
@@ -38,7 +38,7 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext
    val dataset = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))

    /**
     * Using the following R code to load the data, train the model and evaluate metrics.
     *
@@ -47,7 +47,7 @@ class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("Binarize continuous features with setter") {
    val threshold: Double = 0.2
    val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0)
    val dataFrame: DataFrame = sqlContext.createDataFrame(
      data.zip(thresholdBinarized)).toDF("feature", "expected")
@@ -46,7 +46,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
    }
  }

  test("two clusters") {
    val data = sc.parallelize(GaussianTestData.data)
@@ -62,7 +62,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext {
    val Ew = Array(1.0 / 3.0, 2.0 / 3.0)
    val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604))
    val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644)))

    val gmm = new GaussianMixture()
      .setK(2)
      .setInitialModel(initialGmm)
@@ -56,7 +56,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
      predictions(a.cluster) += a.id
    }
    assert(predictions.toSet == Set((0 to 3).toSet, (4 to 15).toSet))

    val model2 = new PowerIterationClustering()
      .setK(2)
      .setInitializationMode("degree")
@@ -139,7 +139,7 @@ class BLASSuite extends SparkFunSuite {
    syr(alpha, x, dA)

    assert(dA ~== expected absTol 1e-15)

    val dB =
      new DenseMatrix(3, 4, Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0))
@@ -148,7 +148,7 @@ class BLASSuite extends SparkFunSuite {
        syr(alpha, x, dB)
      }
    }

    val dC =
      new DenseMatrix(3, 3, Array(0.0, 1.2, 2.2, 1.2, 3.2, 5.3, 2.2, 5.3, 1.8))
@@ -157,7 +157,7 @@ class BLASSuite extends SparkFunSuite {
        syr(alpha, x, dC)
      }
    }

    val y = new DenseVector(Array(0.0, 2.7, 3.5, 2.1, 1.5))

    withClue("Size of vector must match the rank of matrix") {
@@ -255,13 +255,13 @@ class BLASSuite extends SparkFunSuite {
    val dA =
      new DenseMatrix(4, 3, Array(0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0))
    val sA = new SparseMatrix(4, 3, Array(0, 1, 3, 4), Array(1, 0, 2, 3), Array(1.0, 2.0, 1.0, 3.0))

    val dA2 =
      new DenseMatrix(4, 3, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0), true)
    val sA2 =
      new SparseMatrix(4, 3, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0),
        true)

    val dx = new DenseVector(Array(1.0, 2.0, 3.0))
    val sx = dx.toSparse
    val expected = new DenseVector(Array(4.0, 1.0, 2.0, 9.0))
@@ -270,7 +270,7 @@ class BLASSuite extends SparkFunSuite {
    assert(sA.multiply(dx) ~== expected absTol 1e-15)
    assert(dA.multiply(sx) ~== expected absTol 1e-15)
    assert(sA.multiply(sx) ~== expected absTol 1e-15)

    val y1 = new DenseVector(Array(1.0, 3.0, 1.0, 0.0))
    val y2 = y1.copy
    val y3 = y1.copy
@@ -287,7 +287,7 @@ class BLASSuite extends SparkFunSuite {
    val y14 = y1.copy
    val y15 = y1.copy
    val y16 = y1.copy

    val expected2 = new DenseVector(Array(6.0, 7.0, 4.0, 9.0))
    val expected3 = new DenseVector(Array(10.0, 8.0, 6.0, 18.0))
@@ -295,42 +295,42 @@ class BLASSuite extends SparkFunSuite {
    gemv(1.0, sA, dx, 2.0, y2)
    gemv(1.0, dA, sx, 2.0, y3)
    gemv(1.0, sA, sx, 2.0, y4)

    gemv(1.0, dA2, dx, 2.0, y5)
    gemv(1.0, sA2, dx, 2.0, y6)
    gemv(1.0, dA2, sx, 2.0, y7)
    gemv(1.0, sA2, sx, 2.0, y8)

    gemv(2.0, dA, dx, 2.0, y9)
    gemv(2.0, sA, dx, 2.0, y10)
    gemv(2.0, dA, sx, 2.0, y11)
    gemv(2.0, sA, sx, 2.0, y12)

    gemv(2.0, dA2, dx, 2.0, y13)
    gemv(2.0, sA2, dx, 2.0, y14)
    gemv(2.0, dA2, sx, 2.0, y15)
    gemv(2.0, sA2, sx, 2.0, y16)

    assert(y1 ~== expected2 absTol 1e-15)
    assert(y2 ~== expected2 absTol 1e-15)
    assert(y3 ~== expected2 absTol 1e-15)
    assert(y4 ~== expected2 absTol 1e-15)

    assert(y5 ~== expected2 absTol 1e-15)
    assert(y6 ~== expected2 absTol 1e-15)
    assert(y7 ~== expected2 absTol 1e-15)
    assert(y8 ~== expected2 absTol 1e-15)

    assert(y9 ~== expected3 absTol 1e-15)
    assert(y10 ~== expected3 absTol 1e-15)
    assert(y11 ~== expected3 absTol 1e-15)
    assert(y12 ~== expected3 absTol 1e-15)

    assert(y13 ~== expected3 absTol 1e-15)
    assert(y14 ~== expected3 absTol 1e-15)
    assert(y15 ~== expected3 absTol 1e-15)
    assert(y16 ~== expected3 absTol 1e-15)

    withClue("columns of A don't match the rows of B") {
      intercept[Exception] {
        gemv(1.0, dA.transpose, dx, 2.0, y1)
@@ -345,12 +345,12 @@ class BLASSuite extends SparkFunSuite {
        gemv(1.0, sA.transpose, sx, 2.0, y1)
      }
    }

    val dAT =
      new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0))
    val sAT =
      new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0))

    val dATT = dAT.transpose
    val sATT = sAT.transpose
@@ -214,13 +214,13 @@ class VectorsSuite extends SparkFunSuite {

      val squaredDist = breezeSquaredDistance(sparseVector1.toBreeze, sparseVector2.toBreeze)

      // SparseVector vs. SparseVector
      assert(Vectors.sqdist(sparseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
      // DenseVector vs. SparseVector
      assert(Vectors.sqdist(denseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
      // DenseVector vs. DenseVector
      assert(Vectors.sqdist(denseVector1, denseVector2) ~== squaredDist relTol 1E-8)
    }
  }

  test("foreachActive") {
@@ -53,13 +53,13 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
    // ensure logistic regression has normalization method set to LOGIT
    assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.LOGIT)
  }

  test("linear SVM PMML export") {
    val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)
    val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label)

    val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel)

    // assert that the PMML format is as expected
    assert(svmModelExport.isInstanceOf[PMMLModelExport])
    val pmml = svmModelExport.getPmml
@@ -80,5 +80,5 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite {
    // ensure linear SVM has normalization method set to NONE
    assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.NONE)
  }

}
@@ -45,5 +45,5 @@ class KMeansPMMLModelExportSuite extends SparkFunSuite {
    val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel]
    assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length)
  }

}
@@ -60,25 +60,25 @@ class PMMLModelExportFactorySuite extends SparkFunSuite {
  test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport "
    + "when passing a LogisticRegressionModel or SVMModel") {
    val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17)

    val logisticRegressionModel =
      new LogisticRegressionModel(linearInput(0).features, linearInput(0).label)
    val logisticRegressionModelExport =
      PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel)
    assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport])

    val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label)
    val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel)
    assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport])
  }

  test("PMMLModelExportFactory throw IllegalArgumentException "
    + "when passing a Multinomial Logistic Regression") {
    /** 3 classes, 2 features */
    val multiclassLogisticRegressionModel = new LogisticRegressionModel(
      weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0,
      numFeatures = 2, numClasses = 3)

    intercept[IllegalArgumentException] {
      PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel)
    }
@@ -26,39 +26,39 @@ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext
  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
@@ -62,7 +62,7 @@ class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
      val fastSquaredDist3 =
        fastSquaredDistance(v2, norm2, v3, norm3, precision)
      assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m")
      if (m > 10) {
        val v4 = Vectors.sparse(n, indices.slice(0, m - 10),
          indices.map(i => a(i) + 0.5).slice(0, m - 10))
        val norm4 = Vectors.norm(v4, 2.0)