[SPARK-34860][ML] Multinomial Logistic Regression with intercept support centering

### What changes were proposed in this pull request? 1, use new `MultinomialLogisticBlockAggregator` which support virtual centering 2, remove no-used `BlockLogisticAggregator` ### Why are the changes needed? 1, for better convergence; 2, its solution is much close to GLMNET; ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? updated and new test suites Closes #31985 from zhengruifeng/mlr_center. Authored-by: Ruifeng Zheng <ruifengz@foxmail.com> Signed-off-by: Sean Owen <srowen@gmail.com>
2021-03-30 18:06:59 -05:00 · 2021-03-30 18:06:59 -05:00 · d372e6e094
parent c902f77b42
commit d372e6e094
3 changed files with 178 additions and 301 deletions
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@ -941,19 +941,21 @@ class LogisticRegression @Since("1.2.0") (
      optimizer: FirstOrderMinimizer[BDV[Double], DiffFunction[BDV[Double]]]) = {
    val multinomial = checkMultinomial(numClasses)

-    // for binary LR, we can center the input vector, if and only if:
+    // for LR, we can center the input vector, if and only if:
    // 1, fitIntercept is true;
    // 2, no penalty on the intercept, which is always true in existing impl;
    // 3, no bounds on the intercept.
-    val fitWithMean = !multinomial && $(fitIntercept) &&
-      (!isSet(lowerBoundsOnIntercepts) || $(lowerBoundsOnIntercepts)(0).isNegInfinity) &&
-      (!isSet(upperBoundsOnIntercepts) || $(upperBoundsOnIntercepts)(0).isPosInfinity)
+    val fitWithMean = $(fitIntercept) &&
+      (!isSet(lowerBoundsOnIntercepts) ||
+        $(lowerBoundsOnIntercepts).toArray.forall(_.isNegInfinity)) &&
+      (!isSet(upperBoundsOnIntercepts) ||
+        $(upperBoundsOnIntercepts).toArray.forall(_.isPosInfinity))

    val numFeatures = featuresStd.length
    val inverseStd = featuresStd.map(std => if (std != 0) 1.0 / std else 0.0)
    val scaledMean = Array.tabulate(numFeatures)(i => inverseStd(i) * featuresMean(i))
    val bcInverseStd = instances.context.broadcast(inverseStd)
-    var bcObjects = Seq(bcInverseStd)
+    val bcScaledMean = instances.context.broadcast(scaledMean)

    val scaled = instances.mapPartitions { iter =>
      val func = StandardScalerModel.getTransformFunc(Array.empty, bcInverseStd.value, false, true)
@ -966,25 +968,30 @@ class LogisticRegression @Since("1.2.0") (
      .setName(s"$uid: training blocks (blockSizeInMB=$actualBlockSizeInMB)")

    val costFun = if (multinomial) {
-      // TODO: create a separate MultinomialLogisticBlockAggregator for clearness
-      val getAggregatorFunc = new BlockLogisticAggregator(numFeatures, numClasses,
-         $(fitIntercept), true)(_)
+      val getAggregatorFunc = new MultinomialLogisticBlockAggregator(bcInverseStd, bcScaledMean,
+         $(fitIntercept), fitWithMean)(_)
      new RDDLossFunction(blocks, getAggregatorFunc, regularization, $(aggregationDepth))
    } else {
-      val bcScaledMean = instances.context.broadcast(scaledMean)
-      bcObjects +:= bcScaledMean
      val getAggregatorFunc = new BinaryLogisticBlockAggregator(bcInverseStd, bcScaledMean,
         $(fitIntercept), fitWithMean)(_)
      new RDDLossFunction(blocks, getAggregatorFunc, regularization, $(aggregationDepth))
    }

    if (fitWithMean) {
-      // orginal `initialCoefWithInterceptArray` is for problem:
-      // y = f(w1 * x1 / std_x1, w2 * x2 / std_x2, ..., intercept)
-      // we should adjust it to the initial solution for problem:
-      // y = f(w1 * (x1 - avg_x1) / std_x1, w2 * (x2 - avg_x2) / std_x2, ..., intercept)
-      val adapt = BLAS.getBLAS(numFeatures).ddot(numFeatures, initialSolution, 1, scaledMean, 1)
-      initialSolution(numFeatures) += adapt
+      if (multinomial) {
+        val adapt = Array.ofDim[Double](numClasses)
+        BLAS.f2jBLAS.dgemv("N", numClasses, numFeatures, 1.0,
+          initialSolution, numClasses, scaledMean, 1, 0.0, adapt, 1)
+        BLAS.getBLAS(numFeatures).daxpy(numClasses, 1.0, adapt, 0, 1,
+          initialSolution, numClasses * numFeatures, 1)
+      } else {
+        // orginal `initialCoefWithInterceptArray` is for problem:
+        // y = f(w1 * x1 / std_x1, w2 * x2 / std_x2, ..., intercept)
+        // we should adjust it to the initial solution for problem:
+        // y = f(w1 * (x1 - avg_x1) / std_x1, w2 * (x2 - avg_x2) / std_x2, ..., intercept)
+        val adapt = BLAS.getBLAS(numFeatures).ddot(numFeatures, initialSolution, 1, scaledMean, 1)
+        initialSolution(numFeatures) += adapt
+      }
    }

    val states = optimizer.iterations(new CachedDiffFunction(costFun),
@ -1002,16 +1009,25 @@ class LogisticRegression @Since("1.2.0") (
      arrayBuilder += state.adjustedValue
    }
    blocks.unpersist()
-    bcObjects.foreach(_.destroy())
+    bcInverseStd.destroy()
+    bcScaledMean.destroy()

    val solution = if (state == null) null else state.x.toArray
    if (fitWithMean && solution != null) {
-      // the final solution is for problem:
-      // y = f(w1 * (x1 - avg_x1) / std_x1, w2 * (x2 - avg_x2) / std_x2, ..., intercept)
-      // we should adjust it back for original problem:
-      // y = f(w1 * x1 / std_x1, w2 * x2 / std_x2, ..., intercept)
-      val adapt = BLAS.getBLAS(numFeatures).ddot(numFeatures, solution, 1, scaledMean, 1)
-      solution(numFeatures) -= adapt
+      if (multinomial) {
+        val adapt = Array.ofDim[Double](numClasses)
+        BLAS.f2jBLAS.dgemv("N", numClasses, numFeatures, 1.0,
+          solution, numClasses, scaledMean, 1, 0.0, adapt, 1)
+        BLAS.getBLAS(numFeatures).daxpy(numClasses, -1.0, adapt, 0, 1,
+          solution, numClasses * numFeatures, 1)
+      } else {
+        // the final solution is for problem:
+        // y = f(w1 * (x1 - avg_x1) / std_x1, w2 * (x2 - avg_x2) / std_x2, ..., intercept)
+        // we should adjust it back for original problem:
+        // y = f(w1 * x1 / std_x1, w2 * x2 / std_x2, ..., intercept)
+        val adapt = BLAS.getBLAS(numFeatures).ddot(numFeatures, solution, 1, scaledMean, 1)
+        solution(numFeatures) -= adapt
+      }
    }
    (solution, arrayBuilder.result)
  }
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala
@ -1,264 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.ml.optim.aggregator
-
-import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.internal.Logging
-import org.apache.spark.ml.feature.InstanceBlock
-import org.apache.spark.ml.impl.Utils
-import org.apache.spark.ml.linalg._
-
-/**
- * BlockLogisticAggregator computes the gradient and loss used in Logistic classification
- * for blocks in sparse or dense matrix in an online fashion.
- *
- * Two BlockLogisticAggregators can be merged together to have a summary of loss and gradient of
- * the corresponding joint dataset.
- *
- * NOTE: The feature values are expected to be standardized before computation.
- *
- * @param bcCoefficients The coefficients corresponding to the features.
- * @param fitIntercept Whether to fit an intercept term.
- */
-private[ml] class BlockLogisticAggregator(
-    numFeatures: Int,
-    numClasses: Int,
-    fitIntercept: Boolean,
-    multinomial: Boolean)(bcCoefficients: Broadcast[Vector])
-  extends DifferentiableLossAggregator[InstanceBlock, BlockLogisticAggregator] with Logging {
-
-  if (multinomial && numClasses <= 2) {
-    logInfo(s"Multinomial logistic regression for binary classification yields separate " +
-      s"coefficients for positive and negative classes. When no regularization is applied, the" +
-      s"result will be effectively the same as binary logistic regression. When regularization" +
-      s"is applied, multinomial loss will produce a result different from binary loss.")
-  }
-
-  private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures
-  private val coefficientSize = bcCoefficients.value.size
-  protected override val dim: Int = coefficientSize
-
-  if (multinomial) {
-    require(numClasses ==  coefficientSize / numFeaturesPlusIntercept, s"The number of " +
-      s"coefficients should be ${numClasses * numFeaturesPlusIntercept} but was $coefficientSize")
-  } else {
-    require(coefficientSize == numFeaturesPlusIntercept, s"Expected $numFeaturesPlusIntercept " +
-      s"coefficients but got $coefficientSize")
-    require(numClasses == 1 || numClasses == 2, s"Binary logistic aggregator requires numClasses " +
-      s"in {1, 2} but found $numClasses.")
-  }
-
-  @transient private lazy val coefficientsArray = bcCoefficients.value match {
-    case DenseVector(values) => values
-    case _ => throw new IllegalArgumentException(s"coefficients only supports dense vector but " +
-      s"got type ${bcCoefficients.value.getClass}.)")
-  }
-
-  @transient private lazy val binaryLinear = (multinomial, fitIntercept) match {
-    case (false, true) => Vectors.dense(coefficientsArray.take(numFeatures))
-    case (false, false) => Vectors.dense(coefficientsArray)
-    case _ => null
-  }
-
-  @transient private lazy val multinomialLinear = (multinomial, fitIntercept) match {
-    case (true, true) =>
-      Matrices.dense(numClasses, numFeatures,
-        coefficientsArray.take(numClasses * numFeatures)).toDense
-    case (true, false) =>
-      Matrices.dense(numClasses, numFeatures, coefficientsArray).toDense
-    case _ => null
-  }
-
-  /**
-   * Add a new training instance block to this BlockLogisticAggregator, and update the loss and
-   * gradient of the objective function.
-   *
-   * @param block The instance block of data point to be added.
-   * @return This BlockLogisticAggregator object.
-   */
-  def add(block: InstanceBlock): this.type = {
-    require(block.matrix.isTransposed)
-    require(numFeatures == block.numFeatures, s"Dimensions mismatch when adding new " +
-      s"instance. Expecting $numFeatures but got ${block.numFeatures}.")
-    require(block.weightIter.forall(_ >= 0),
-      s"instance weights ${block.weightIter.mkString("[", ",", "]")} has to be >= 0.0")
-
-    if (block.weightIter.forall(_ == 0)) return this
-
-    if (multinomial) {
-      multinomialUpdateInPlace(block)
-    } else {
-      binaryUpdateInPlace(block)
-    }
-
-    this
-  }
-
-  /** Update gradient and loss using binary loss function. */
-  private def binaryUpdateInPlace(block: InstanceBlock): Unit = {
-    val size = block.size
-
-    // vec here represents margins or negative dotProducts
-    val vec = if (fitIntercept) {
-      Vectors.dense(Array.fill(size)(coefficientsArray.last)).toDense
-    } else {
-      Vectors.zeros(size).toDense
-    }
-    BLAS.gemv(-1.0, block.matrix, binaryLinear, -1.0, vec)
-
-    // in-place convert margins to multiplier
-    // then, vec represents multiplier
-    var localLossSum = 0.0
-    var i = 0
-    while (i < size) {
-      val weight = block.getWeight(i)
-      if (weight > 0) {
-        val label = block.getLabel(i)
-        val margin = vec(i)
-        if (label > 0) {
-          // The following is equivalent to log(1 + exp(margin)) but more numerically stable.
-          localLossSum += weight * Utils.log1pExp(margin)
-        } else {
-          localLossSum += weight * (Utils.log1pExp(margin) - margin)
-        }
-        val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label)
-        vec.values(i) = multiplier
-      } else { vec.values(i) = 0.0 }
-      i += 1
-    }
-    lossSum += localLossSum
-    weightSum += block.weightIter.sum
-
-    // predictions are all correct, no gradient signal
-    if (vec.values.forall(_ == 0)) return
-
-    block.matrix match {
-      case dm: DenseMatrix =>
-        BLAS.nativeBLAS.dgemv("N", dm.numCols, dm.numRows, 1.0, dm.values, dm.numCols,
-          vec.values, 1, 1.0, gradientSumArray, 1)
-
-      case sm: SparseMatrix if fitIntercept =>
-        val linearGradSumVec = Vectors.zeros(numFeatures).toDense
-        BLAS.gemv(1.0, sm.transpose, vec, 0.0, linearGradSumVec)
-        BLAS.getBLAS(numFeatures).daxpy(numFeatures, 1.0, linearGradSumVec.values, 1,
-          gradientSumArray, 1)
-
-      case sm: SparseMatrix if !fitIntercept =>
-        val gradSumVec = new DenseVector(gradientSumArray)
-        BLAS.gemv(1.0, sm.transpose, vec, 1.0, gradSumVec)
-
-      case m =>
-        throw new IllegalArgumentException(s"Unknown matrix type ${m.getClass}.")
-    }
-
-    if (fitIntercept) gradientSumArray(numFeatures) += vec.values.sum
-  }
-
-  /** Update gradient and loss using multinomial (softmax) loss function. */
-  private def multinomialUpdateInPlace(block: InstanceBlock): Unit = {
-    val size = block.size
-
-    // mat here represents margins, shape: S X C
-    val mat = DenseMatrix.zeros(size, numClasses)
-    if (fitIntercept) {
-      val localCoefficientsArray = coefficientsArray
-      val offset = numClasses * numFeatures
-      var j = 0
-      while (j < numClasses) {
-        val intercept = localCoefficientsArray(offset + j)
-        var i = 0
-        while (i < size) { mat.update(i, j, intercept); i += 1 }
-        j += 1
-      }
-    }
-    BLAS.gemm(1.0, block.matrix, multinomialLinear.transpose, 1.0, mat)
-
-    // in-place convert margins to multipliers
-    // then, mat represents multipliers
-    var localLossSum = 0.0
-    var i = 0
-    val tmp = Array.ofDim[Double](numClasses)
-    val interceptGradSumArr = if (fitIntercept) Array.ofDim[Double](numClasses) else null
-    while (i < size) {
-      val weight = block.getWeight(i)
-      if (weight > 0) {
-        val label = block.getLabel(i)
-
-        var maxMargin = Double.NegativeInfinity
-        var j = 0
-        while (j < numClasses) {
-          tmp(j) = mat(i, j)
-          maxMargin = math.max(maxMargin, tmp(j))
-          j += 1
-        }
-
-        // marginOfLabel is margins(label) in the formula
-        val marginOfLabel = tmp(label.toInt)
-
-        var sum = 0.0
-        j = 0
-        while (j < numClasses) {
-          if (maxMargin > 0) tmp(j) -= maxMargin
-          val exp = math.exp(tmp(j))
-          sum += exp
-          tmp(j) = exp
-          j += 1
-        }
-
-        j = 0
-        while (j < numClasses) {
-          val multiplier = weight * (tmp(j) / sum - (if (label == j) 1.0 else 0.0))
-          mat.update(i, j, multiplier)
-          if (fitIntercept) interceptGradSumArr(j) += multiplier
-          j += 1
-        }
-
-        if (maxMargin > 0) {
-          localLossSum += weight * (math.log(sum) - marginOfLabel + maxMargin)
-        } else {
-          localLossSum += weight * (math.log(sum) - marginOfLabel)
-        }
-      } else {
-        var j = 0; while (j < numClasses) { mat.update(i, j, 0.0); j += 1 }
-      }
-      i += 1
-    }
-    lossSum += localLossSum
-    weightSum += block.weightIter.sum
-
-    // mat (multipliers):             S X C, dense                                N
-    // mat.transpose (multipliers):   C X S, dense                                T
-    // block.matrix:                  S X F, unknown type                         T
-    // gradSumMat(gradientSumArray):  C X FPI (numFeaturesPlusIntercept), dense   N
-    block.matrix match {
-      case dm: DenseMatrix =>
-        BLAS.nativeBLAS.dgemm("T", "T", numClasses, numFeatures, size, 1.0,
-          mat.values, size, dm.values, numFeatures, 1.0, gradientSumArray, numClasses)
-
-      case sm: SparseMatrix =>
-        // linearGradSumMat = matrix.T X mat
-        val linearGradSumMat = DenseMatrix.zeros(numFeatures, numClasses)
-        BLAS.gemm(1.0, sm.transpose, mat, 0.0, linearGradSumMat)
-        linearGradSumMat.foreachActive { (i, j, v) => gradientSumArray(i * numClasses + j) += v }
-    }
-
-    if (fitIntercept) {
-      BLAS.getBLAS(numClasses).daxpy(numClasses, 1.0, interceptGradSumArr, 0, 1,
-        gradientSumArray, numClasses * numFeatures, 1)
-    }
-  }
-}
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@ -46,6 +46,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
  @transient var binaryDataset: DataFrame = _
  @transient var binaryDatasetWithSmallVar: DataFrame = _
  @transient var multinomialDataset: DataFrame = _
+  @transient var multinomialDatasetWithSmallVar: DataFrame = _
  @transient var multinomialDatasetWithZeroVar: DataFrame = _
  private val eps: Double = 1e-5

@ -118,6 +119,23 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
      df
    }

+    multinomialDatasetWithSmallVar = {
+      val nPoints = 50000
+      val coefficients = Array(
+        -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
+        -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)
+
+      val xMean = Array(5.843, 3.057, 3.758, 10.199)
+      val xVariance = Array(0.6856, 0.1899, 3.116, 0.001)
+
+      val testData = generateMultinomialLogisticInput(
+        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)
+
+      val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed))
+      df.cache()
+      df
+    }
+
    multinomialDatasetWithZeroVar = {
      val nPoints = 100
      val coefficients = Array(
@ -141,18 +159,21 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
   * so we can validate the training accuracy compared with R's glmnet package.
   */
  ignore("export test data into CSV format") {
-    binaryDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
-      label + "," + weight + "," + features.toArray.mkString(",")
+    binaryDataset.rdd.map { case Row(l: Double, f: Vector, w: Double) =>
+      l + "," + w + "," + f.toArray.mkString(",")
    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
-    binaryDatasetWithSmallVar.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
-      label + "," + weight + "," + features.toArray.mkString(",")
+    binaryDatasetWithSmallVar.rdd.map { case Row(l: Double, f: Vector, w: Double) =>
+      l + "," + w + "," + f.toArray.mkString(",")
    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDatasetWithSmallVar")
-    multinomialDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
-      label + "," + weight + "," + features.toArray.mkString(",")
+    multinomialDataset.rdd.map { case Row(l: Double, f: Vector, w: Double) =>
+      l + "," + w + "," + f.toArray.mkString(",")
    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset")
-    multinomialDatasetWithZeroVar.rdd.map {
-      case Row(label: Double, features: Vector, weight: Double) =>
-        label + "," + weight + "," + features.toArray.mkString(",")
+    multinomialDatasetWithSmallVar.rdd.map { case Row(l: Double, f: Vector, w: Double) =>
+      l + "," + w + "," + f.toArray.mkString(",")
+    }.repartition(1)
+     .saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDatasetWithSmallVar")
+    multinomialDatasetWithZeroVar.rdd.map { case Row(l: Double, f: Vector, w: Double) =>
+        l + "," + w + "," + f.toArray.mkString(",")
    }.repartition(1)
     .saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDatasetWithZeroVar")
  }
@ -1863,21 +1884,125 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
      0.0, 0.0, 0.0, 0.09064661,
      -0.1144333, 0.3204703, -0.1621061, -0.2308192,
      0.0, -0.4832131, 0.0, 0.0), isTransposed = true)
-    val interceptsRStd = Vectors.dense(-0.72638218, -0.01737265, 0.74375484)
+    val interceptsRStd = Vectors.dense(-0.69265374, -0.2260274, 0.9186811)
    val coefficientsR = new DenseMatrix(3, 4, Array(
      0.0, 0.0, 0.01641412, 0.03570376,
      -0.05110822, 0.0, -0.21595670, -0.16162836,
      0.0, 0.0, 0.0, 0.0), isTransposed = true)
    val interceptsR = Vectors.dense(-0.44707756, 0.75180900, -0.3047314)

-    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05)
-    assert(model1.interceptVector ~== interceptsRStd relTol 0.1)
+    assert(model1.coefficientMatrix ~== coefficientsRStd absTol 1e-3)
+    assert(model1.interceptVector ~== interceptsRStd relTol 1e-3)
    assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
-    assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02)
-    assert(model2.interceptVector ~== interceptsR relTol 0.1)
+    assert(model2.coefficientMatrix ~== coefficientsR absTol 1e-3)
+    assert(model2.interceptVector ~== interceptsR relTol 1e-3)
    assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
  }

+  test("SPARK-34860: multinomial logistic regression with intercept, with small var") {
+    val trainer1 = new LogisticRegression().setFitIntercept(true).setStandardization(true)
+      .setWeightCol("weight")
+    val trainer2 = new LogisticRegression().setFitIntercept(true).setStandardization(false)
+      .setWeightCol("weight")
+    val trainer3 = new LogisticRegression().setFitIntercept(true).setStandardization(true)
+      .setElasticNetParam(0.0001).setRegParam(0.5).setWeightCol("weight")
+
+    val model1 = trainer1.fit(multinomialDatasetWithSmallVar)
+    val model2 = trainer2.fit(multinomialDatasetWithSmallVar)
+    val model3 = trainer3.fit(multinomialDatasetWithSmallVar)
+
+    /*
+      Use the following R code to load the data and train the model using glmnet package.
+      library("glmnet")
+      data <- read.csv("path", header=FALSE)
+      label = factor(data$V1)
+      w = data$V2
+      features = as.matrix(data.frame(data$V3, data$V4, data$V5, data$V6))
+      coefficients = coef(glmnet(features, label, weights=w, family="multinomial", alpha = 0,
+      lambda = 0))
+      coefficients
+      $`0`
+      5 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+               2.91748298
+      data.V3  0.21755977
+      data.V4  0.01647541
+      data.V5  0.16507778
+      data.V6 -0.14016680
+
+      $`1`
+      5 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+              -17.5107460
+      data.V3  -0.2443600
+      data.V4   0.7564655
+      data.V5  -0.2955698
+      data.V6   1.3262009
+
+      $`2`
+      5 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+              14.59326301
+      data.V3  0.02680026
+      data.V4 -0.77294095
+      data.V5  0.13049206
+      data.V6 -1.18603411
+
+
+
+      coefficientsStd = coef(glmnet(features, label, weights=w, family="multinomial",
+      alpha = 0.0001, lambda = 0.5, standardize=T))
+      coefficientsStd
+      $`0`
+      5 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+              1.751626027
+      data.V3 0.019970169
+      data.V4 0.079611293
+      data.V5 0.003959452
+      data.V6 0.110024399
+
+      $`1`
+      5 x 1 sparse Matrix of class "dgCMatrix"
+                         s0
+              -3.9297124987
+      data.V3 -0.0004788494
+      data.V4  0.0010097453
+      data.V5 -0.0005832701
+      data.V6  .
+
+      $`2`
+      5 x 1 sparse Matrix of class "dgCMatrix"
+                        s0
+               2.178086472
+      data.V3 -0.019369990
+      data.V4 -0.080851149
+      data.V5 -0.003319687
+      data.V6 -0.112435972
+     */
+    val interceptsR = Vectors.dense(2.91748298, -17.5107460, 14.59326301)
+    val coefficientsR = new DenseMatrix(3, 4, Array(
+      0.21755977, 0.01647541, 0.16507778, -0.14016680,
+      -0.2443600, 0.7564655, -0.2955698, 1.3262009,
+      0.02680026, -0.77294095, 0.13049206, -1.18603411), isTransposed = true)
+
+    assert(model1.interceptVector ~== interceptsR relTol 1e-2)
+    assert(model1.coefficientMatrix ~= coefficientsR relTol 1e-1)
+
+    // Without regularization, with or without standardization will converge to the same solution.
+    assert(model2.interceptVector ~== interceptsR relTol 1e-2)
+    assert(model2.coefficientMatrix ~= coefficientsR relTol 1e-1)
+
+    val interceptsR2 = Vectors.dense(1.751626027, -3.9297124987, 2.178086472)
+    val coefficientsR2 = new DenseMatrix(3, 4, Array(
+      0.019970169, 0.079611293, 0.003959452, 0.110024399,
+      -0.0004788494, 0.0010097453, -0.0005832701, 0.0,
+      -0.019369990, -0.080851149, -0.003319687, -0.112435972), isTransposed = true)
+
+    assert(model3.interceptVector ~== interceptsR2 relTol 1e-3)
+    assert(model3.coefficientMatrix ~= coefficientsR2 relTol 1e-2)
+  }
+
  test("multinomial logistic regression without intercept with L1 regularization") {
    val trainer1 = (new LogisticRegression).setFitIntercept(false)
      .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true).setWeightCol("weight")