[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR
## What changes were proposed in this pull request? PR https://github.com/apache/spark/pull/17715 Added Constrained Logistic Regression for ML. We should add it to SparkR. ## How was this patch tested? Add new unit tests. Author: wangmiao1981 <wm624@hotmail.com> Closes #18128 from wangmiao1981/test.
This commit is contained in:
parent
215281d88e
commit
53543374ce
|
@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
|
|||
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
|
||||
#' or the number of partitions are large, this param could be adjusted to a larger size.
|
||||
#' This is an expert parameter. Default value should be good for most cases.
|
||||
#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
|
||||
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
|
||||
#' regression, or (number of classes, number of features) for multinomial regression.
|
||||
#' It is an R matrix.
|
||||
#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
|
||||
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
|
||||
#' regression, or (number of classes, number of features) for multinomial regression.
|
||||
#' It is an R matrix.
|
||||
#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
|
||||
#' The bounds vector size must be equal to 1 for binomial regression, or the number
|
||||
#' of classes for multinomial regression.
|
||||
#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
|
||||
#' The bound vector size must be equal to 1 for binomial regression, or the number
|
||||
#' of classes for multinomial regression.
|
||||
#' @param ... additional arguments passed to the method.
|
||||
#' @return \code{spark.logit} returns a fitted logistic regression model.
|
||||
#' @rdname spark.logit
|
||||
|
@ -241,8 +255,12 @@ function(object, path, overwrite = FALSE) {
|
|||
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
|
||||
function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
|
||||
tol = 1E-6, family = "auto", standardization = TRUE,
|
||||
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
|
||||
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
|
||||
lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
|
||||
lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
|
||||
formula <- paste(deparse(formula), collapse = "")
|
||||
row <- 0
|
||||
col <- 0
|
||||
|
||||
if (!is.null(weightCol) && weightCol == "") {
|
||||
weightCol <- NULL
|
||||
|
@ -250,12 +268,51 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
|
|||
weightCol <- as.character(weightCol)
|
||||
}
|
||||
|
||||
if (!is.null(lowerBoundsOnIntercepts)) {
|
||||
lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
|
||||
}
|
||||
|
||||
if (!is.null(upperBoundsOnIntercepts)) {
|
||||
upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
|
||||
}
|
||||
|
||||
if (!is.null(lowerBoundsOnCoefficients)) {
|
||||
if (class(lowerBoundsOnCoefficients) != "matrix") {
|
||||
stop("lowerBoundsOnCoefficients must be a matrix.")
|
||||
}
|
||||
row <- nrow(lowerBoundsOnCoefficients)
|
||||
col <- ncol(lowerBoundsOnCoefficients)
|
||||
lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
|
||||
}
|
||||
|
||||
if (!is.null(upperBoundsOnCoefficients)) {
|
||||
if (class(upperBoundsOnCoefficients) != "matrix") {
|
||||
stop("upperBoundsOnCoefficients must be a matrix.")
|
||||
}
|
||||
|
||||
if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
|
||||
|| col != ncol(upperBoundsOnCoefficients))) {
|
||||
stop(paste0("dimension of upperBoundsOnCoefficients ",
|
||||
"is not the same as lowerBoundsOnCoefficients", sep = ""))
|
||||
}
|
||||
|
||||
if (is.null(lowerBoundsOnCoefficients)) {
|
||||
row <- nrow(upperBoundsOnCoefficients)
|
||||
col <- ncol(upperBoundsOnCoefficients)
|
||||
}
|
||||
|
||||
upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
|
||||
}
|
||||
|
||||
jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
|
||||
data@sdf, formula, as.numeric(regParam),
|
||||
as.numeric(elasticNetParam), as.integer(maxIter),
|
||||
as.numeric(tol), as.character(family),
|
||||
as.logical(standardization), as.array(thresholds),
|
||||
weightCol, as.integer(aggregationDepth))
|
||||
weightCol, as.integer(aggregationDepth),
|
||||
as.integer(row), as.integer(col),
|
||||
lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
|
||||
lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
|
||||
new("LogisticRegressionModel", jobj = jobj)
|
||||
})
|
||||
|
||||
|
|
|
@ -223,6 +223,46 @@ test_that("spark.logit", {
|
|||
model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
|
||||
prediction2 <- collect(select(predict(model2, df2), "prediction"))
|
||||
expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
|
||||
|
||||
# Test binomial logistic regression against two classes with upperBoundsOnCoefficients
|
||||
# and upperBoundsOnIntercepts
|
||||
u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
|
||||
model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
|
||||
upperBoundsOnIntercepts = 1.0)
|
||||
summary <- summary(model)
|
||||
coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
|
||||
coefs <- summary$coefficients[, "Estimate"]
|
||||
expect_true(all(abs(coefsR - coefs) < 0.1))
|
||||
# Test upperBoundsOnCoefficients should be matrix
|
||||
expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
|
||||
upperBoundsOnIntercepts = 1.0))
|
||||
|
||||
# Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
|
||||
# and lowerBoundsOnIntercepts
|
||||
l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
|
||||
model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
|
||||
lowerBoundsOnIntercepts = 0.0)
|
||||
summary <- summary(model)
|
||||
coefsR <- c(0, 0, -1, 0, 1.902192)
|
||||
coefs <- summary$coefficients[, "Estimate"]
|
||||
expect_true(all(abs(coefsR - coefs) < 0.1))
|
||||
# Test lowerBoundsOnCoefficients should be matrix
|
||||
expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
|
||||
lowerBoundsOnIntercepts = 0.0))
|
||||
|
||||
# Test multinomial logistic regression with lowerBoundsOnCoefficients
|
||||
# and lowerBoundsOnIntercepts
|
||||
l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
|
||||
model <- spark.logit(training, Species ~ ., family = "multinomial",
|
||||
lowerBoundsOnCoefficients = l,
|
||||
lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
|
||||
summary <- summary(model)
|
||||
versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
|
||||
virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
|
||||
versicolorCoefs <- summary$coefficients[, "versicolor"]
|
||||
virginicaCoefs <- summary$coefficients[, "virginica"]
|
||||
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
|
||||
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
|
||||
})
|
||||
|
||||
test_that("spark.mlp", {
|
||||
|
|
|
@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
|
|||
|
||||
/**
|
||||
* The lower bounds on intercepts if fitting under bound constrained optimization.
|
||||
* The bounds vector size must be equal with 1 for binomial regression, or the number
|
||||
* The bounds vector size must be equal to 1 for binomial regression, or the number
|
||||
* of classes for multinomial regression. Otherwise, it throws exception.
|
||||
* Default is none.
|
||||
*
|
||||
|
@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
|
|||
|
||||
/**
|
||||
* The upper bounds on intercepts if fitting under bound constrained optimization.
|
||||
* The bound vector size must be equal with 1 for binomial regression, or the number
|
||||
* The bound vector size must be equal to 1 for binomial regression, or the number
|
||||
* of classes for multinomial regression. Otherwise, it throws exception.
|
||||
* Default is none.
|
||||
*
|
||||
|
@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
|
|||
}
|
||||
if (isSet(lowerBoundsOnIntercepts)) {
|
||||
require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
|
||||
"lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
|
||||
"lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
|
||||
s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
|
||||
}
|
||||
if (isSet(upperBoundsOnIntercepts)) {
|
||||
require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
|
||||
"upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
|
||||
"upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
|
||||
s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
|
||||
}
|
||||
if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {
|
||||
|
|
|
@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
|
|||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
|
||||
import org.apache.spark.ml.feature.{IndexToString, RFormula}
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
|
||||
import org.apache.spark.ml.r.RWrapperUtils._
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.sql.{DataFrame, Dataset}
|
||||
|
@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
|
|||
standardization: Boolean,
|
||||
thresholds: Array[Double],
|
||||
weightCol: String,
|
||||
aggregationDepth: Int
|
||||
aggregationDepth: Int,
|
||||
numRowsOfBoundsOnCoefficients: Int,
|
||||
numColsOfBoundsOnCoefficients: Int,
|
||||
lowerBoundsOnCoefficients: Array[Double],
|
||||
upperBoundsOnCoefficients: Array[Double],
|
||||
lowerBoundsOnIntercepts: Array[Double],
|
||||
upperBoundsOnIntercepts: Array[Double]
|
||||
): LogisticRegressionWrapper = {
|
||||
|
||||
val rFormula = new RFormula()
|
||||
|
@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper
|
|||
|
||||
if (weightCol != null) lr.setWeightCol(weightCol)
|
||||
|
||||
if (numRowsOfBoundsOnCoefficients != 0 &&
|
||||
numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
|
||||
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
|
||||
numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
|
||||
lr.setLowerBoundsOnCoefficients(coef)
|
||||
}
|
||||
|
||||
if (numRowsOfBoundsOnCoefficients != 0 &&
|
||||
numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
|
||||
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
|
||||
numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
|
||||
lr.setUpperBoundsOnCoefficients(coef)
|
||||
}
|
||||
|
||||
if (lowerBoundsOnIntercepts != null) {
|
||||
val intercept = Vectors.dense(lowerBoundsOnIntercepts)
|
||||
lr.setLowerBoundsOnIntercepts(intercept)
|
||||
}
|
||||
|
||||
if (upperBoundsOnIntercepts != null) {
|
||||
val intercept = Vectors.dense(upperBoundsOnIntercepts)
|
||||
lr.setUpperBoundsOnIntercepts(intercept)
|
||||
}
|
||||
|
||||
val idxToStr = new IndexToString()
|
||||
.setInputCol(PREDICTED_LABEL_INDEX_COL)
|
||||
.setOutputCol(PREDICTED_LABEL_COL)
|
||||
|
|
Loading…
Reference in a new issue