[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR
## What changes were proposed in this pull request? PR https://github.com/apache/spark/pull/17715 Added Constrained Logistic Regression for ML. We should add it to SparkR. ## How was this patch tested? Add new unit tests. Author: wangmiao1981 <wm624@hotmail.com> Closes #18128 from wangmiao1981/test.
This commit is contained in:
parent
215281d88e
commit
53543374ce
|
@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
|
|||
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
|
||||
#' or the number of partitions are large, this param could be adjusted to a larger size.
|
||||
#' This is an expert parameter. Default value should be good for most cases.
|
||||
#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
|
||||
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
|
||||
#' regression, or (number of classes, number of features) for multinomial regression.
|
||||
#' It is an R matrix.
|
||||
#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
|
||||
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
|
||||
#' regression, or (number of classes, number of features) for multinomial regression.
|
||||
#' It is an R matrix.
|
||||
#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
|
||||
#' The bounds vector size must be equal to 1 for binomial regression, or the number
|
||||
#' of classes for multinomial regression.
|
||||
#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
|
||||
#' The bound vector size must be equal to 1 for binomial regression, or the number
|
||||
#' of classes for multinomial regression.
|
||||
#' @param ... additional arguments passed to the method.
|
||||
#' @return \code{spark.logit} returns a fitted logistic regression model.
|
||||
#' @rdname spark.logit
|
||||
|
@ -241,8 +255,12 @@ function(object, path, overwrite = FALSE) {
|
|||
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
|
||||
function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
|
||||
tol = 1E-6, family = "auto", standardization = TRUE,
|
||||
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
|
||||
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
|
||||
lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
|
||||
lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
|
||||
formula <- paste(deparse(formula), collapse = "")
|
||||
row <- 0
|
||||
col <- 0
|
||||
|
||||
if (!is.null(weightCol) && weightCol == "") {
|
||||
weightCol <- NULL
|
||||
|
@ -250,12 +268,51 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
|
|||
weightCol <- as.character(weightCol)
|
||||
}
|
||||
|
||||
if (!is.null(lowerBoundsOnIntercepts)) {
|
||||
lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
|
||||
}
|
||||
|
||||
if (!is.null(upperBoundsOnIntercepts)) {
|
||||
upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
|
||||
}
|
||||
|
||||
if (!is.null(lowerBoundsOnCoefficients)) {
|
||||
if (class(lowerBoundsOnCoefficients) != "matrix") {
|
||||
stop("lowerBoundsOnCoefficients must be a matrix.")
|
||||
}
|
||||
row <- nrow(lowerBoundsOnCoefficients)
|
||||
col <- ncol(lowerBoundsOnCoefficients)
|
||||
lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
|
||||
}
|
||||
|
||||
if (!is.null(upperBoundsOnCoefficients)) {
|
||||
if (class(upperBoundsOnCoefficients) != "matrix") {
|
||||
stop("upperBoundsOnCoefficients must be a matrix.")
|
||||
}
|
||||
|
||||
if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
|
||||
|| col != ncol(upperBoundsOnCoefficients))) {
|
||||
stop(paste0("dimension of upperBoundsOnCoefficients ",
|
||||
"is not the same as lowerBoundsOnCoefficients", sep = ""))
|
||||
}
|
||||
|
||||
if (is.null(lowerBoundsOnCoefficients)) {
|
||||
row <- nrow(upperBoundsOnCoefficients)
|
||||
col <- ncol(upperBoundsOnCoefficients)
|
||||
}
|
||||
|
||||
upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
|
||||
}
|
||||
|
||||
jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
|
||||
data@sdf, formula, as.numeric(regParam),
|
||||
as.numeric(elasticNetParam), as.integer(maxIter),
|
||||
as.numeric(tol), as.character(family),
|
||||
as.logical(standardization), as.array(thresholds),
|
||||
weightCol, as.integer(aggregationDepth))
|
||||
weightCol, as.integer(aggregationDepth),
|
||||
as.integer(row), as.integer(col),
|
||||
lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
|
||||
lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
|
||||
new("LogisticRegressionModel", jobj = jobj)
|
||||
})
|
||||
|
||||
|
|
|
@ -223,6 +223,46 @@ test_that("spark.logit", {
|
|||
model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
|
||||
prediction2 <- collect(select(predict(model2, df2), "prediction"))
|
||||
expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
|
||||
|
||||
# Test binomial logistic regression against two classes with upperBoundsOnCoefficients
|
||||
# and upperBoundsOnIntercepts
|
||||
u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
|
||||
model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
|
||||
upperBoundsOnIntercepts = 1.0)
|
||||
summary <- summary(model)
|
||||
coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
|
||||
coefs <- summary$coefficients[, "Estimate"]
|
||||
expect_true(all(abs(coefsR - coefs) < 0.1))
|
||||
# Test upperBoundsOnCoefficients should be matrix
|
||||
expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
|
||||
upperBoundsOnIntercepts = 1.0))
|
||||
|
||||
# Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
|
||||
# and lowerBoundsOnIntercepts
|
||||
l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
|
||||
model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
|
||||
lowerBoundsOnIntercepts = 0.0)
|
||||
summary <- summary(model)
|
||||
coefsR <- c(0, 0, -1, 0, 1.902192)
|
||||
coefs <- summary$coefficients[, "Estimate"]
|
||||
expect_true(all(abs(coefsR - coefs) < 0.1))
|
||||
# Test lowerBoundsOnCoefficients should be matrix
|
||||
expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
|
||||
lowerBoundsOnIntercepts = 0.0))
|
||||
|
||||
# Test multinomial logistic regression with lowerBoundsOnCoefficients
|
||||
# and lowerBoundsOnIntercepts
|
||||
l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
|
||||
model <- spark.logit(training, Species ~ ., family = "multinomial",
|
||||
lowerBoundsOnCoefficients = l,
|
||||
lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
|
||||
summary <- summary(model)
|
||||
versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
|
||||
virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
|
||||
versicolorCoefs <- summary$coefficients[, "versicolor"]
|
||||
virginicaCoefs <- summary$coefficients[, "virginica"]
|
||||
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
|
||||
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
|
||||
})
|
||||
|
||||
test_that("spark.mlp", {
|
||||
|
|
|
@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
|
|||
|
||||
/**
|
||||
* The lower bounds on intercepts if fitting under bound constrained optimization.
|
||||
* The bounds vector size must be equal with 1 for binomial regression, or the number
|
||||
* The bounds vector size must be equal to 1 for binomial regression, or the number
|
||||
* of classes for multinomial regression. Otherwise, it throws exception.
|
||||
* Default is none.
|
||||
*
|
||||
|
@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
|
|||
|
||||
/**
|
||||
* The upper bounds on intercepts if fitting under bound constrained optimization.
|
||||
* The bound vector size must be equal with 1 for binomial regression, or the number
|
||||
* The bound vector size must be equal to 1 for binomial regression, or the number
|
||||
* of classes for multinomial regression. Otherwise, it throws exception.
|
||||
* Default is none.
|
||||
*
|
||||
|
@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
|
|||
}
|
||||
if (isSet(lowerBoundsOnIntercepts)) {
|
||||
require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
|
||||
"lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
|
||||
"lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
|
||||
s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
|
||||
}
|
||||
if (isSet(upperBoundsOnIntercepts)) {
|
||||
require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
|
||||
"upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
|
||||
"upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
|
||||
s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
|
||||
}
|
||||
if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {
|
||||
|
|
|
@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
|
|||
import org.apache.spark.ml.{Pipeline, PipelineModel}
|
||||
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
|
||||
import org.apache.spark.ml.feature.{IndexToString, RFormula}
|
||||
import org.apache.spark.ml.linalg.Vector
|
||||
import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
|
||||
import org.apache.spark.ml.r.RWrapperUtils._
|
||||
import org.apache.spark.ml.util._
|
||||
import org.apache.spark.sql.{DataFrame, Dataset}
|
||||
|
@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
|
|||
standardization: Boolean,
|
||||
thresholds: Array[Double],
|
||||
weightCol: String,
|
||||
aggregationDepth: Int
|
||||
aggregationDepth: Int,
|
||||
numRowsOfBoundsOnCoefficients: Int,
|
||||
numColsOfBoundsOnCoefficients: Int,
|
||||
lowerBoundsOnCoefficients: Array[Double],
|
||||
upperBoundsOnCoefficients: Array[Double],
|
||||
lowerBoundsOnIntercepts: Array[Double],
|
||||
upperBoundsOnIntercepts: Array[Double]
|
||||
): LogisticRegressionWrapper = {
|
||||
|
||||
val rFormula = new RFormula()
|
||||
|
@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper
|
|||
|
||||
if (weightCol != null) lr.setWeightCol(weightCol)
|
||||
|
||||
if (numRowsOfBoundsOnCoefficients != 0 &&
|
||||
numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
|
||||
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
|
||||
numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
|
||||
lr.setLowerBoundsOnCoefficients(coef)
|
||||
}
|
||||
|
||||
if (numRowsOfBoundsOnCoefficients != 0 &&
|
||||
numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
|
||||
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
|
||||
numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
|
||||
lr.setUpperBoundsOnCoefficients(coef)
|
||||
}
|
||||
|
||||
if (lowerBoundsOnIntercepts != null) {
|
||||
val intercept = Vectors.dense(lowerBoundsOnIntercepts)
|
||||
lr.setLowerBoundsOnIntercepts(intercept)
|
||||
}
|
||||
|
||||
if (upperBoundsOnIntercepts != null) {
|
||||
val intercept = Vectors.dense(upperBoundsOnIntercepts)
|
||||
lr.setUpperBoundsOnIntercepts(intercept)
|
||||
}
|
||||
|
||||
val idxToStr = new IndexToString()
|
||||
.setInputCol(PREDICTED_LABEL_INDEX_COL)
|
||||
.setOutputCol(PREDICTED_LABEL_COL)
|
||||
|
|
Loading…
Reference in a new issue