[SPARK-19395][SPARKR] Convert coefficients in summary to matrix
## What changes were proposed in this pull request? The `coefficients` component in model summary should be 'matrix' but the underlying structure is indeed list. This affects several models except for 'AFTSurvivalRegressionModel' which has the correct implementation. The fix is to first `unlist` the coefficients returned from the `callJMethod` before converting to matrix. An example illustrates the issues: ``` data(iris) df <- createDataFrame(iris) model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") s <- summary(model) > str(s$coefficients) List of 8 $ : num 6.53 $ : num -0.223 $ : num 0.479 $ : num 0.155 $ : num 13.6 $ : num -1.44 $ : num 0 $ : num 0.152 - attr(*, "dim")= int [1:2] 2 4 - attr(*, "dimnames")=List of 2 ..$ : chr [1:2] "(Intercept)" "Sepal_Width" ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)" > s$coefficients[, 2] $`(Intercept)` [1] 0.4788963 $Sepal_Width [1] 0.1550809 ``` This shows that the underlying structure of coefficients is still `list`. felixcheung wangmiao1981 Author: actuaryzhang <actuaryzhang10@gmail.com> Closes #16730 from actuaryzhang/sparkRCoef.
This commit is contained in:
parent
26a4cba3ff
commit
ce112cec4f
|
@ -145,7 +145,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
|
|||
labels <- callJMethod(jobj, "labels")
|
||||
coefficients <- callJMethod(jobj, "rCoefficients")
|
||||
nCol <- length(coefficients) / length(features)
|
||||
coefficients <- matrix(coefficients, ncol = nCol)
|
||||
coefficients <- matrix(unlist(coefficients), ncol = nCol)
|
||||
# If nCol == 1, means this is a binomial logistic regression model with pivoting.
|
||||
# Otherwise, it's a multinomial logistic regression model without pivoting.
|
||||
if (nCol == 1) {
|
||||
|
|
|
@ -390,7 +390,7 @@ setMethod("summary", signature(object = "KMeansModel"),
|
|||
coefficients <- callJMethod(jobj, "coefficients")
|
||||
k <- callJMethod(jobj, "k")
|
||||
size <- callJMethod(jobj, "size")
|
||||
coefficients <- t(matrix(coefficients, ncol = k))
|
||||
coefficients <- t(matrix(unlist(coefficients), ncol = k))
|
||||
colnames(coefficients) <- unlist(features)
|
||||
rownames(coefficients) <- 1:k
|
||||
cluster <- if (is.loaded) {
|
||||
|
|
|
@ -182,11 +182,11 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
|
|||
# coefficients, standard error of coefficients, t value and p value. Otherwise,
|
||||
# it will be fitted by local "l-bfgs", we can only provide coefficients.
|
||||
if (length(features) == length(coefficients)) {
|
||||
coefficients <- matrix(coefficients, ncol = 1)
|
||||
coefficients <- matrix(unlist(coefficients), ncol = 1)
|
||||
colnames(coefficients) <- c("Estimate")
|
||||
rownames(coefficients) <- unlist(features)
|
||||
} else {
|
||||
coefficients <- matrix(coefficients, ncol = 4)
|
||||
coefficients <- matrix(unlist(coefficients), ncol = 4)
|
||||
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
|
||||
rownames(coefficients) <- unlist(features)
|
||||
}
|
||||
|
|
|
@ -68,12 +68,17 @@ test_that("spark.logit", {
|
|||
df <- suppressWarnings(createDataFrame(iris))
|
||||
model <- spark.logit(df, Species ~ ., regParam = 0.5)
|
||||
summary <- summary(model)
|
||||
|
||||
# test summary coefficients return matrix type
|
||||
expect_true(class(summary$coefficients) == "matrix")
|
||||
expect_true(class(summary$coefficients[, 1]) == "numeric")
|
||||
|
||||
versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
|
||||
virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
|
||||
setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
|
||||
versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
|
||||
virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
|
||||
setosaCoefs <- unlist(summary$coefficients[, "setosa"])
|
||||
versicolorCoefs <- summary$coefficients[, "versicolor"]
|
||||
virginicaCoefs <- summary$coefficients[, "virginica"]
|
||||
setosaCoefs <- summary$coefficients[, "setosa"]
|
||||
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
|
||||
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
|
||||
expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
|
||||
|
@ -136,8 +141,8 @@ test_that("spark.logit", {
|
|||
summary <- summary(model)
|
||||
versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
|
||||
virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
|
||||
versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
|
||||
virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
|
||||
versicolorCoefs <- summary$coefficients[, "versicolor"]
|
||||
virginicaCoefs <- summary$coefficients[, "virginica"]
|
||||
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
|
||||
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
|
||||
|
||||
|
@ -145,7 +150,7 @@ test_that("spark.logit", {
|
|||
model <- spark.logit(training, Species ~ ., regParam = 0.5)
|
||||
summary <- summary(model)
|
||||
coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
|
||||
coefs <- unlist(summary$coefficients[, "Estimate"])
|
||||
coefs <- summary$coefficients[, "Estimate"]
|
||||
expect_true(all(abs(coefsR - coefs) < 0.1))
|
||||
|
||||
# Test prediction with string label
|
||||
|
|
|
@ -166,6 +166,10 @@ test_that("spark.kmeans", {
|
|||
expect_equal(k, 2)
|
||||
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
|
||||
|
||||
# test summary coefficients return matrix type
|
||||
expect_true(class(summary.model$coefficients) == "matrix")
|
||||
expect_true(class(summary.model$coefficients[1, ]) == "numeric")
|
||||
|
||||
# Test model save/load
|
||||
modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
|
||||
write.ml(model, modelPath)
|
||||
|
|
|
@ -87,11 +87,14 @@ test_that("spark.glm summary", {
|
|||
# gaussian family
|
||||
training <- suppressWarnings(createDataFrame(iris))
|
||||
stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
|
||||
|
||||
rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
|
||||
|
||||
coefs <- unlist(stats$coefficients)
|
||||
rCoefs <- unlist(rStats$coefficients)
|
||||
# test summary coefficients return matrix type
|
||||
expect_true(class(stats$coefficients) == "matrix")
|
||||
expect_true(class(stats$coefficients[, 1]) == "numeric")
|
||||
|
||||
coefs <- stats$coefficients
|
||||
rCoefs <- rStats$coefficients
|
||||
expect_true(all(abs(rCoefs - coefs) < 1e-4))
|
||||
expect_true(all(
|
||||
rownames(stats$coefficients) ==
|
||||
|
@ -117,8 +120,8 @@ test_that("spark.glm summary", {
|
|||
rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
|
||||
family = binomial(link = "logit")))
|
||||
|
||||
coefs <- unlist(stats$coefficients)
|
||||
rCoefs <- unlist(rStats$coefficients)
|
||||
coefs <- stats$coefficients
|
||||
rCoefs <- rStats$coefficients
|
||||
expect_true(all(abs(rCoefs - coefs) < 1e-4))
|
||||
expect_true(all(
|
||||
rownames(stats$coefficients) ==
|
||||
|
@ -141,8 +144,8 @@ test_that("spark.glm summary", {
|
|||
stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
|
||||
rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
|
||||
|
||||
coefs <- unlist(stats$coefficients)
|
||||
rCoefs <- unlist(rStats$coefficients)
|
||||
coefs <- stats$coefficients
|
||||
rCoefs <- rStats$coefficients
|
||||
expect_true(all(abs(rCoefs - coefs) < 1e-3))
|
||||
expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
|
||||
expect_equal(stats$dispersion, rStats$dispersion)
|
||||
|
@ -169,7 +172,7 @@ test_that("spark.glm summary", {
|
|||
data <- as.data.frame(cbind(A, b))
|
||||
df <- createDataFrame(data)
|
||||
stats <- summary(spark.glm(df, b ~ . - 1))
|
||||
coefs <- unlist(stats$coefficients)
|
||||
coefs <- stats$coefficients
|
||||
expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
|
||||
})
|
||||
|
||||
|
@ -259,8 +262,8 @@ test_that("glm summary", {
|
|||
|
||||
rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
|
||||
|
||||
coefs <- unlist(stats$coefficients)
|
||||
rCoefs <- unlist(rStats$coefficients)
|
||||
coefs <- stats$coefficients
|
||||
rCoefs <- rStats$coefficients
|
||||
expect_true(all(abs(rCoefs - coefs) < 1e-4))
|
||||
expect_true(all(
|
||||
rownames(stats$coefficients) ==
|
||||
|
@ -282,8 +285,8 @@ test_that("glm summary", {
|
|||
rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
|
||||
family = binomial(link = "logit")))
|
||||
|
||||
coefs <- unlist(stats$coefficients)
|
||||
rCoefs <- unlist(rStats$coefficients)
|
||||
coefs <- stats$coefficients
|
||||
rCoefs <- rStats$coefficients
|
||||
expect_true(all(abs(rCoefs - coefs) < 1e-4))
|
||||
expect_true(all(
|
||||
rownames(stats$coefficients) ==
|
||||
|
|
Loading…
Reference in a new issue