[SPARK-19395][SPARKR] Convert coefficients in summary to matrix

## What changes were proposed in this pull request?
The `coefficients` component in a model summary should be a `matrix`, but its underlying structure is actually a `list`. This affects several models; the exception is `AFTSurvivalRegressionModel`, which already has the correct implementation. The fix is to `unlist` the coefficients returned from `callJMethod` before converting them to a matrix. The following example illustrates the issue:

```
data(iris)
df <- createDataFrame(iris)
model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
s <- summary(model)

> str(s$coefficients)
List of 8
 $ : num 6.53
 $ : num -0.223
 $ : num 0.479
 $ : num 0.155
 $ : num 13.6
 $ : num -1.44
 $ : num 0
 $ : num 0.152
 - attr(*, "dim")= int [1:2] 2 4
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:2] "(Intercept)" "Sepal_Width"
  ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
> s$coefficients[, 2]
$`(Intercept)`
[1] 0.4788963

$Sepal_Width
[1] 0.1550809
```

This shows that the underlying structure of `coefficients` is still a `list`.
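
For reference, here is a minimal base-R sketch of why the fix works, using made-up values (independent of SparkR): `matrix()` applied to a list keeps list storage and only attaches `dim`/`dimnames` attributes, whereas flattening with `unlist()` first yields a genuine numeric matrix.

```
# A plain list, standing in for what callJMethod returns (illustrative values).
coefList <- list(6.53, -0.223, 0.479, 0.155, 13.6, -1.44, 0, 0.152)

m1 <- matrix(coefList, ncol = 4)          # still a list underneath
is.list(m1)                               # TRUE
class(m1[, 2])                            # "list"

m2 <- matrix(unlist(coefList), ncol = 4)  # a proper numeric matrix
is.numeric(m2)                            # TRUE
class(m2[, 2])                            # "numeric"
```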

felixcheung wangmiao1981

Author: actuaryzhang <actuaryzhang10@gmail.com>

Closes #16730 from actuaryzhang/sparkRCoef.

@@ -145,7 +145,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
 labels <- callJMethod(jobj, "labels")
 coefficients <- callJMethod(jobj, "rCoefficients")
 nCol <- length(coefficients) / length(features)
-coefficients <- matrix(coefficients, ncol = nCol)
+coefficients <- matrix(unlist(coefficients), ncol = nCol)
 # If nCol == 1, means this is a binomial logistic regression model with pivoting.
 # Otherwise, it's a multinomial logistic regression model without pivoting.
 if (nCol == 1) {

@@ -390,7 +390,7 @@ setMethod("summary", signature(object = "KMeansModel"),
 coefficients <- callJMethod(jobj, "coefficients")
 k <- callJMethod(jobj, "k")
 size <- callJMethod(jobj, "size")
-coefficients <- t(matrix(coefficients, ncol = k))
+coefficients <- t(matrix(unlist(coefficients), ncol = k))
 colnames(coefficients) <- unlist(features)
 rownames(coefficients) <- 1:k
 cluster <- if (is.loaded) {

@@ -182,11 +182,11 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
 # coefficients, standard error of coefficients, t value and p value. Otherwise,
 # it will be fitted by local "l-bfgs", we can only provide coefficients.
 if (length(features) == length(coefficients)) {
-coefficients <- matrix(coefficients, ncol = 1)
+coefficients <- matrix(unlist(coefficients), ncol = 1)
 colnames(coefficients) <- c("Estimate")
 rownames(coefficients) <- unlist(features)
 } else {
-coefficients <- matrix(coefficients, ncol = 4)
+coefficients <- matrix(unlist(coefficients), ncol = 4)
 colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
 rownames(coefficients) <- unlist(features)
 }

@@ -68,12 +68,17 @@ test_that("spark.logit", {
 df <- suppressWarnings(createDataFrame(iris))
 model <- spark.logit(df, Species ~ ., regParam = 0.5)
 summary <- summary(model)
+# test summary coefficients return matrix type
+expect_true(class(summary$coefficients) == "matrix")
+expect_true(class(summary$coefficients[, 1]) == "numeric")
 versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
 virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
 setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
-versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
-virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
-setosaCoefs <- unlist(summary$coefficients[, "setosa"])
+versicolorCoefs <- summary$coefficients[, "versicolor"]
+virginicaCoefs <- summary$coefficients[, "virginica"]
+setosaCoefs <- summary$coefficients[, "setosa"]
 expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
 expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
 expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
@@ -136,8 +141,8 @@ test_that("spark.logit", {
 summary <- summary(model)
 versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
 virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
-versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
-virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
+versicolorCoefs <- summary$coefficients[, "versicolor"]
+virginicaCoefs <- summary$coefficients[, "virginica"]
 expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
 expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
@@ -145,7 +150,7 @@ test_that("spark.logit", {
 model <- spark.logit(training, Species ~ ., regParam = 0.5)
 summary <- summary(model)
 coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
-coefs <- unlist(summary$coefficients[, "Estimate"])
+coefs <- summary$coefficients[, "Estimate"]
 expect_true(all(abs(coefsR - coefs) < 0.1))
 # Test prediction with string label

@@ -166,6 +166,10 @@ test_that("spark.kmeans", {
 expect_equal(k, 2)
 expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
+# test summary coefficients return matrix type
+expect_true(class(summary.model$coefficients) == "matrix")
+expect_true(class(summary.model$coefficients[1, ]) == "numeric")
 # Test model save/load
 modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
 write.ml(model, modelPath)

@@ -87,11 +87,14 @@ test_that("spark.glm summary", {
 # gaussian family
 training <- suppressWarnings(createDataFrame(iris))
 stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
 rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-coefs <- unlist(stats$coefficients)
-rCoefs <- unlist(rStats$coefficients)
+# test summary coefficients return matrix type
+expect_true(class(stats$coefficients) == "matrix")
+expect_true(class(stats$coefficients[, 1]) == "numeric")
+coefs <- stats$coefficients
+rCoefs <- rStats$coefficients
 expect_true(all(abs(rCoefs - coefs) < 1e-4))
 expect_true(all(
 rownames(stats$coefficients) ==
@@ -117,8 +120,8 @@ test_that("spark.glm summary", {
 rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
 family = binomial(link = "logit")))
-coefs <- unlist(stats$coefficients)
-rCoefs <- unlist(rStats$coefficients)
+coefs <- stats$coefficients
+rCoefs <- rStats$coefficients
 expect_true(all(abs(rCoefs - coefs) < 1e-4))
 expect_true(all(
 rownames(stats$coefficients) ==
@@ -141,8 +144,8 @@ test_that("spark.glm summary", {
 stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
 rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
-coefs <- unlist(stats$coefficients)
-rCoefs <- unlist(rStats$coefficients)
+coefs <- stats$coefficients
+rCoefs <- rStats$coefficients
 expect_true(all(abs(rCoefs - coefs) < 1e-3))
 expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
 expect_equal(stats$dispersion, rStats$dispersion)
@@ -169,7 +172,7 @@ test_that("spark.glm summary", {
 data <- as.data.frame(cbind(A, b))
 df <- createDataFrame(data)
 stats <- summary(spark.glm(df, b ~ . - 1))
-coefs <- unlist(stats$coefficients)
+coefs <- stats$coefficients
 expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
 })
@@ -259,8 +262,8 @@ test_that("glm summary", {
 rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-coefs <- unlist(stats$coefficients)
-rCoefs <- unlist(rStats$coefficients)
+coefs <- stats$coefficients
+rCoefs <- rStats$coefficients
 expect_true(all(abs(rCoefs - coefs) < 1e-4))
 expect_true(all(
 rownames(stats$coefficients) ==
@@ -282,8 +285,8 @@ test_that("glm summary", {
 rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
 family = binomial(link = "logit")))
-coefs <- unlist(stats$coefficients)
-rCoefs <- unlist(rStats$coefficients)
+coefs <- stats$coefficients
+rCoefs <- rStats$coefficients
 expect_true(all(abs(rCoefs - coefs) < 1e-4))
 expect_true(all(
 rownames(stats$coefficients) ==