spark-instrumented-optimizer/R/pkg/inst/tests/testthat/test_mllib_clustering.R
actuaryzhang ce112cec4f [SPARK-19395][SPARKR] Convert coefficients in summary to matrix
## What changes were proposed in this pull request?
The `coefficients` component in model summary should be 'matrix' but the underlying structure is indeed list. This affects several models except for 'AFTSurvivalRegressionModel' which has the correct implementation. The fix is to first `unlist` the coefficients returned from the `callJMethod` before converting to matrix. An example illustrates the issues:

```
data(iris)
df <- createDataFrame(iris)
model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
s <- summary(model)

> str(s$coefficients)
List of 8
 $ : num 6.53
 $ : num -0.223
 $ : num 0.479
 $ : num 0.155
 $ : num 13.6
 $ : num -1.44
 $ : num 0
 $ : num 0.152
 - attr(*, "dim")= int [1:2] 2 4
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:2] "(Intercept)" "Sepal_Width"
  ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
> s$coefficients[, 2]
$`(Intercept)`
[1] 0.4788963

$Sepal_Width
[1] 0.1550809
```

This  shows that the underlying structure of coefficients is still `list`.

felixcheung wangmiao1981

Author: actuaryzhang <actuaryzhang10@gmail.com>

Closes #16730 from actuaryzhang/sparkRCoef.
2017-01-31 12:20:43 -08:00

308 lines
11 KiB
R

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
library(testthat)
context("MLlib clustering algorithms")
# Tests for MLlib clustering algorithms in SparkR
sparkSession <- sparkR.session(enableHiveSupport = FALSE)
absoluteSparkPath <- function(x) {
sparkHome <- sparkR.conf("spark.home")
file.path(sparkHome, x)
}
test_that("spark.bisectingKmeans", {
newIris <- iris
newIris$Species <- NULL
training <- suppressWarnings(createDataFrame(newIris))
take(training, 1)
model <- spark.bisectingKmeans(data = training, ~ .)
sample <- take(select(predict(model, training), "prediction"), 1)
expect_equal(typeof(sample$prediction), "integer")
expect_equal(sample$prediction, 1)
# Test fitted works on Bisecting KMeans
fitted.model <- fitted(model)
expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction),
c(0, 1, 2, 3))
# Test summary works on KMeans
summary.model <- summary(model)
cluster <- summary.model$cluster
k <- summary.model$k
expect_equal(k, 4)
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction),
c(0, 1, 2, 3))
# Test model save/load
modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
expect_equal(summary.model$coefficients, summary2$coefficients)
expect_true(!summary.model$is.loaded)
expect_true(summary2$is.loaded)
unlink(modelPath)
})
test_that("spark.gaussianMixture", {
# R code to reproduce the result.
# nolint start
#' library(mvtnorm)
#' set.seed(1)
#' a <- rmvnorm(7, c(0, 0))
#' b <- rmvnorm(8, c(10, 10))
#' data <- rbind(a, b)
#' model <- mvnormalmixEM(data, k = 2)
#' model$lambda
#
# [1] 0.4666667 0.5333333
#
#' model$mu
#
# [1] 0.11731091 -0.06192351
# [1] 10.363673 9.897081
#
#' model$sigma
#
# [[1]]
# [,1] [,2]
# [1,] 0.62049934 0.06880802
# [2,] 0.06880802 1.27431874
#
# [[2]]
# [,1] [,2]
# [1,] 0.2961543 0.160783
# [2,] 0.1607830 1.008878
#
#' model$loglik
#
# [1] -46.89499
# nolint end
data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
list(9.5218499, 10.4179416))
df <- createDataFrame(data, c("x1", "x2"))
model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
stats <- summary(model)
rLambda <- c(0.4666667, 0.5333333)
rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
0.2961543, 0.160783, 0.1607830, 1.008878)
rLoglik <- -46.89499
expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
expect_equal(unlist(stats$loglik), rLoglik, tolerance = 1e-3)
p <- collect(select(predict(model, df), "prediction"))
expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
# Test model save/load
modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_equal(stats$lambda, stats2$lambda)
expect_equal(unlist(stats$mu), unlist(stats2$mu))
expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
unlink(modelPath)
})
test_that("spark.kmeans", {
newIris <- iris
newIris$Species <- NULL
training <- suppressWarnings(createDataFrame(newIris))
take(training, 1)
model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
sample <- take(select(predict(model, training), "prediction"), 1)
expect_equal(typeof(sample$prediction), "integer")
expect_equal(sample$prediction, 1)
# Test stats::kmeans is working
statsModel <- kmeans(x = newIris, centers = 2)
expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
# Test fitted works on KMeans
fitted.model <- fitted(model)
expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
# Test summary works on KMeans
summary.model <- summary(model)
cluster <- summary.model$cluster
k <- summary.model$k
expect_equal(k, 2)
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
# test summary coefficients return matrix type
expect_true(class(summary.model$coefficients) == "matrix")
expect_true(class(summary.model$coefficients[1, ]) == "numeric")
# Test model save/load
modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
expect_equal(summary.model$coefficients, summary2$coefficients)
expect_true(!summary.model$is.loaded)
expect_true(summary2$is.loaded)
unlink(modelPath)
# Test Kmeans on dataset that is sensitive to seed value
col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
cols <- as.data.frame(cbind(col1, col2, col3))
df <- createDataFrame(cols)
model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
initMode = "random", seed = 1, tol = 1E-5)
model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
initMode = "random", seed = 22222, tol = 1E-5)
fitted.model1 <- fitted(model1)
fitted.model2 <- fitted(model2)
# The predicted clusters are different
expect_equal(sort(collect(distinct(select(fitted.model1, "prediction")))$prediction),
c(0, 1, 2, 3))
expect_equal(sort(collect(distinct(select(fitted.model2, "prediction")))$prediction),
c(0, 1, 2))
})
test_that("spark.lda with libsvm", {
text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
model <- spark.lda(text, optimizer = "em")
stats <- summary(model, 10)
isDistributed <- stats$isDistributed
logLikelihood <- stats$logLikelihood
logPerplexity <- stats$logPerplexity
vocabSize <- stats$vocabSize
topics <- stats$topicTopTerms
weights <- stats$topicTopTermsWeights
vocabulary <- stats$vocabulary
trainingLogLikelihood <- stats$trainingLogLikelihood
logPrior <- stats$logPrior
expect_true(isDistributed)
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
expect_equal(vocabSize, 11)
expect_true(is.null(vocabulary))
expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
expect_true(logPrior <= 0 & !is.na(logPrior))
# Test model save/load
modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_true(stats2$isDistributed)
expect_equal(logLikelihood, stats2$logLikelihood)
expect_equal(logPerplexity, stats2$logPerplexity)
expect_equal(vocabSize, stats2$vocabSize)
expect_equal(vocabulary, stats2$vocabulary)
expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
expect_equal(logPrior, stats2$logPrior)
unlink(modelPath)
})
test_that("spark.lda with text input", {
text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
model <- spark.lda(text, optimizer = "online", features = "value")
stats <- summary(model)
isDistributed <- stats$isDistributed
logLikelihood <- stats$logLikelihood
logPerplexity <- stats$logPerplexity
vocabSize <- stats$vocabSize
topics <- stats$topicTopTerms
weights <- stats$topicTopTermsWeights
vocabulary <- stats$vocabulary
trainingLogLikelihood <- stats$trainingLogLikelihood
logPrior <- stats$logPrior
expect_false(isDistributed)
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
expect_equal(vocabSize, 10)
expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
expect_true(is.na(trainingLogLikelihood))
expect_true(is.na(logPrior))
# Test model save/load
modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_false(stats2$isDistributed)
expect_equal(logLikelihood, stats2$logLikelihood)
expect_equal(logPerplexity, stats2$logPerplexity)
expect_equal(vocabSize, stats2$vocabSize)
expect_true(all.equal(vocabulary, stats2$vocabulary))
expect_true(is.na(stats2$trainingLogLikelihood))
expect_true(is.na(stats2$logPrior))
unlink(modelPath)
})
test_that("spark.posterior and spark.perplexity", {
text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
model <- spark.lda(text, features = "value", k = 3)
# Assert perplexities are equal
stats <- summary(model)
logPerplexity <- spark.perplexity(model, text)
expect_equal(logPerplexity, stats$logPerplexity)
# Assert the sum of every topic distribution is equal to 1
posterior <- spark.posterior(model, text)
local.posterior <- collect(posterior)$topicDistribution
expect_equal(length(local.posterior), sum(unlist(local.posterior)))
})
sparkR.session.stop()