ce112cec4f
## What changes were proposed in this pull request? The `coefficients` component in model summary should be 'matrix' but the underlying structure is indeed list. This affects several models except for 'AFTSurvivalRegressionModel' which has the correct implementation. The fix is to first `unlist` the coefficients returned from the `callJMethod` before converting to matrix. An example illustrates the issues: ``` data(iris) df <- createDataFrame(iris) model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") s <- summary(model) > str(s$coefficients) List of 8 $ : num 6.53 $ : num -0.223 $ : num 0.479 $ : num 0.155 $ : num 13.6 $ : num -1.44 $ : num 0 $ : num 0.152 - attr(*, "dim")= int [1:2] 2 4 - attr(*, "dimnames")=List of 2 ..$ : chr [1:2] "(Intercept)" "Sepal_Width" ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)" > s$coefficients[, 2] $`(Intercept)` [1] 0.4788963 $Sepal_Width [1] 0.1550809 ``` This shows that the underlying structure of coefficients is still `list`. felixcheung wangmiao1981 Author: actuaryzhang <actuaryzhang10@gmail.com> Closes #16730 from actuaryzhang/sparkRCoef.
308 lines
11 KiB
R
308 lines
11 KiB
R
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
library(testthat)
|
|
|
|
context("MLlib clustering algorithms")
|
|
|
|
# Tests for MLlib clustering algorithms in SparkR
|
|
sparkSession <- sparkR.session(enableHiveSupport = FALSE)
|
|
|
|
absoluteSparkPath <- function(x) {
|
|
sparkHome <- sparkR.conf("spark.home")
|
|
file.path(sparkHome, x)
|
|
}
|
|
|
|
test_that("spark.bisectingKmeans", {
|
|
newIris <- iris
|
|
newIris$Species <- NULL
|
|
training <- suppressWarnings(createDataFrame(newIris))
|
|
|
|
take(training, 1)
|
|
|
|
model <- spark.bisectingKmeans(data = training, ~ .)
|
|
sample <- take(select(predict(model, training), "prediction"), 1)
|
|
expect_equal(typeof(sample$prediction), "integer")
|
|
expect_equal(sample$prediction, 1)
|
|
|
|
# Test fitted works on Bisecting KMeans
|
|
fitted.model <- fitted(model)
|
|
expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction),
|
|
c(0, 1, 2, 3))
|
|
|
|
# Test summary works on KMeans
|
|
summary.model <- summary(model)
|
|
cluster <- summary.model$cluster
|
|
k <- summary.model$k
|
|
expect_equal(k, 4)
|
|
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction),
|
|
c(0, 1, 2, 3))
|
|
|
|
# Test model save/load
|
|
modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
|
|
write.ml(model, modelPath)
|
|
expect_error(write.ml(model, modelPath))
|
|
write.ml(model, modelPath, overwrite = TRUE)
|
|
model2 <- read.ml(modelPath)
|
|
summary2 <- summary(model2)
|
|
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
|
|
expect_equal(summary.model$coefficients, summary2$coefficients)
|
|
expect_true(!summary.model$is.loaded)
|
|
expect_true(summary2$is.loaded)
|
|
|
|
unlink(modelPath)
|
|
})
|
|
|
|
test_that("spark.gaussianMixture", {
|
|
# R code to reproduce the result.
|
|
# nolint start
|
|
#' library(mvtnorm)
|
|
#' set.seed(1)
|
|
#' a <- rmvnorm(7, c(0, 0))
|
|
#' b <- rmvnorm(8, c(10, 10))
|
|
#' data <- rbind(a, b)
|
|
#' model <- mvnormalmixEM(data, k = 2)
|
|
#' model$lambda
|
|
#
|
|
# [1] 0.4666667 0.5333333
|
|
#
|
|
#' model$mu
|
|
#
|
|
# [1] 0.11731091 -0.06192351
|
|
# [1] 10.363673 9.897081
|
|
#
|
|
#' model$sigma
|
|
#
|
|
# [[1]]
|
|
# [,1] [,2]
|
|
# [1,] 0.62049934 0.06880802
|
|
# [2,] 0.06880802 1.27431874
|
|
#
|
|
# [[2]]
|
|
# [,1] [,2]
|
|
# [1,] 0.2961543 0.160783
|
|
# [2,] 0.1607830 1.008878
|
|
#
|
|
#' model$loglik
|
|
#
|
|
# [1] -46.89499
|
|
# nolint end
|
|
data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
|
|
list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
|
|
list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
|
|
list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
|
|
list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
|
|
list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
|
|
list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
|
|
list(9.5218499, 10.4179416))
|
|
df <- createDataFrame(data, c("x1", "x2"))
|
|
model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
|
|
stats <- summary(model)
|
|
rLambda <- c(0.4666667, 0.5333333)
|
|
rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
|
|
rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
|
|
0.2961543, 0.160783, 0.1607830, 1.008878)
|
|
rLoglik <- -46.89499
|
|
expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
|
|
expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
|
|
expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
|
|
expect_equal(unlist(stats$loglik), rLoglik, tolerance = 1e-3)
|
|
p <- collect(select(predict(model, df), "prediction"))
|
|
expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
|
|
|
|
# Test model save/load
|
|
modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
|
|
write.ml(model, modelPath)
|
|
expect_error(write.ml(model, modelPath))
|
|
write.ml(model, modelPath, overwrite = TRUE)
|
|
model2 <- read.ml(modelPath)
|
|
stats2 <- summary(model2)
|
|
expect_equal(stats$lambda, stats2$lambda)
|
|
expect_equal(unlist(stats$mu), unlist(stats2$mu))
|
|
expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
|
|
expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
|
|
|
|
unlink(modelPath)
|
|
})
|
|
|
|
test_that("spark.kmeans", {
|
|
newIris <- iris
|
|
newIris$Species <- NULL
|
|
training <- suppressWarnings(createDataFrame(newIris))
|
|
|
|
take(training, 1)
|
|
|
|
model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
|
|
sample <- take(select(predict(model, training), "prediction"), 1)
|
|
expect_equal(typeof(sample$prediction), "integer")
|
|
expect_equal(sample$prediction, 1)
|
|
|
|
# Test stats::kmeans is working
|
|
statsModel <- kmeans(x = newIris, centers = 2)
|
|
expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
|
|
|
|
# Test fitted works on KMeans
|
|
fitted.model <- fitted(model)
|
|
expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
|
|
|
|
# Test summary works on KMeans
|
|
summary.model <- summary(model)
|
|
cluster <- summary.model$cluster
|
|
k <- summary.model$k
|
|
expect_equal(k, 2)
|
|
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
|
|
|
|
# test summary coefficients return matrix type
|
|
expect_true(class(summary.model$coefficients) == "matrix")
|
|
expect_true(class(summary.model$coefficients[1, ]) == "numeric")
|
|
|
|
# Test model save/load
|
|
modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
|
|
write.ml(model, modelPath)
|
|
expect_error(write.ml(model, modelPath))
|
|
write.ml(model, modelPath, overwrite = TRUE)
|
|
model2 <- read.ml(modelPath)
|
|
summary2 <- summary(model2)
|
|
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
|
|
expect_equal(summary.model$coefficients, summary2$coefficients)
|
|
expect_true(!summary.model$is.loaded)
|
|
expect_true(summary2$is.loaded)
|
|
|
|
unlink(modelPath)
|
|
|
|
# Test Kmeans on dataset that is sensitive to seed value
|
|
col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
|
|
col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
|
|
col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
|
|
cols <- as.data.frame(cbind(col1, col2, col3))
|
|
df <- createDataFrame(cols)
|
|
|
|
model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
|
|
initMode = "random", seed = 1, tol = 1E-5)
|
|
model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
|
|
initMode = "random", seed = 22222, tol = 1E-5)
|
|
|
|
fitted.model1 <- fitted(model1)
|
|
fitted.model2 <- fitted(model2)
|
|
# The predicted clusters are different
|
|
expect_equal(sort(collect(distinct(select(fitted.model1, "prediction")))$prediction),
|
|
c(0, 1, 2, 3))
|
|
expect_equal(sort(collect(distinct(select(fitted.model2, "prediction")))$prediction),
|
|
c(0, 1, 2))
|
|
})
|
|
|
|
test_that("spark.lda with libsvm", {
|
|
text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
|
|
model <- spark.lda(text, optimizer = "em")
|
|
|
|
stats <- summary(model, 10)
|
|
isDistributed <- stats$isDistributed
|
|
logLikelihood <- stats$logLikelihood
|
|
logPerplexity <- stats$logPerplexity
|
|
vocabSize <- stats$vocabSize
|
|
topics <- stats$topicTopTerms
|
|
weights <- stats$topicTopTermsWeights
|
|
vocabulary <- stats$vocabulary
|
|
trainingLogLikelihood <- stats$trainingLogLikelihood
|
|
logPrior <- stats$logPrior
|
|
|
|
expect_true(isDistributed)
|
|
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
|
|
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
|
|
expect_equal(vocabSize, 11)
|
|
expect_true(is.null(vocabulary))
|
|
expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
|
|
expect_true(logPrior <= 0 & !is.na(logPrior))
|
|
|
|
# Test model save/load
|
|
modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
|
|
write.ml(model, modelPath)
|
|
expect_error(write.ml(model, modelPath))
|
|
write.ml(model, modelPath, overwrite = TRUE)
|
|
model2 <- read.ml(modelPath)
|
|
stats2 <- summary(model2)
|
|
|
|
expect_true(stats2$isDistributed)
|
|
expect_equal(logLikelihood, stats2$logLikelihood)
|
|
expect_equal(logPerplexity, stats2$logPerplexity)
|
|
expect_equal(vocabSize, stats2$vocabSize)
|
|
expect_equal(vocabulary, stats2$vocabulary)
|
|
expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
|
|
expect_equal(logPrior, stats2$logPrior)
|
|
|
|
unlink(modelPath)
|
|
})
|
|
|
|
test_that("spark.lda with text input", {
|
|
text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
|
|
model <- spark.lda(text, optimizer = "online", features = "value")
|
|
|
|
stats <- summary(model)
|
|
isDistributed <- stats$isDistributed
|
|
logLikelihood <- stats$logLikelihood
|
|
logPerplexity <- stats$logPerplexity
|
|
vocabSize <- stats$vocabSize
|
|
topics <- stats$topicTopTerms
|
|
weights <- stats$topicTopTermsWeights
|
|
vocabulary <- stats$vocabulary
|
|
trainingLogLikelihood <- stats$trainingLogLikelihood
|
|
logPrior <- stats$logPrior
|
|
|
|
expect_false(isDistributed)
|
|
expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
|
|
expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
|
|
expect_equal(vocabSize, 10)
|
|
expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
|
|
expect_true(is.na(trainingLogLikelihood))
|
|
expect_true(is.na(logPrior))
|
|
|
|
# Test model save/load
|
|
modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
|
|
write.ml(model, modelPath)
|
|
expect_error(write.ml(model, modelPath))
|
|
write.ml(model, modelPath, overwrite = TRUE)
|
|
model2 <- read.ml(modelPath)
|
|
stats2 <- summary(model2)
|
|
|
|
expect_false(stats2$isDistributed)
|
|
expect_equal(logLikelihood, stats2$logLikelihood)
|
|
expect_equal(logPerplexity, stats2$logPerplexity)
|
|
expect_equal(vocabSize, stats2$vocabSize)
|
|
expect_true(all.equal(vocabulary, stats2$vocabulary))
|
|
expect_true(is.na(stats2$trainingLogLikelihood))
|
|
expect_true(is.na(stats2$logPrior))
|
|
|
|
unlink(modelPath)
|
|
})
|
|
|
|
test_that("spark.posterior and spark.perplexity", {
|
|
text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
|
|
model <- spark.lda(text, features = "value", k = 3)
|
|
|
|
# Assert perplexities are equal
|
|
stats <- summary(model)
|
|
logPerplexity <- spark.perplexity(model, text)
|
|
expect_equal(logPerplexity, stats$logPerplexity)
|
|
|
|
# Assert the sum of every topic distribution is equal to 1
|
|
posterior <- spark.posterior(model, text)
|
|
local.posterior <- collect(posterior)$topicDistribution
|
|
expect_equal(length(local.posterior), sum(unlist(local.posterior)))
|
|
})
|
|
|
|
sparkR.session.stop()
|