2015-07-20 23:49:38 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
|
|
|
# Unit tests for MLlib wrapper functions in SparkR.
library(testthat)

context("MLlib functions")

# Tests for MLlib functions in SparkR
# Start a Spark session without Hive support; it is shared by every test below.
sparkSession <- sparkR.session(enableHiveSupport = FALSE)
|
2015-07-20 23:49:38 -04:00
|
|
|
|
2016-09-07 06:24:03 -04:00
|
|
|
# Resolve a path relative to the Spark home directory, as reported by the
# "spark.home" configuration entry of the running Spark session.
absoluteSparkPath <- function(x) {
  file.path(sparkR.conf("spark.home"), x)
}
|
|
|
|
|
2016-04-30 02:13:03 -04:00
|
|
|
test_that("formula of spark.glm", {
  training <- suppressWarnings(createDataFrame(iris))

  # Dot expansion, term removal and suppressed intercept, checked against the
  # native stats::glm fit on the same data (directly calling the Spark API).
  fit <- spark.glm(training, Sepal_Width ~ . - Species + 0)
  predicted <- collect(select(predict(fit, training), "prediction"))
  expected <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # Feature interaction terms, checked against native glm.
  fit <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
  predicted <- collect(select(predict(fit, training), "prediction"))
  expected <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # Long column names must not break formula handling.
  training <- suppressWarnings(createDataFrame(iris))
  training$LongLongLongLongLongName <- training$Sepal_Width
  training$VeryLongLongLongLonLongName <- training$Sepal_Length
  training$AnotherLongLongLongLongName <- training$Species
  fit <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
                             AnotherLongLongLongLongName)
  predicted <- collect(select(predict(fit, training), "prediction"))
  expected <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)
})
|
|
|
|
|
|
|
|
test_that("spark.glm and predict", {
  training <- suppressWarnings(createDataFrame(iris))

  # Gaussian family, checked against the native glm fit.
  fit <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
  prediction <- predict(fit, training)
  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
  predicted <- collect(select(prediction, "prediction"))
  expected <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # Poisson family with identity link, checked against the native glm fit.
  fit <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                   family = poisson(link = identity))
  prediction <- predict(fit, training)
  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
  predicted <- collect(select(prediction, "prediction"))
  expected <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
                                           data = iris, family = poisson(link = identity)),
                                       iris))
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # stats::predict must still be callable despite SparkR masking.
  x <- rnorm(15)
  y <- x + rnorm(15)
  expect_equal(length(predict(lm(y ~ x))), 15)
})
|
|
|
|
|
|
|
|
test_that("spark.glm summary", {
  # Shared check: the scalar goodness-of-fit fields of a spark.glm summary
  # must match the corresponding fields of a native stats::glm summary.
  # Extracted because the identical six comparisons were repeated verbatim
  # for each family tested below.
  expectGlmStatsMatch <- function(stats, rStats) {
    expect_equal(stats$dispersion, rStats$dispersion)
    expect_equal(stats$null.deviance, rStats$null.deviance)
    expect_equal(stats$deviance, rStats$deviance)
    expect_equal(stats$df.null, rStats$df.null)
    expect_equal(stats$df.residual, rStats$df.residual)
    expect_equal(stats$aic, rStats$aic)
  }

  # gaussian family
  training <- suppressWarnings(createDataFrame(iris))
  stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))

  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

  coefs <- unlist(stats$coefficients)
  rCoefs <- unlist(rStats$coefficients)
  expect_true(all(abs(rCoefs - coefs) < 1e-4))
  expect_true(all(
    rownames(stats$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
  expectGlmStatsMatch(stats, rStats)

  # The printed summary should look like the native glm summary output.
  out <- capture.output(print(stats))
  expect_match(out[2], "Deviance Residuals:")
  expect_true(any(grepl("AIC: 59.22", out)))

  # binomial family
  df <- suppressWarnings(createDataFrame(iris))
  training <- df[df$Species %in% c("versicolor", "virginica"), ]
  stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
                             family = binomial(link = "logit")))

  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
  rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                        family = binomial(link = "logit")))

  coefs <- unlist(stats$coefficients)
  rCoefs <- unlist(rStats$coefficients)
  expect_true(all(abs(rCoefs - coefs) < 1e-4))
  expect_true(all(
    rownames(stats$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
  expectGlmStatsMatch(stats, rStats)

  # Test spark.glm works with weighted dataset
  a1 <- c(0, 1, 2, 3)
  a2 <- c(5, 2, 1, 3)
  w <- c(1, 2, 3, 4)
  b <- c(1, 0, 1, 0)
  data <- as.data.frame(cbind(a1, a2, w, b))
  df <- suppressWarnings(createDataFrame(data))

  stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
  rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))

  coefs <- unlist(stats$coefficients)
  rCoefs <- unlist(rStats$coefficients)
  # Looser tolerance: weighted fits agree less tightly than unweighted ones.
  expect_true(all(abs(rCoefs - coefs) < 1e-3))
  expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
  expectGlmStatsMatch(stats, rStats)

  # Test summary works on base GLM models
  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
  baseSummary <- summary(baseModel)
  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)

  # Test spark.glm works with regularization parameter
  data <- as.data.frame(cbind(a1, a2, b))
  df <- suppressWarnings(createDataFrame(data))
  regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
  expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
})
|
|
|
|
|
|
|
|
test_that("spark.glm save/load", {
  training <- suppressWarnings(createDataFrame(iris))
  fit <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
  s <- summary(fit)

  modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")
  write.ml(fit, modelPath)
  # Writing to an existing path must fail unless overwrite is requested.
  expect_error(write.ml(fit, modelPath))
  write.ml(fit, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  s2 <- summary(reloaded)

  # The reloaded model must expose an identical summary ...
  expect_equal(s$coefficients, s2$coefficients)
  expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
  expect_equal(s$dispersion, s2$dispersion)
  expect_equal(s$null.deviance, s2$null.deviance)
  expect_equal(s$deviance, s2$deviance)
  expect_equal(s$df.null, s2$df.null)
  expect_equal(s$df.residual, s2$df.residual)
  expect_equal(s$aic, s2$aic)
  expect_equal(s$iter, s2$iter)
  # ... except for the is.loaded flag, which marks deserialized models.
  expect_true(!s$is.loaded)
  expect_true(s2$is.loaded)

  unlink(modelPath)
})
|
|
|
|
|
|
|
|
|
|
|
|
|
2016-04-12 13:51:07 -04:00
|
|
|
test_that("formula of glm", {
  training <- suppressWarnings(createDataFrame(iris))

  # Dot expansion, term removal and suppressed intercept via the glm()
  # generic dispatched on a SparkDataFrame, checked against native glm.
  fit <- glm(Sepal_Width ~ . - Species + 0, data = training)
  predicted <- collect(select(predict(fit, training), "prediction"))
  expected <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # Feature interaction terms, checked against native glm.
  fit <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
  predicted <- collect(select(predict(fit, training), "prediction"))
  expected <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # Long column names must not break formula handling.
  training <- suppressWarnings(createDataFrame(iris))
  training$LongLongLongLongLongName <- training$Sepal_Width
  training$VeryLongLongLongLonLongName <- training$Sepal_Length
  training$AnotherLongLongLongLongName <- training$Species
  fit <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
             data = training)
  predicted <- collect(select(predict(fit, training), "prediction"))
  expected <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)
})
|
|
|
|
|
2016-04-12 13:51:07 -04:00
|
|
|
test_that("glm and predict", {
  training <- suppressWarnings(createDataFrame(iris))

  # Gaussian family via the glm() generic on a SparkDataFrame.
  fit <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
  prediction <- predict(fit, training)
  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
  predicted <- collect(select(prediction, "prediction"))
  expected <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # Poisson family with identity link.
  fit <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
             family = poisson(link = identity))
  prediction <- predict(fit, training)
  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
  predicted <- collect(select(prediction, "prediction"))
  expected <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
                                           data = iris, family = poisson(link = identity)),
                                       iris))
  expect_true(all(abs(expected - predicted) < 1e-6), expected - predicted)

  # stats::predict must still be callable despite SparkR masking.
  x <- rnorm(15)
  y <- x + rnorm(15)
  expect_equal(length(predict(lm(y ~ x))), 15)
})
|
2016-02-23 18:42:58 -05:00
|
|
|
|
2016-04-15 11:23:51 -04:00
|
|
|
test_that("glm summary", {
  # Shared check: the scalar goodness-of-fit fields of a SparkR glm summary
  # must match the corresponding fields of a native stats::glm summary.
  # Extracted because the identical six comparisons were repeated verbatim
  # for each family tested below.
  expectGlmStatsMatch <- function(stats, rStats) {
    expect_equal(stats$dispersion, rStats$dispersion)
    expect_equal(stats$null.deviance, rStats$null.deviance)
    expect_equal(stats$deviance, rStats$deviance)
    expect_equal(stats$df.null, rStats$df.null)
    expect_equal(stats$df.residual, rStats$df.residual)
    expect_equal(stats$aic, rStats$aic)
  }

  # gaussian family
  training <- suppressWarnings(createDataFrame(iris))
  stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))

  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

  coefs <- unlist(stats$coefficients)
  rCoefs <- unlist(rStats$coefficients)
  expect_true(all(abs(rCoefs - coefs) < 1e-4))
  expect_true(all(
    rownames(stats$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
  expectGlmStatsMatch(stats, rStats)

  # binomial family
  df <- suppressWarnings(createDataFrame(iris))
  training <- df[df$Species %in% c("versicolor", "virginica"), ]
  stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
                       family = binomial(link = "logit")))

  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
  rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                        family = binomial(link = "logit")))

  coefs <- unlist(stats$coefficients)
  rCoefs <- unlist(rStats$coefficients)
  expect_true(all(abs(rCoefs - coefs) < 1e-4))
  expect_true(all(
    rownames(stats$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
  expectGlmStatsMatch(stats, rStats)

  # Test summary works on base GLM models
  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
  baseSummary <- summary(baseModel)
  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
})
|
|
|
|
|
2016-04-29 12:42:54 -04:00
|
|
|
test_that("glm save/load", {
  training <- suppressWarnings(createDataFrame(iris))
  fit <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
  s <- summary(fit)

  modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
  write.ml(fit, modelPath)
  # Writing to an existing path must fail unless overwrite is requested.
  expect_error(write.ml(fit, modelPath))
  write.ml(fit, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  s2 <- summary(reloaded)

  # The reloaded model must expose an identical summary ...
  expect_equal(s$coefficients, s2$coefficients)
  expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
  expect_equal(s$dispersion, s2$dispersion)
  expect_equal(s$null.deviance, s2$null.deviance)
  expect_equal(s$deviance, s2$deviance)
  expect_equal(s$df.null, s2$df.null)
  expect_equal(s$df.residual, s2$df.residual)
  expect_equal(s$aic, s2$aic)
  expect_equal(s$iter, s2$iter)
  # ... except for the is.loaded flag, which marks deserialized models.
  expect_true(!s$is.loaded)
  expect_true(s2$is.loaded)

  unlink(modelPath)
})
|
|
|
|
|
2016-04-30 02:13:03 -04:00
|
|
|
test_that("spark.kmeans", {
  # Drop the label column so every remaining column is numeric.
  newIris <- iris
  newIris$Species <- NULL
  training <- suppressWarnings(createDataFrame(newIris))

  take(training, 1)

  kmModel <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
  firstPrediction <- take(select(predict(kmModel, training), "prediction"), 1)
  expect_equal(typeof(firstPrediction$prediction), "integer")
  expect_equal(firstPrediction$prediction, 1)

  # stats::kmeans must still be callable despite SparkR masking.
  statsModel <- kmeans(x = newIris, centers = 2)
  expect_equal(sort(unique(statsModel$cluster)), c(1, 2))

  # fitted() exposes the training-set cluster assignments.
  fitted.model <- fitted(kmModel)
  expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))

  # summary() exposes the cluster assignment DataFrame.
  summary.model <- summary(kmModel)
  cluster <- summary.model$cluster
  expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))

  # Save/load roundtrip preserves the summary contents.
  modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
  write.ml(kmModel, modelPath)
  expect_error(write.ml(kmModel, modelPath))
  write.ml(kmModel, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  summary2 <- summary(reloaded)
  expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
  expect_equal(summary.model$coefficients, summary2$coefficients)
  expect_true(!summary.model$is.loaded)
  expect_true(summary2$is.loaded)

  unlink(modelPath)
})
|
2016-03-22 17:16:51 -04:00
|
|
|
|
2016-08-24 14:18:10 -04:00
|
|
|
test_that("spark.mlp", {
  df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                source = "libsvm")
  mlpModel <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs",
                        maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)

  # summary() exposes label count, layer sizes and the flattened weight list.
  mlpSummary <- summary(mlpModel)
  expect_equal(mlpSummary$labelCount, 3)
  expect_equal(mlpSummary$layers, c(4, 5, 4, 3))
  expect_equal(length(mlpSummary$weights), 64)
  expect_equal(head(mlpSummary$weights, 5),
               list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
               tolerance = 1e-6)

  # predict() on the training data.
  mlpTestDF <- df
  mlpPredictions <- collect(select(predict(mlpModel, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 6), c(0, 1, 1, 1, 1, 1))

  # Save/load roundtrip preserves the model structure.
  modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
  write.ml(mlpModel, modelPath)
  expect_error(write.ml(mlpModel, modelPath))
  write.ml(mlpModel, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  summary2 <- summary(reloaded)

  expect_equal(summary2$labelCount, 3)
  expect_equal(summary2$layers, c(4, 5, 4, 3))
  expect_equal(length(summary2$weights), 64)

  unlink(modelPath)
})
|
|
|
|
|
2016-04-30 11:37:56 -04:00
|
|
|
test_that("spark.naiveBayes", {
  # R code to reproduce the expected values below.
  # We do not support instance weights yet, so the frequencies are ignored.
  #
  #' library(e1071)
  #' t <- as.data.frame(Titanic)
  #' t1 <- t[t$Freq > 0, -5]
  #' m <- naiveBayes(Survived ~ ., data = t1)
  #' m
  #' predict(m, t1)
  #
  # -- output of 'm'
  #
  # A-priori probabilities:
  # Y
  #        No       Yes
  # 0.4166667 0.5833333
  #
  # Conditional probabilities:
  #      Class
  # Y           1st       2nd       3rd      Crew
  #   No  0.2000000 0.2000000 0.4000000 0.2000000
  #   Yes 0.2857143 0.2857143 0.2857143 0.1428571
  #
  #      Sex
  # Y     Male Female
  #   No   0.5    0.5
  #   Yes  0.5    0.5
  #
  #      Age
  # Y         Child     Adult
  #   No  0.2000000 0.8000000
  #   Yes 0.4285714 0.5714286
  #
  # -- output of 'predict(m, t1)'
  #
  # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No
  #

  t <- as.data.frame(Titanic)
  t1 <- t[t$Freq > 0, -5]
  titanicDF <- suppressWarnings(createDataFrame(t1))
  nbModel <- spark.naiveBayes(titanicDF, Survived ~ ., smoothing = 0.0)
  nbSummary <- summary(nbModel)
  expect_equal(as.double(nbSummary$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
  expect_equal(sum(nbSummary$apriori), 1)
  expect_equal(as.double(nbSummary$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
  predictions <- collect(select(predict(nbModel, titanicDF), "prediction"))
  expect_equal(predictions$prediction,
               c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
                 "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
                 "Yes", "Yes", "No", "No"))

  # Save/load roundtrip preserves the priors and conditional tables.
  modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
  write.ml(nbModel, modelPath)
  expect_error(write.ml(nbModel, modelPath))
  write.ml(nbModel, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  reloadedSummary <- summary(reloaded)
  expect_equal(nbSummary$apriori, reloadedSummary$apriori)
  expect_equal(nbSummary$tables, reloadedSummary$tables)

  unlink(modelPath)

  # e1071::naiveBayes must still be callable despite SparkR masking.
  if (requireNamespace("e1071", quietly = TRUE)) {
    expect_error(baseNb <- e1071::naiveBayes(Survived ~ ., data = t1), NA)
    expect_equal(as.character(predict(baseNb, t1[1, ])), "Yes")
  }
})
|
2016-03-25 01:29:34 -04:00
|
|
|
|
2016-04-30 02:13:03 -04:00
|
|
|
test_that("spark.survreg", {
  # R code to reproduce the expected values below.
  #
  #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
  #'               x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
  #' library(survival)
  #' model <- survreg(Surv(time, status) ~ x + sex, rData)
  #' summary(model)
  #' predict(model, data)
  #
  # -- output of 'summary(model)'
  #
  #              Value Std. Error     z        p
  # (Intercept)  1.315      0.270  4.88 1.07e-06
  # x           -0.190      0.173 -1.10 2.72e-01
  # sex         -0.253      0.329 -0.77 4.42e-01
  # Log(scale)  -1.160      0.396 -2.93 3.41e-03
  #
  # -- output of 'predict(model, data)'
  #
  #        1        2        3        4        5        6        7
  # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
  #
  data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
               list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
  survDF <- createDataFrame(data, c("time", "status", "x", "sex"))
  survModel <- spark.survreg(survDF, Surv(time, status) ~ x + sex)
  fitSummary <- summary(survModel)
  coefs <- as.vector(fitSummary$coefficients[, 1])
  rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
  expect_equal(coefs, rCoefs, tolerance = 1e-4)
  expect_true(all(
    rownames(fitSummary$coefficients) ==
    c("(Intercept)", "x", "sex", "Log(scale)")))
  predictions <- collect(select(predict(survModel, survDF), "prediction"))
  expect_equal(predictions$prediction,
               c(3.724591, 2.545368, 3.079035, 3.079035,
                 2.390146, 2.891269, 2.891269), tolerance = 1e-4)

  # Save/load roundtrip preserves the coefficients.
  modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp")
  write.ml(survModel, modelPath)
  expect_error(write.ml(survModel, modelPath))
  write.ml(survModel, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  reloadedSummary <- summary(reloaded)
  coefs2 <- as.vector(reloadedSummary$coefficients[, 1])
  expect_equal(coefs, coefs2)
  expect_equal(rownames(fitSummary$coefficients), rownames(reloadedSummary$coefficients))

  unlink(modelPath)

  # survival::survreg must still be callable despite SparkR masking.
  if (requireNamespace("survival", quietly = TRUE)) {
    rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
                  x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
    expect_error(
      baseFit <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex,
                                   data = rData),
      NA)
    expect_equal(predict(baseFit, rData)[[1]], 3.724591, tolerance = 1e-4)
  }
})
|
2016-07-17 22:02:21 -04:00
|
|
|
|
2016-08-17 09:15:04 -04:00
|
|
|
test_that("spark.isotonicRegression", {
  label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
  feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
  weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
  data <- as.data.frame(cbind(label, feature, weight))
  isoDF <- suppressWarnings(createDataFrame(data))

  isoModel <- spark.isoreg(isoDF, label ~ feature, isotonic = FALSE,
                           weightCol = "weight")
  # Only one variable is allowed on the right hand side of the formula.
  expect_error(model2 <- spark.isoreg(isoDF, ~., isotonic = FALSE))
  result <- summary(isoModel)
  expect_equal(result$predictions, list(7, 5, 4, 4, 1))

  # Prediction interpolates/extrapolates the fitted antitonic step function.
  predict_data <- list(list(-2.0), list(-1.0), list(0.5),
                       list(0.75), list(1.0), list(2.0), list(9.0))
  predict_df <- createDataFrame(predict_data, c("feature"))
  predict_result <- collect(select(predict(isoModel, predict_df), "prediction"))
  expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0))

  # Save/load roundtrip preserves the summary.
  modelPath <- tempfile(pattern = "spark-isotonicRegression", fileext = ".tmp")
  write.ml(isoModel, modelPath)
  expect_error(write.ml(isoModel, modelPath))
  write.ml(isoModel, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  expect_equal(result, summary(model2))

  unlink(modelPath)
})
|
|
|
|
|
2016-08-17 14:18:33 -04:00
|
|
|
test_that("spark.gaussianMixture", {
  # R code to reproduce the expected values below.
  # nolint start
  #' library(mvtnorm)
  #' set.seed(1)
  #' a <- rmvnorm(7, c(0, 0))
  #' b <- rmvnorm(8, c(10, 10))
  #' data <- rbind(a, b)
  #' model <- mvnormalmixEM(data, k = 2)
  #' model$lambda
  #
  # [1] 0.4666667 0.5333333
  #
  #' model$mu
  #
  # [1] 0.11731091 -0.06192351
  # [1] 10.363673  9.897081
  #
  #' model$sigma
  #
  # [[1]]
  #            [,1]       [,2]
  # [1,] 0.62049934 0.06880802
  # [2,] 0.06880802 1.27431874
  #
  # [[2]]
  #           [,1]     [,2]
  # [1,] 0.2961543 0.160783
  # [2,] 0.1607830 1.008878
  # nolint end
  data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
               list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
               list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
               list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
               list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
               list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
               list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
               list(9.5218499, 10.4179416))
  gmmDF <- createDataFrame(data, c("x1", "x2"))
  gmm <- spark.gaussianMixture(gmmDF, ~ x1 + x2, k = 2)
  gmmSummary <- summary(gmm)
  rLambda <- c(0.4666667, 0.5333333)
  rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
  rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
              0.2961543, 0.160783, 0.1607830, 1.008878)
  expect_equal(gmmSummary$lambda, rLambda, tolerance = 1e-3)
  expect_equal(unlist(gmmSummary$mu), rMu, tolerance = 1e-3)
  expect_equal(unlist(gmmSummary$sigma), rSigma, tolerance = 1e-3)
  predictions <- collect(select(predict(gmm, gmmDF), "prediction"))
  expect_equal(predictions$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))

  # Save/load roundtrip preserves the mixture parameters.
  modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
  write.ml(gmm, modelPath)
  expect_error(write.ml(gmm, modelPath))
  write.ml(gmm, modelPath, overwrite = TRUE)
  reloaded <- read.ml(modelPath)
  reloadedSummary <- summary(reloaded)
  expect_equal(gmmSummary$lambda, reloadedSummary$lambda)
  expect_equal(unlist(gmmSummary$mu), unlist(reloadedSummary$mu))
  expect_equal(unlist(gmmSummary$sigma), unlist(reloadedSummary$sigma))

  unlink(modelPath)
})
|
|
|
|
|
2016-08-18 08:33:52 -04:00
|
|
|
test_that("spark.lda with libsvm", {
  # Fit an LDA topic model on the bundled libsvm sample data with the EM
  # optimizer, then check the summary statistics and save/load round-trip.
  text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
  model <- spark.lda(text, optimizer = "em")

  stats <- summary(model, 10)
  isDistributed <- stats$isDistributed
  logLikelihood <- stats$logLikelihood
  logPerplexity <- stats$logPerplexity
  vocabSize <- stats$vocabSize
  vocabulary <- stats$vocabulary

  expect_false(isDistributed)
  # Log-likelihood of a probability model is non-positive; perplexity is
  # its non-negative counterpart. Both must be finite for a fitted model.
  expect_true(logLikelihood <= 0 && is.finite(logLikelihood))
  expect_true(logPerplexity >= 0 && is.finite(logPerplexity))
  expect_equal(vocabSize, 11)
  # libsvm input carries only feature indices, so no term vocabulary exists.
  expect_true(is.null(vocabulary))

  # Test model save/load
  modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
  write.ml(model, modelPath)
  # Writing to an existing path without overwrite must fail.
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  stats2 <- summary(model2)

  # The reloaded model must report identical summary statistics.
  expect_false(stats2$isDistributed)
  expect_equal(logLikelihood, stats2$logLikelihood)
  expect_equal(logPerplexity, stats2$logPerplexity)
  expect_equal(vocabSize, stats2$vocabSize)
  expect_equal(vocabulary, stats2$vocabulary)

  unlink(modelPath)
})
|
|
|
|
|
|
|
|
test_that("spark.lda with text input", {
  # Fit an LDA topic model on raw text input (one document per line) with the
  # online optimizer, then check summary statistics and save/load round-trip.
  text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
  model <- spark.lda(text, optimizer = "online", features = "value")

  stats <- summary(model)
  isDistributed <- stats$isDistributed
  logLikelihood <- stats$logLikelihood
  logPerplexity <- stats$logPerplexity
  vocabSize <- stats$vocabSize
  vocabulary <- stats$vocabulary

  expect_false(isDistributed)
  # Log-likelihood of a probability model is non-positive; perplexity is
  # its non-negative counterpart. Both must be finite for a fitted model.
  expect_true(logLikelihood <= 0 && is.finite(logLikelihood))
  expect_true(logPerplexity >= 0 && is.finite(logPerplexity))
  expect_equal(vocabSize, 10)
  # Text input provides the actual terms, so a vocabulary is available.
  expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))

  # Test model save/load
  modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
  write.ml(model, modelPath)
  # Writing to an existing path without overwrite must fail.
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  stats2 <- summary(model2)

  # The reloaded model must report identical summary statistics.
  expect_false(stats2$isDistributed)
  expect_equal(logLikelihood, stats2$logLikelihood)
  expect_equal(logPerplexity, stats2$logPerplexity)
  expect_equal(vocabSize, stats2$vocabSize)
  expect_equal(vocabulary, stats2$vocabulary)

  unlink(modelPath)
})
|
|
|
|
|
|
|
|
test_that("spark.posterior and spark.perplexity", {
  # Fit a 3-topic LDA model on the sample text corpus.
  corpus <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
  ldaModel <- spark.lda(corpus, features = "value", k = 3)

  # The perplexity reported by summary() must match the one computed
  # directly on the training data via spark.perplexity().
  modelSummary <- summary(ldaModel)
  expect_equal(spark.perplexity(ldaModel, corpus), modelSummary$logPerplexity)

  # Every document's topic distribution is a probability vector summing to 1,
  # so the grand total over all documents equals the number of documents.
  topicDist <- collect(spark.posterior(ldaModel, corpus))$topicDistribution
  expect_equal(length(topicDist), sum(unlist(topicDist)))
})
|
|
|
|
|
2016-08-19 17:24:09 -04:00
|
|
|
test_that("spark.als", {
  # Tiny explicit-feedback ratings matrix: (user, item, score) triples.
  ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
                  list(2, 1, 1.0), list(2, 2, 5.0))
  df <- createDataFrame(ratings, c("user", "item", "score"))
  model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
                     rank = 10, maxIter = 5, seed = 0, reg = 0.1)
  stats <- summary(model)
  expect_equal(stats$rank, 10)

  # Predict scores for unseen (user, item) pairs; seed = 0 keeps the
  # factorization deterministic, so exact values can be pinned.
  test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
  predictions <- collect(predict(model, test))
  expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
               tolerance = 1e-4)

  # Round-trip the model through disk and confirm the reloaded copy matches.
  modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
  write.ml(model, modelPath)
  # Writing to an existing path without overwrite must fail.
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  stats2 <- summary(model2)
  expect_equal(stats2$rating, "score")

  # Factor rows may come back in any order; sort both sides by id before
  # comparing ids and latent-feature vectors.
  expectSameFactors <- function(original, restored) {
    ordA <- order(original$id)
    ordB <- order(restored$id)
    expect_equal(original$id[ordA], restored$id[ordB])
    expect_equal(original$features[ordA], restored$features[ordB])
  }
  expectSameFactors(collect(stats$userFactors), collect(stats2$userFactors))
  expectSameFactors(collect(stats$itemFactors), collect(stats2$itemFactors))

  unlink(modelPath)
})
|
|
|
|
|
2016-09-03 15:26:30 -04:00
|
|
|
test_that("spark.kstest", {
  # One-sample Kolmogorov-Smirnov test: compare SparkR's results against
  # R's native ks.test() on the same sample.
  sample <- c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5)
  df <- createDataFrame(data.frame(test = sample))

  # Checks a SparkR KS-test summary against a native ks.test() result, and
  # that the printed summary carries the expected header line.
  checkAgainstNativeR <- function(sparkStats, nativeStats) {
    expect_equal(sparkStats$p.value, nativeStats$p.value, tolerance = 1e-4)
    expect_equal(sparkStats$statistic, unname(nativeStats$statistic), tolerance = 1e-4)
    expect_match(capture.output(sparkStats)[1], "Kolmogorov-Smirnov test summary:")
  }

  # Null distribution: standard normal (default parameters).
  stats <- summary(spark.kstest(df, "test", "norm"))
  checkAgainstNativeR(stats, ks.test(sample, "pnorm", alternative = "two.sided"))

  # Null distribution: normal with mean -0.5 and sd 1.
  stats <- summary(spark.kstest(df, "test", "norm", -0.5))
  checkAgainstNativeR(stats, ks.test(sample, "pnorm", -0.5, 1, alternative = "two.sided"))
})
|
|
|
|
|
2016-07-17 22:02:21 -04:00
|
|
|
# Tear down the Spark session started at the top of this test file.
sparkR.session.stop()
|