2015-07-20 23:49:38 -04:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
|
|
|
library(testthat)
|
|
|
|
|
|
|
|
context("MLlib functions")

# Tests for MLlib functions in SparkR

# Start a local Spark session (Hive support disabled) shared by all tests below.
sparkSession <- sparkR.session(enableHiveSupport = FALSE)
|
2015-07-20 23:49:38 -04:00
|
|
|
|
2016-09-07 06:24:03 -04:00
|
|
|
# Resolve `x` relative to the Spark home directory configured for the
# current Spark session (read from the "spark.home" conf key).
absoluteSparkPath <- function(x) {
  home <- sparkR.conf("spark.home")
  file.path(home, x)
}
|
|
|
|
|
2016-04-30 02:13:03 -04:00
|
|
|
test_that("formula of spark.glm", {
  trainDF <- suppressWarnings(createDataFrame(iris))

  # Dot terms, term removal and an explicit zero intercept, compared
  # against the native stats::glm() fit on the same data.
  fit <- spark.glm(trainDF, Sepal_Width ~ . - Species + 0)
  sparkPreds <- collect(select(predict(fit, trainDF), "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # Feature interaction terms vs native glm.
  fit <- spark.glm(trainDF, Sepal_Width ~ Species:Sepal_Length)
  sparkPreds <- collect(select(predict(fit, trainDF), "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # Long column names must survive formula parsing unchanged.
  trainDF <- suppressWarnings(createDataFrame(iris))
  trainDF$LongLongLongLongLongName <- trainDF$Sepal_Width
  trainDF$VeryLongLongLongLonLongName <- trainDF$Sepal_Length
  trainDF$AnotherLongLongLongLongName <- trainDF$Species
  fit <- spark.glm(trainDF, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
    AnotherLongLongLongLongName)
  sparkPreds <- collect(select(predict(fit, trainDF), "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)
})
|
|
|
|
|
|
|
|
test_that("spark.glm and predict", {
  trainDF <- suppressWarnings(createDataFrame(iris))

  # Gaussian family: Spark predictions should match base R's glm().
  fit <- spark.glm(trainDF, Sepal_Width ~ Sepal_Length + Species)
  preds <- predict(fit, trainDF)
  expect_equal(typeof(take(select(preds, "prediction"), 1)$prediction), "double")
  sparkPreds <- collect(select(preds, "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # Poisson family with an identity link.
  fit <- spark.glm(trainDF, Sepal_Width ~ Sepal_Length + Species,
                   family = poisson(link = identity))
  preds <- predict(fit, trainDF)
  expect_equal(typeof(take(select(preds, "prediction"), 1)$prediction), "double")
  sparkPreds <- collect(select(preds, "prediction"))
  basePreds <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
                                            data = iris,
                                            family = poisson(link = identity)), iris))
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # stats::predict must remain callable despite SparkR masking predict().
  x <- rnorm(15)
  y <- x + rnorm(15)
  expect_equal(length(predict(lm(y ~ x))), 15)
})
|
|
|
|
|
|
|
|
test_that("spark.glm summary", {
  # Gaussian family: the Spark summary should agree with base R's glm().
  trainDF <- suppressWarnings(createDataFrame(iris))
  summ <- summary(spark.glm(trainDF, Sepal_Width ~ Sepal_Length + Species))
  rSumm <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

  sparkCoefs <- unlist(summ$coefficients)
  baseCoefs <- unlist(rSumm$coefficients)
  expect_true(all(abs(baseCoefs - sparkCoefs) < 1e-4))
  expect_true(all(
    rownames(summ$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
  expect_equal(summ$dispersion, rSumm$dispersion)
  expect_equal(summ$null.deviance, rSumm$null.deviance)
  expect_equal(summ$deviance, rSumm$deviance)
  expect_equal(summ$df.null, rSumm$df.null)
  expect_equal(summ$df.residual, rSumm$df.residual)
  expect_equal(summ$aic, rSumm$aic)

  # The printed summary should resemble base R's glm summary output.
  out <- capture.output(print(summ))
  expect_match(out[2], "Deviance Residuals:")
  expect_true(any(grepl("AIC: 59.22", out)))

  # Binomial family restricted to two species.
  irisDF <- suppressWarnings(createDataFrame(iris))
  trainDF <- irisDF[irisDF$Species %in% c("versicolor", "virginica"), ]
  summ <- summary(spark.glm(trainDF, Species ~ Sepal_Length + Sepal_Width,
                            family = binomial(link = "logit")))

  rTrain <- iris[iris$Species %in% c("versicolor", "virginica"), ]
  rSumm <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTrain,
                       family = binomial(link = "logit")))

  sparkCoefs <- unlist(summ$coefficients)
  baseCoefs <- unlist(rSumm$coefficients)
  expect_true(all(abs(baseCoefs - sparkCoefs) < 1e-4))
  expect_true(all(
    rownames(summ$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
  expect_equal(summ$dispersion, rSumm$dispersion)
  expect_equal(summ$null.deviance, rSumm$null.deviance)
  expect_equal(summ$deviance, rSumm$deviance)
  expect_equal(summ$df.null, rSumm$df.null)
  expect_equal(summ$df.residual, rSumm$df.residual)
  expect_equal(summ$aic, rSumm$aic)

  # Weighted data: weightCol should mirror glm's `weights` argument.
  a1 <- c(0, 1, 2, 3)
  a2 <- c(5, 2, 1, 3)
  w <- c(1, 2, 3, 4)
  b <- c(1, 0, 1, 0)
  data <- as.data.frame(cbind(a1, a2, w, b))
  df <- createDataFrame(data)

  summ <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
  rSumm <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))

  sparkCoefs <- unlist(summ$coefficients)
  baseCoefs <- unlist(rSumm$coefficients)
  expect_true(all(abs(baseCoefs - sparkCoefs) < 1e-3))
  expect_true(all(rownames(summ$coefficients) == c("(Intercept)", "a1", "a2")))
  expect_equal(summ$dispersion, rSumm$dispersion)
  expect_equal(summ$null.deviance, rSumm$null.deviance)
  expect_equal(summ$deviance, rSumm$deviance)
  expect_equal(summ$df.null, rSumm$df.null)
  expect_equal(summ$df.residual, rSumm$df.residual)
  expect_equal(summ$aic, rSumm$aic)

  # summary() must still dispatch to stats::summary.glm for base models.
  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
  baseSummary <- summary(baseModel)
  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)

  # Regularization parameter is honored.
  data <- as.data.frame(cbind(a1, a2, b))
  df <- suppressWarnings(createDataFrame(data))
  regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
  expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result

  # Collinear data must not break the solver.
  A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
  b <- c(1, 2, 3, 4)
  data <- as.data.frame(cbind(A, b))
  df <- createDataFrame(data)
  summ <- summary(spark.glm(df, b ~ . - 1))
  sparkCoefs <- unlist(summ$coefficients)
  expect_true(all(abs(c(0.5, 0.25) - sparkCoefs) < 1e-4))
})
|
|
|
|
|
|
|
|
test_that("spark.glm save/load", {
  trainDF <- suppressWarnings(createDataFrame(iris))
  fit <- spark.glm(trainDF, Sepal_Width ~ Sepal_Length + Species)
  summ <- summary(fit)

  # Writing twice to the same path must fail unless overwrite = TRUE.
  modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")
  write.ml(fit, modelPath)
  expect_error(write.ml(fit, modelPath))
  write.ml(fit, modelPath, overwrite = TRUE)
  fit2 <- read.ml(modelPath)
  summ2 <- summary(fit2)

  # A reloaded model must expose an identical summary (except is.loaded).
  expect_equal(summ$coefficients, summ2$coefficients)
  expect_equal(rownames(summ$coefficients), rownames(summ2$coefficients))
  expect_equal(summ$dispersion, summ2$dispersion)
  expect_equal(summ$null.deviance, summ2$null.deviance)
  expect_equal(summ$deviance, summ2$deviance)
  expect_equal(summ$df.null, summ2$df.null)
  expect_equal(summ$df.residual, summ2$df.residual)
  expect_equal(summ$aic, summ2$aic)
  expect_equal(summ$iter, summ2$iter)
  expect_true(!summ$is.loaded)
  expect_true(summ2$is.loaded)

  unlink(modelPath)
})
|
|
|
|
|
|
|
|
|
|
|
|
|
2016-04-12 13:51:07 -04:00
|
|
|
test_that("formula of glm", {
  trainDF <- suppressWarnings(createDataFrame(iris))

  # Dot terms, term removal and a zero intercept vs native stats::glm.
  fit <- glm(Sepal_Width ~ . - Species + 0, data = trainDF)
  sparkPreds <- collect(select(predict(fit, trainDF), "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # Feature interaction terms vs native glm.
  fit <- glm(Sepal_Width ~ Species:Sepal_Length, data = trainDF)
  sparkPreds <- collect(select(predict(fit, trainDF), "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # Long column names must survive formula parsing unchanged.
  trainDF <- suppressWarnings(createDataFrame(iris))
  trainDF$LongLongLongLongLongName <- trainDF$Sepal_Width
  trainDF$VeryLongLongLongLonLongName <- trainDF$Sepal_Length
  trainDF$AnotherLongLongLongLongName <- trainDF$Species
  fit <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
             data = trainDF)
  sparkPreds <- collect(select(predict(fit, trainDF), "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)
})
|
|
|
|
|
2016-04-12 13:51:07 -04:00
|
|
|
test_that("glm and predict", {
  trainDF <- suppressWarnings(createDataFrame(iris))

  # Gaussian family: Spark predictions should match base R's glm().
  fit <- glm(Sepal_Width ~ Sepal_Length + Species, data = trainDF)
  preds <- predict(fit, trainDF)
  expect_equal(typeof(take(select(preds, "prediction"), 1)$prediction), "double")
  sparkPreds <- collect(select(preds, "prediction"))
  basePreds <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # Poisson family with an identity link.
  fit <- glm(Sepal_Width ~ Sepal_Length + Species, data = trainDF,
             family = poisson(link = identity))
  preds <- predict(fit, trainDF)
  expect_equal(typeof(take(select(preds, "prediction"), 1)$prediction), "double")
  sparkPreds <- collect(select(preds, "prediction"))
  basePreds <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
                                            data = iris,
                                            family = poisson(link = identity)), iris))
  expect_true(all(abs(basePreds - sparkPreds) < 1e-6), basePreds - sparkPreds)

  # stats::predict must remain callable despite SparkR masking predict().
  x <- rnorm(15)
  y <- x + rnorm(15)
  expect_equal(length(predict(lm(y ~ x))), 15)
})
|
2016-02-23 18:42:58 -05:00
|
|
|
|
2016-04-15 11:23:51 -04:00
|
|
|
test_that("glm summary", {
  # Gaussian family: the Spark summary should agree with base R's glm().
  trainDF <- suppressWarnings(createDataFrame(iris))
  summ <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = trainDF))
  rSumm <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

  sparkCoefs <- unlist(summ$coefficients)
  baseCoefs <- unlist(rSumm$coefficients)
  expect_true(all(abs(baseCoefs - sparkCoefs) < 1e-4))
  expect_true(all(
    rownames(summ$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
  expect_equal(summ$dispersion, rSumm$dispersion)
  expect_equal(summ$null.deviance, rSumm$null.deviance)
  expect_equal(summ$deviance, rSumm$deviance)
  expect_equal(summ$df.null, rSumm$df.null)
  expect_equal(summ$df.residual, rSumm$df.residual)
  expect_equal(summ$aic, rSumm$aic)

  # Binomial family restricted to two species.
  irisDF <- suppressWarnings(createDataFrame(iris))
  trainDF <- irisDF[irisDF$Species %in% c("versicolor", "virginica"), ]
  summ <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = trainDF,
                      family = binomial(link = "logit")))

  rTrain <- iris[iris$Species %in% c("versicolor", "virginica"), ]
  rSumm <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTrain,
                       family = binomial(link = "logit")))

  sparkCoefs <- unlist(summ$coefficients)
  baseCoefs <- unlist(rSumm$coefficients)
  expect_true(all(abs(baseCoefs - sparkCoefs) < 1e-4))
  expect_true(all(
    rownames(summ$coefficients) ==
    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
  expect_equal(summ$dispersion, rSumm$dispersion)
  expect_equal(summ$null.deviance, rSumm$null.deviance)
  expect_equal(summ$deviance, rSumm$deviance)
  expect_equal(summ$df.null, rSumm$df.null)
  expect_equal(summ$df.residual, rSumm$df.residual)
  expect_equal(summ$aic, rSumm$aic)

  # summary() must still dispatch to stats::summary.glm for base models.
  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
  baseSummary <- summary(baseModel)
  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
})
|
|
|
|
|
2016-04-29 12:42:54 -04:00
|
|
|
test_that("glm save/load", {
  trainDF <- suppressWarnings(createDataFrame(iris))
  fit <- glm(Sepal_Width ~ Sepal_Length + Species, data = trainDF)
  summ <- summary(fit)

  # Writing twice to the same path must fail unless overwrite = TRUE.
  modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
  write.ml(fit, modelPath)
  expect_error(write.ml(fit, modelPath))
  write.ml(fit, modelPath, overwrite = TRUE)
  fit2 <- read.ml(modelPath)
  summ2 <- summary(fit2)

  # A reloaded model must expose an identical summary (except is.loaded).
  expect_equal(summ$coefficients, summ2$coefficients)
  expect_equal(rownames(summ$coefficients), rownames(summ2$coefficients))
  expect_equal(summ$dispersion, summ2$dispersion)
  expect_equal(summ$null.deviance, summ2$null.deviance)
  expect_equal(summ$deviance, summ2$deviance)
  expect_equal(summ$df.null, summ2$df.null)
  expect_equal(summ$df.residual, summ2$df.residual)
  expect_equal(summ$aic, summ2$aic)
  expect_equal(summ$iter, summ2$iter)
  expect_true(!summ$is.loaded)
  expect_true(summ2$is.loaded)

  unlink(modelPath)
})
|
|
|
|
|
2016-04-30 02:13:03 -04:00
|
|
|
test_that("spark.kmeans", {
  newIris <- iris
  newIris$Species <- NULL
  trainDF <- suppressWarnings(createDataFrame(newIris))

  take(trainDF, 1)

  model <- spark.kmeans(data = trainDF, ~ ., k = 2, maxIter = 10, initMode = "random")
  firstPred <- take(select(predict(model, trainDF), "prediction"), 1)
  expect_equal(typeof(firstPred$prediction), "integer")
  expect_equal(firstPred$prediction, 1)

  # stats::kmeans must remain callable despite SparkR masking.
  statsModel <- kmeans(x = newIris, centers = 2)
  expect_equal(sort(unique(statsModel$cluster)), c(1, 2))

  # fitted() returns the cluster assignments of the training data.
  fittedDF <- fitted(model)
  expect_equal(sort(collect(distinct(select(fittedDF, "prediction")))$prediction), c(0, 1))

  # summary() exposes the cluster assignments and the number of clusters.
  modelSumm <- summary(model)
  clusterDF <- modelSumm$cluster
  k <- modelSumm$k
  expect_equal(k, 2)
  expect_equal(sort(collect(distinct(select(clusterDF, "prediction")))$prediction), c(0, 1))

  # Save/load round trip.
  modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
  write.ml(model, modelPath)
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  summ2 <- summary(model2)
  expect_equal(sort(unlist(modelSumm$size)), sort(unlist(summ2$size)))
  expect_equal(modelSumm$coefficients, summ2$coefficients)
  expect_true(!modelSumm$is.loaded)
  expect_true(summ2$is.loaded)

  unlink(modelPath)
})
|
2016-03-22 17:16:51 -04:00
|
|
|
|
2016-08-24 14:18:10 -04:00
|
|
|
test_that("spark.mlp", {
  df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                source = "libsvm")
  model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
                     solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)

  # summary() reports the network topology and the flattened weights.
  summ <- summary(model)
  expect_equal(summ$numOfInputs, 4)
  expect_equal(summ$numOfOutputs, 3)
  expect_equal(summ$layers, c(4, 5, 4, 3))
  expect_equal(length(summ$weights), 64)
  expect_equal(head(summ$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
               tolerance = 1e-6)

  # predict() yields string class labels.
  mlpTestDF <- df
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))

  # Save/load round trip preserves the topology.
  modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
  write.ml(model, modelPath)
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  summ2 <- summary(model2)

  expect_equal(summ2$numOfInputs, 4)
  expect_equal(summ2$numOfOutputs, 3)
  expect_equal(summ2$layers, c(4, 5, 4, 3))
  expect_equal(length(summ2$weights), 64)

  unlink(modelPath)

  # Default parameters.
  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 10),
               c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

  # Illegal `layers` arguments are rejected with a clear message.
  expect_error(spark.mlp(df, label ~ features, layers = NULL),
               "layers must be a integer vector with length > 1.")
  expect_error(spark.mlp(df, label ~ features, layers = c()),
               "layers must be a integer vector with length > 1.")
  expect_error(spark.mlp(df, label ~ features, layers = c(3)),
               "layers must be a integer vector with length > 1.")

  # Random seed: default seed.
  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 10),
               c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

  # Random seed: seed equals 10.
  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 10),
               c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

  # initialWeights given as integers.
  model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
    c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 10),
               c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

  # initialWeights given as doubles.
  model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
    c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0))
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 10),
               c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

  # Without initialWeights.
  model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2)
  mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
  expect_equal(head(mlpPredictions$prediction, 10),
               c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "2.0", "1.0", "0.0"))

  # Formula syntax with explicit feature columns.
  df <- suppressWarnings(createDataFrame(iris))
  model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
                     layers = c(4, 3))
  summ <- summary(model)
  expect_equal(summ$numOfInputs, 4)
  expect_equal(summ$numOfOutputs, 3)
  expect_equal(summ$layers, c(4, 3))
  expect_equal(length(summ$weights), 15)
  expect_equal(head(summ$weights, 5), list(-1.1957257, -5.2693685, 7.4489734, -6.3751413,
               -10.2376130), tolerance = 1e-6)
})
|
|
|
|
|
2016-04-30 11:37:56 -04:00
|
|
|
test_that("spark.naiveBayes", {
  # Reference results produced with e1071. Instance weights are not
  # supported yet, so the frequencies are ignored:
  #
  #' library(e1071)
  #' t <- as.data.frame(Titanic)
  #' t1 <- t[t$Freq > 0, -5]
  #' m <- naiveBayes(Survived ~ ., data = t1)
  #' m
  #' predict(m, t1)
  #
  # -- output of 'm'
  #
  # A-priori probabilities:
  # Y
  #        No       Yes
  # 0.4166667 0.5833333
  #
  # Conditional probabilities:
  #      Class
  # Y           1st       2nd       3rd      Crew
  #   No  0.2000000 0.2000000 0.4000000 0.2000000
  #   Yes 0.2857143 0.2857143 0.2857143 0.1428571
  #
  #      Sex
  # Y     Male Female
  #   No   0.5    0.5
  #   Yes  0.5    0.5
  #
  #      Age
  # Y         Child     Adult
  #   No  0.2000000 0.8000000
  #   Yes 0.4285714 0.5714286
  #
  # -- output of 'predict(m, t1)'
  #
  # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No

  t <- as.data.frame(Titanic)
  t1 <- t[t$Freq > 0, -5]
  df <- suppressWarnings(createDataFrame(t1))
  nbModel <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
  summ <- summary(nbModel)
  expect_equal(as.double(summ$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
  expect_equal(sum(summ$apriori), 1)
  expect_equal(as.double(summ$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
  preds <- collect(select(predict(nbModel, df), "prediction"))
  expect_equal(preds$prediction,
               c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
                 "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
                 "Yes", "Yes", "No", "No"))

  # Save/load round trip.
  modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
  write.ml(nbModel, modelPath)
  expect_error(write.ml(nbModel, modelPath))
  write.ml(nbModel, modelPath, overwrite = TRUE)
  nbModel2 <- read.ml(modelPath)
  summ2 <- summary(nbModel2)
  expect_equal(summ$apriori, summ2$apriori)
  expect_equal(summ$tables, summ2$tables)

  unlink(modelPath)

  # e1071::naiveBayes must remain callable despite SparkR masking.
  if (requireNamespace("e1071", quietly = TRUE)) {
    expect_error(nbModel <- e1071::naiveBayes(Survived ~ ., data = t1), NA)
    expect_equal(as.character(predict(nbModel, t1[1, ])), "Yes")
  }

  # A numeric response column is also accepted.
  t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1)
  t2 <- t1[-4]
  df <- suppressWarnings(createDataFrame(t2))
  nbModel <- spark.naiveBayes(df, NumericSurvived ~ ., smoothing = 0.0)
  summ <- summary(nbModel)
  expect_equal(as.double(summ$apriori[1, 1]), 0.5833333, tolerance = 1e-6)
  expect_equal(sum(summ$apriori), 1)
  expect_equal(as.double(summ$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6)
})
|
2016-03-25 01:29:34 -04:00
|
|
|
|
2016-04-30 02:13:03 -04:00
|
|
|
test_that("spark.survreg", {
  # Reference results produced with the survival package:
  #
  #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
  #'               x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
  #' library(survival)
  #' model <- survreg(Surv(time, status) ~ x + sex, rData)
  #' summary(model)
  #' predict(model, data)
  #
  # -- output of 'summary(model)'
  #
  #              Value Std. Error     z        p
  # (Intercept)  1.315      0.270  4.88 1.07e-06
  # x           -0.190      0.173 -1.10 2.72e-01
  # sex         -0.253      0.329 -0.77 4.42e-01
  # Log(scale)  -1.160      0.396 -2.93 3.41e-03
  #
  # -- output of 'predict(model, data)'
  #
  #        1        2        3        4        5        6        7
  # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
  #
  data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
               list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
  df <- createDataFrame(data, c("time", "status", "x", "sex"))
  model <- spark.survreg(df, Surv(time, status) ~ x + sex)
  summ <- summary(model)
  sparkCoefs <- as.vector(summ$coefficients[, 1])
  baseCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
  expect_equal(sparkCoefs, baseCoefs, tolerance = 1e-4)
  expect_true(all(
    rownames(summ$coefficients) ==
    c("(Intercept)", "x", "sex", "Log(scale)")))
  preds <- collect(select(predict(model, df), "prediction"))
  expect_equal(preds$prediction, c(3.724591, 2.545368, 3.079035, 3.079035,
                                   2.390146, 2.891269, 2.891269), tolerance = 1e-4)

  # Save/load round trip.
  modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp")
  write.ml(model, modelPath)
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  summ2 <- summary(model2)
  coefs2 <- as.vector(summ2$coefficients[, 1])
  expect_equal(sparkCoefs, coefs2)
  expect_equal(rownames(summ$coefficients), rownames(summ2$coefficients))

  unlink(modelPath)

  # survival::survreg must remain callable despite SparkR masking.
  if (requireNamespace("survival", quietly = TRUE)) {
    rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
                  x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
    expect_error(
      model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
      NA)
    expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
  }
})
|
2016-07-17 22:02:21 -04:00
|
|
|
|
2016-08-17 09:15:04 -04:00
|
|
|
test_that("spark.isotonicRegression", {
  # Fit an antitonic (isotonic = FALSE) regression on a small weighted dataset.
  trainData <- data.frame(label = c(7.0, 5.0, 3.0, 5.0, 1.0),
                          feature = c(0.0, 1.0, 2.0, 3.0, 4.0),
                          weight = c(1.0, 1.0, 1.0, 1.0, 1.0))
  trainDF <- createDataFrame(trainData)

  model <- spark.isoreg(trainDF, label ~ feature, isotonic = FALSE,
                        weightCol = "weight")
  # The formula may name only a single variable on its right hand side.
  expect_error(badFit <- spark.isoreg(trainDF, ~., isotonic = FALSE))
  modelSummary <- summary(model)
  expect_equal(modelSummary$predictions, list(7, 5, 4, 4, 1))

  # Predictions on unseen feature values (including values outside the
  # training range) should follow the fitted monotone curve.
  testPoints <- list(list(-2.0), list(-1.0), list(0.5),
                     list(0.75), list(1.0), list(2.0), list(9.0))
  testDF <- createDataFrame(testPoints, c("feature"))
  predicted <- collect(select(predict(model, testDF), "prediction"))
  expect_equal(predicted$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0))

  # A persisted model must round-trip through write.ml/read.ml unchanged:
  # writing to an existing path errors unless overwrite = TRUE.
  savePath <- tempfile(pattern = "spark-isotonicRegression", fileext = ".tmp")
  write.ml(model, savePath)
  expect_error(write.ml(model, savePath))
  write.ml(model, savePath, overwrite = TRUE)
  reloaded <- read.ml(savePath)
  expect_equal(modelSummary, summary(reloaded))

  unlink(savePath)
})
|
|
|
|
|
2016-10-26 19:12:55 -04:00
|
|
|
test_that("spark.logit", {
  # R code to reproduce the result.
  # nolint start
  #' library(glmnet)
  #' iris.x = as.matrix(iris[, 1:4])
  #' iris.y = as.factor(as.character(iris[, 5]))
  #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
  #' coef(logit)
  #
  # $setosa
  # 5 x 1 sparse Matrix of class "dgCMatrix"
  #                       s0
  #               1.0981324
  # Sepal.Length -0.2909860
  # Sepal.Width   0.5510907
  # Petal.Length -0.1915217
  # Petal.Width  -0.4211946
  #
  # $versicolor
  # 5 x 1 sparse Matrix of class "dgCMatrix"
  #                         s0
  #               1.520061e+00
  # Sepal.Length  2.524501e-02
  # Sepal.Width  -5.310313e-01
  # Petal.Length  3.656543e-02
  # Petal.Width  -3.144464e-05
  #
  # $virginica
  # 5 x 1 sparse Matrix of class "dgCMatrix"
  #                       s0
  #              -2.61819385
  # Sepal.Length  0.26574097
  # Sepal.Width  -0.02005932
  # Petal.Length  0.15495629
  # Petal.Width   0.42122607
  # nolint end

  # Test multinomial logistic regression against three classes
  df <- suppressWarnings(createDataFrame(iris))
  model <- spark.logit(df, Species ~ ., regParam = 0.5)
  summary <- summary(model)
  versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
  virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
  setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
  setosaCoefs <- unlist(summary$coefficients[, "setosa"])
  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
  # Fixed: previously compared setosaCoefs against itself (always TRUE),
  # leaving setosaCoefsR unused; compare against the glmnet reference values.
  expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1))

  # Test model save and load: writing to an existing path errors unless
  # overwrite = TRUE, and the reloaded model must have identical coefficients.
  modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
  write.ml(model, modelPath)
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  coefs <- summary(model)$coefficients
  coefs2 <- summary(model2)$coefficients
  expect_equal(coefs, coefs2)
  unlink(modelPath)

  # R code to reproduce the result.
  # nolint start
  #' library(glmnet)
  #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
  #' iris.x = as.matrix(iris2[, 1:4])
  #' iris.y = as.factor(as.character(iris2[, 5]))
  #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
  #' coef(logit)
  #
  # $versicolor
  # 5 x 1 sparse Matrix of class "dgCMatrix"
  #                       s0
  #               3.93844796
  # Sepal.Length -0.13538675
  # Sepal.Width  -0.02386443
  # Petal.Length -0.35076451
  # Petal.Width  -0.77971954
  #
  # $virginica
  # 5 x 1 sparse Matrix of class "dgCMatrix"
  #                       s0
  #              -3.93844796
  # Sepal.Length  0.13538675
  # Sepal.Width   0.02386443
  # Petal.Length  0.35076451
  # Petal.Width   0.77971954
  #
  #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
  #' coef(logit)
  #
  # 5 x 1 sparse Matrix of class "dgCMatrix"
  #                      s0
  # (Intercept)  -6.0824412
  # Sepal.Length  0.2458260
  # Sepal.Width   0.1642093
  # Petal.Length  0.4759487
  # Petal.Width   1.0383948
  # nolint end

  # Test multinomial logistic regression against two classes
  df <- suppressWarnings(createDataFrame(iris))
  training <- df[df$Species %in% c("versicolor", "virginica"), ]
  model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
  summary <- summary(model)
  versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
  virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))

  # Test binomial logistic regression against two classes
  model <- spark.logit(training, Species ~ ., regParam = 0.5)
  summary <- summary(model)
  coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
  coefs <- unlist(summary$coefficients[, "Estimate"])
  expect_true(all(abs(coefsR - coefs) < 0.1))

  # Test prediction with string label
  prediction <- predict(model, training)
  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
  expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
                "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
  expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)

  # Test prediction with numeric label
  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
  data <- as.data.frame(cbind(label, feature))
  df <- createDataFrame(data)
  model <- spark.logit(df, label ~ feature)
  prediction <- collect(select(predict(model, df), "prediction"))
  expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0"))
})
|
|
|
|
|
2016-08-17 14:18:33 -04:00
|
|
|
test_that("spark.gaussianMixture", {
  # Reference values computed in R with mixtools::mvnormalmixEM.
  # nolint start
  #' library(mvtnorm)
  #' set.seed(1)
  #' a <- rmvnorm(7, c(0, 0))
  #' b <- rmvnorm(8, c(10, 10))
  #' data <- rbind(a, b)
  #' model <- mvnormalmixEM(data, k = 2)
  #' model$lambda
  #
  #  [1] 0.4666667 0.5333333
  #
  #' model$mu
  #
  #  [1] 0.11731091 -0.06192351
  #  [1] 10.363673  9.897081
  #
  #' model$sigma
  #
  #  [[1]]
  #            [,1]       [,2]
  # [1,] 0.62049934 0.06880802
  # [2,] 0.06880802 1.27431874
  #
  #  [[2]]
  #           [,1]     [,2]
  # [1,] 0.2961543 0.160783
  # [2,] 0.1607830 1.008878
  # nolint end
  # Seven points drawn around (0, 0) followed by eight points around (10, 10).
  samples <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
                  list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
                  list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
                  list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
                  list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
                  list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
                  list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
                  list(9.5218499, 10.4179416))
  sampleDF <- createDataFrame(samples, c("x1", "x2"))
  model <- spark.gaussianMixture(sampleDF, ~ x1 + x2, k = 2)
  fitted <- summary(model)
  expectedLambda <- c(0.4666667, 0.5333333)
  expectedMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
  expectedSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
                     0.2961543, 0.160783, 0.1607830, 1.008878)
  expect_equal(fitted$lambda, expectedLambda, tolerance = 1e-3)
  expect_equal(unlist(fitted$mu), expectedMu, tolerance = 1e-3)
  expect_equal(unlist(fitted$sigma), expectedSigma, tolerance = 1e-3)
  clusterIds <- collect(select(predict(model, sampleDF), "prediction"))
  expect_equal(clusterIds$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))

  # Round-trip the model through write.ml/read.ml and compare summaries.
  savePath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
  write.ml(model, savePath)
  expect_error(write.ml(model, savePath))
  write.ml(model, savePath, overwrite = TRUE)
  restored <- read.ml(savePath)
  restoredSummary <- summary(restored)
  expect_equal(fitted$lambda, restoredSummary$lambda)
  expect_equal(unlist(fitted$mu), unlist(restoredSummary$mu))
  expect_equal(unlist(fitted$sigma), unlist(restoredSummary$sigma))

  unlink(savePath)
})
|
|
|
|
|
2016-08-18 08:33:52 -04:00
|
|
|
test_that("spark.lda with libsvm", {
  # Train an EM-based LDA model on the bundled libsvm sample corpus.
  corpus <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
  model <- spark.lda(corpus, optimizer = "em")

  # Summarize with the top-10 terms per topic.
  ldaSummary <- summary(model, 10)
  isDistributed <- ldaSummary$isDistributed
  logLikelihood <- ldaSummary$logLikelihood
  logPerplexity <- ldaSummary$logPerplexity
  vocabSize <- ldaSummary$vocabSize
  topics <- ldaSummary$topicTopTerms
  weights <- ldaSummary$topicTopTermsWeights
  vocabulary <- ldaSummary$vocabulary

  expect_false(isDistributed)
  expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
  expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
  expect_equal(vocabSize, 11)
  # libsvm input carries no term strings, so no vocabulary is recorded.
  expect_true(is.null(vocabulary))

  # Persist and reload, then check the summary statistics survive unchanged.
  savePath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
  write.ml(model, savePath)
  expect_error(write.ml(model, savePath))
  write.ml(model, savePath, overwrite = TRUE)
  reloaded <- read.ml(savePath)
  reloadedSummary <- summary(reloaded)

  expect_false(reloadedSummary$isDistributed)
  expect_equal(logLikelihood, reloadedSummary$logLikelihood)
  expect_equal(logPerplexity, reloadedSummary$logPerplexity)
  expect_equal(vocabSize, reloadedSummary$vocabSize)
  expect_equal(vocabulary, reloadedSummary$vocabulary)

  unlink(savePath)
})
|
|
|
|
|
|
|
|
test_that("spark.lda with text input", {
  # Train an online-optimizer LDA model on the plain-text sample corpus,
  # tokenizing the "value" column produced by read.text.
  corpus <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
  model <- spark.lda(corpus, optimizer = "online", features = "value")

  ldaSummary <- summary(model)
  isDistributed <- ldaSummary$isDistributed
  logLikelihood <- ldaSummary$logLikelihood
  logPerplexity <- ldaSummary$logPerplexity
  vocabSize <- ldaSummary$vocabSize
  topics <- ldaSummary$topicTopTerms
  weights <- ldaSummary$topicTopTermsWeights
  vocabulary <- ldaSummary$vocabulary

  expect_false(isDistributed)
  expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
  expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
  expect_equal(vocabSize, 10)
  # Text input yields an actual vocabulary: the ten digit tokens of the corpus.
  expect_true(setequal(ldaSummary$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))

  # Persist and reload, then check the summary statistics survive unchanged.
  savePath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
  write.ml(model, savePath)
  expect_error(write.ml(model, savePath))
  write.ml(model, savePath, overwrite = TRUE)
  reloaded <- read.ml(savePath)
  reloadedSummary <- summary(reloaded)

  expect_false(reloadedSummary$isDistributed)
  expect_equal(logLikelihood, reloadedSummary$logLikelihood)
  expect_equal(logPerplexity, reloadedSummary$logPerplexity)
  expect_equal(vocabSize, reloadedSummary$vocabSize)
  expect_true(all.equal(vocabulary, reloadedSummary$vocabulary))

  unlink(savePath)
})
|
|
|
|
|
|
|
|
test_that("spark.posterior and spark.perplexity", {
  corpus <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
  model <- spark.lda(corpus, features = "value", k = 3)

  # Perplexity computed on the training data must match the summary value.
  ldaSummary <- summary(model)
  trainPerplexity <- spark.perplexity(model, corpus)
  expect_equal(trainPerplexity, ldaSummary$logPerplexity)

  # Each document's posterior topic distribution must sum to one, so the
  # grand total equals the number of documents.
  posterior <- spark.posterior(model, corpus)
  topicDist <- collect(posterior)$topicDistribution
  expect_equal(length(topicDist), sum(unlist(topicDist)))
})
|
|
|
|
|
2016-08-19 17:24:09 -04:00
|
|
|
test_that("spark.als", {
  # Tiny (user, item, score) ratings matrix for ALS factorization.
  ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
                  list(2, 1, 1.0), list(2, 2, 5.0))
  ratingsDF <- createDataFrame(ratings, c("user", "item", "score"))
  model <- spark.als(ratingsDF, ratingCol = "score", userCol = "user", itemCol = "item",
                     rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
  alsSummary <- summary(model)
  expect_equal(alsSummary$rank, 10)
  unrated <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
  predicted <- collect(predict(model, unrated))

  # Fixed seed makes the factorization deterministic, so exact values
  # (up to tolerance) can be pinned.
  expect_equal(predicted$prediction, c(-0.1380762, 2.6258414, -1.5018409),
               tolerance = 1e-4)

  # Persist and reload; latent factors must survive the round trip.
  savePath <- tempfile(pattern = "spark-als", fileext = ".tmp")
  write.ml(model, savePath)
  expect_error(write.ml(model, savePath))
  write.ml(model, savePath, overwrite = TRUE)
  reloaded <- read.ml(savePath)
  reloadedSummary <- summary(reloaded)
  expect_equal(reloadedSummary$rating, "score")
  userFactors <- collect(alsSummary$userFactors)
  itemFactors <- collect(alsSummary$itemFactors)
  userFactors2 <- collect(reloadedSummary$userFactors)
  itemFactors2 <- collect(reloadedSummary$itemFactors)

  # Factor rows may come back in any order; sort by id before comparing.
  userOrder <- order(userFactors$id)
  userOrder2 <- order(userFactors2$id)
  expect_equal(userFactors$id[userOrder], userFactors2$id[userOrder2])
  expect_equal(userFactors$features[userOrder], userFactors2$features[userOrder2])

  itemOrder <- order(itemFactors$id)
  itemOrder2 <- order(itemFactors2$id)
  expect_equal(itemFactors$id[itemOrder], itemFactors2$id[itemOrder2])
  expect_equal(itemFactors$features[itemOrder], itemFactors2$features[itemOrder2])

  unlink(savePath)
})
|
|
|
|
|
2016-09-03 15:26:30 -04:00
|
|
|
test_that("spark.kstest", {
|
|
|
|
data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
|
|
|
|
df <- createDataFrame(data)
|
|
|
|
testResult <- spark.kstest(df, "test", "norm")
|
|
|
|
stats <- summary(testResult)
|
|
|
|
|
|
|
|
rStats <- ks.test(data$test, "pnorm", alternative = "two.sided")
|
|
|
|
|
|
|
|
expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
|
|
|
|
expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
|
[SPARK-17315][FOLLOW-UP][SPARKR][ML] Fix print of Kolmogorov-Smirnov test summary
## What changes were proposed in this pull request?
#14881 added Kolmogorov-Smirnov Test wrapper to SparkR. I found that ```print.summary.KSTest``` was implemented inappropriately and result in no effect.
Running the following code for KSTest:
```Scala
data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
df <- createDataFrame(data)
testResult <- spark.kstest(df, "test", "norm")
summary(testResult)
```
Before this PR:
![image](https://cloud.githubusercontent.com/assets/1962026/18615016/b9a2823a-7d4f-11e6-934b-128beade355e.png)
After this PR:
![image](https://cloud.githubusercontent.com/assets/1962026/18615014/aafe2798-7d4f-11e6-8b99-c705bb9fe8f2.png)
The new implementation is similar with [```print.summary.GeneralizedLinearRegressionModel```](https://github.com/apache/spark/blob/master/R/pkg/R/mllib.R#L284) of SparkR and [```print.summary.glm```](https://svn.r-project.org/R/trunk/src/library/stats/R/glm.R) of native R.
BTW, I removed the comparison of ```print.summary.KSTest``` in unit test, since it's only wrappers of the summary output which has been checked. Another reason is that these comparison will output summary information to the test console, it will make the test output in a mess.
## How was this patch tested?
Existing test.
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #15139 from yanboliang/spark-17315.
2016-09-21 23:14:18 -04:00
|
|
|
expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
|
2016-09-03 15:26:30 -04:00
|
|
|
|
|
|
|
testResult <- spark.kstest(df, "test", "norm", -0.5)
|
|
|
|
stats <- summary(testResult)
|
|
|
|
|
|
|
|
rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided")
|
|
|
|
|
|
|
|
expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
|
|
|
|
expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
|
[SPARK-17315][FOLLOW-UP][SPARKR][ML] Fix print of Kolmogorov-Smirnov test summary
## What changes were proposed in this pull request?
#14881 added a Kolmogorov-Smirnov test wrapper to SparkR. I found that ```print.summary.KSTest``` was implemented inappropriately and resulted in no effect.
Running the following code for KSTest:
```Scala
data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
df <- createDataFrame(data)
testResult <- spark.kstest(df, "test", "norm")
summary(testResult)
```
Before this PR:
![image](https://cloud.githubusercontent.com/assets/1962026/18615016/b9a2823a-7d4f-11e6-934b-128beade355e.png)
After this PR:
![image](https://cloud.githubusercontent.com/assets/1962026/18615014/aafe2798-7d4f-11e6-8b99-c705bb9fe8f2.png)
The new implementation is similar to [```print.summary.GeneralizedLinearRegressionModel```](https://github.com/apache/spark/blob/master/R/pkg/R/mllib.R#L284) in SparkR and [```print.summary.glm```](https://svn.r-project.org/R/trunk/src/library/stats/R/glm.R) in native R.
BTW, I removed the comparison of ```print.summary.KSTest``` in the unit test, since it is only a wrapper around the summary output, which has already been checked. Another reason is that these comparisons would print summary information to the test console, making the test output messy.
## How was this patch tested?
Existing test.
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #15139 from yanboliang/spark-17315.
2016-09-21 23:14:18 -04:00
|
|
|
expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
|
2016-09-03 15:26:30 -04:00
|
|
|
})
|
|
|
|
|
2016-11-13 23:25:12 -05:00
|
|
|
test_that("spark.randomForest", {
  # regression
  data <- suppressWarnings(createDataFrame(longley))
  # A single tree is deterministic, so exact predictions can be pinned
  model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
                              numTrees = 1)

  predictions <- collect(predict(model, data))
  expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
                                         63.221, 63.639, 64.989, 63.761,
                                         66.019, 67.857, 68.169, 66.513,
                                         68.655, 69.564, 69.331, 70.551),
               tolerance = 1e-4)

  stats <- summary(model)
  expect_equal(stats$numTrees, 1)
  # printing the summary must not error and must produce a multi-line report
  expect_error(capture.output(stats), NA)
  expect_true(length(capture.output(stats)) > 6)

  # seed is fixed so the 20-tree ensemble is reproducible
  model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
                              numTrees = 20, seed = 123)
  predictions <- collect(predict(model, data))
  expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
                                         63.53160, 64.05470, 65.12710, 64.30450,
                                         66.70910, 67.86125, 68.08700, 67.21865,
                                         68.89275, 69.53180, 69.39640, 69.68250),
               tolerance = 1e-4)
  stats <- summary(model)
  expect_equal(stats$numTrees, 20)

  # save/load round trip must preserve the model summary
  modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp")
  write.ml(model, modelPath)
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  stats2 <- summary(model2)
  expect_equal(stats$formula, stats2$formula)
  expect_equal(stats$numFeatures, stats2$numFeatures)
  expect_equal(stats$features, stats2$features)
  expect_equal(stats$featureImportances, stats2$featureImportances)
  expect_equal(stats$numTrees, stats2$numTrees)
  expect_equal(stats$treeWeights, stats2$treeWeights)

  unlink(modelPath)

  # classification
  data <- suppressWarnings(createDataFrame(iris))
  model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
                              maxDepth = 5, maxBins = 16)

  stats <- summary(model)
  expect_equal(stats$numFeatures, 2)
  expect_equal(stats$numTrees, 20)
  expect_error(capture.output(stats), NA)
  expect_true(length(capture.output(stats)) > 6)
  # Test string prediction values
  predictions <- collect(predict(model, data))$prediction
  expect_equal(length(grep("setosa", predictions)), 50)
  expect_equal(length(grep("versicolor", predictions)), 50)

  modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp")
  write.ml(model, modelPath)
  expect_error(write.ml(model, modelPath))
  write.ml(model, modelPath, overwrite = TRUE)
  model2 <- read.ml(modelPath)
  stats2 <- summary(model2)
  expect_equal(stats$depth, stats2$depth)
  expect_equal(stats$numNodes, stats2$numNodes)
  expect_equal(stats$numClasses, stats2$numClasses)

  unlink(modelPath)

  # Test numeric response variable
  labelToIndex <- function(species) {
    switch(as.character(species),
      setosa = 0.0,
      versicolor = 1.0,
      virginica = 2.0
    )
  }
  # Build the numeric label on a local copy so the shared `iris` dataset is
  # not masked/mutated in the global environment (that would leak into later
  # tests); vapply yields a plain numeric vector instead of lapply's list.
  irisNumeric <- iris
  irisNumeric$NumericSpecies <- vapply(iris$Species, labelToIndex, numeric(1))
  data <- suppressWarnings(createDataFrame(irisNumeric[-5]))
  model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
                              maxDepth = 5, maxBins = 16)
  stats <- summary(model)
  expect_equal(stats$numFeatures, 2)
  expect_equal(stats$numTrees, 20)
  # Test numeric prediction values
  predictions <- collect(predict(model, data))$prediction
  expect_equal(length(grep("1.0", predictions)), 50)
  expect_equal(length(grep("2.0", predictions)), 50)

  # spark.randomForest classification can work on libsvm data
  data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                  source = "libsvm")
  model <- spark.randomForest(data, label ~ features, "classification")
  expect_equal(summary(model)$numFeatures, 4)
})
|
|
|
|
|
2016-11-08 19:00:45 -05:00
|
|
|
test_that("spark.gbt", {
  # regression: fixed seed makes the boosted ensemble reproducible
  regDF <- suppressWarnings(createDataFrame(longley))
  regModel <- spark.gbt(regDF, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123)
  regPreds <- collect(predict(regModel, regDF))
  expect_equal(regPreds$prediction, c(60.323, 61.122, 60.171, 61.187,
                                      63.221, 63.639, 64.989, 63.761,
                                      66.019, 67.857, 68.169, 66.513,
                                      68.655, 69.564, 69.331, 70.551),
               tolerance = 1e-4)
  regStats <- summary(regModel)
  expect_equal(regStats$numTrees, 20)
  expect_equal(regStats$formula, "Employed ~ .")
  expect_equal(regStats$numFeatures, 6)
  expect_equal(length(regStats$treeWeights), 20)

  # save/load round trip must preserve the model summary
  regPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp")
  write.ml(regModel, regPath)
  expect_error(write.ml(regModel, regPath))
  write.ml(regModel, regPath, overwrite = TRUE)
  regModel2 <- read.ml(regPath)
  regStats2 <- summary(regModel2)
  expect_equal(regStats$formula, regStats2$formula)
  expect_equal(regStats$numFeatures, regStats2$numFeatures)
  expect_equal(regStats$features, regStats2$features)
  expect_equal(regStats$featureImportances, regStats2$featureImportances)
  expect_equal(regStats$numTrees, regStats2$numTrees)
  expect_equal(regStats$treeWeights, regStats2$treeWeights)

  unlink(regPath)

  # classification
  # label must be binary - GBTClassifier currently only supports binary classification.
  iris2 <- iris[iris$Species != "virginica", ]
  clsDF <- suppressWarnings(createDataFrame(iris2))
  clsModel <- spark.gbt(clsDF, Species ~ Petal_Length + Petal_Width, "classification")
  clsStats <- summary(clsModel)
  expect_equal(clsStats$numFeatures, 2)
  expect_equal(clsStats$numTrees, 20)
  # printing the summary must not error and must produce a multi-line report
  expect_error(capture.output(clsStats), NA)
  expect_true(length(capture.output(clsStats)) > 6)
  clsPreds <- collect(predict(clsModel, clsDF))$prediction
  # test string prediction values
  expect_equal(length(grep("setosa", clsPreds)), 50)
  expect_equal(length(grep("versicolor", clsPreds)), 50)

  clsPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp")
  write.ml(clsModel, clsPath)
  expect_error(write.ml(clsModel, clsPath))
  write.ml(clsModel, clsPath, overwrite = TRUE)
  clsModel2 <- read.ml(clsPath)
  clsStats2 <- summary(clsModel2)
  expect_equal(clsStats$depth, clsStats2$depth)
  expect_equal(clsStats$numNodes, clsStats2$numNodes)
  expect_equal(clsStats$numClasses, clsStats2$numClasses)

  unlink(clsPath)

  # a numeric 0/1 response works as a binary classification label
  iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
  numDF <- suppressWarnings(createDataFrame(iris2))
  numModel <- spark.gbt(numDF, NumericSpecies ~ ., type = "classification")
  numStats <- summary(numModel)
  # test numeric prediction values
  expect_equal(iris2$NumericSpecies, as.double(collect(predict(numModel, numDF))$prediction))
  expect_equal(numStats$numFeatures, 5)
  expect_equal(numStats$numTrees, 20)

  # spark.gbt classification can work on libsvm data
  libsvmDF <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
                      source = "libsvm")
  libsvmModel <- spark.gbt(libsvmDF, label ~ features, "classification")
  expect_equal(summary(libsvmModel)$numFeatures, 692)
})
|
|
|
|
|
2016-07-17 22:02:21 -04:00
|
|
|
# Tear down the shared Spark session started at the top of this test file
sparkR.session.stop()
|