spark-instrumented-optimizer/R/pkg/inst/tests/testthat/test_mllib.R

215 lines
8.1 KiB
R
Raw Normal View History

[SPARK-9201] [ML] Initial integration of MLlib + SparkR using RFormula This exposes the SparkR:::glm() and SparkR:::predict() APIs. It was necessary to change RFormula to silently drop the label column if it was missing from the input dataset, which is kind of a hack but necessary to integrate with the Pipeline API. The umbrella design doc for MLlib + SparkR integration can be viewed here: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit mengxr Author: Eric Liang <ekl@databricks.com> Closes #7483 from ericl/spark-8774 and squashes the following commits: 3dfac0c [Eric Liang] update 17ef516 [Eric Liang] more comments 1753a0f [Eric Liang] make glm generic b0f50f8 [Eric Liang] equivalence test 550d56d [Eric Liang] export methods c015697 [Eric Liang] second pass 117949a [Eric Liang] comments 5afbc67 [Eric Liang] test label columns 6b7f15f [Eric Liang] Fri Jul 17 14:20:22 PDT 2015 3a63ae5 [Eric Liang] Fri Jul 17 13:41:52 PDT 2015 ce61367 [Eric Liang] Fri Jul 17 13:41:17 PDT 2015 0299c59 [Eric Liang] Fri Jul 17 13:40:32 PDT 2015 e37603f [Eric Liang] Fri Jul 17 12:15:03 PDT 2015 d417d0c [Eric Liang] Merge remote-tracking branch 'upstream/master' into spark-8774 29a2ce7 [Eric Liang] Merge branch 'spark-8774-1' into spark-8774 d1959d2 [Eric Liang] clarify comment 2db68aa [Eric Liang] second round of comments dc3c943 [Eric Liang] address comments 5765ec6 [Eric Liang] fix style checks 1f361b0 [Eric Liang] doc d33211b [Eric Liang] r support fb0826b [Eric Liang] [SPARK-8774] Add R model formula with basic support as a transformer
2015-07-20 23:49:38 -04:00
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
library(testthat)
context("MLlib functions")
# Tests for MLlib functions in SparkR
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
test_that("formula of glm", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
# dot minus and intercept vs native glm
model <- glm(Sepal_Width ~ . - Species + 0, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
# feature interaction vs native glm
model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
[SPARK-9201] [ML] Initial integration of MLlib + SparkR using RFormula This exposes the SparkR:::glm() and SparkR:::predict() APIs. It was necessary to change RFormula to silently drop the label column if it was missing from the input dataset, which is kind of a hack but necessary to integrate with the Pipeline API. The umbrella design doc for MLlib + SparkR integration can be viewed here: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit mengxr Author: Eric Liang <ekl@databricks.com> Closes #7483 from ericl/spark-8774 and squashes the following commits: 3dfac0c [Eric Liang] update 17ef516 [Eric Liang] more comments 1753a0f [Eric Liang] make glm generic b0f50f8 [Eric Liang] equivalence test 550d56d [Eric Liang] export methods c015697 [Eric Liang] second pass 117949a [Eric Liang] comments 5afbc67 [Eric Liang] test label columns 6b7f15f [Eric Liang] Fri Jul 17 14:20:22 PDT 2015 3a63ae5 [Eric Liang] Fri Jul 17 13:41:52 PDT 2015 ce61367 [Eric Liang] Fri Jul 17 13:41:17 PDT 2015 0299c59 [Eric Liang] Fri Jul 17 13:40:32 PDT 2015 e37603f [Eric Liang] Fri Jul 17 12:15:03 PDT 2015 d417d0c [Eric Liang] Merge remote-tracking branch 'upstream/master' into spark-8774 29a2ce7 [Eric Liang] Merge branch 'spark-8774-1' into spark-8774 d1959d2 [Eric Liang] clarify comment 2db68aa [Eric Liang] second round of comments dc3c943 [Eric Liang] address comments 5765ec6 [Eric Liang] fix style checks 1f361b0 [Eric Liang] doc d33211b [Eric Liang] r support fb0826b [Eric Liang] [SPARK-8774] Add R model formula with basic support as a transformer
2015-07-20 23:49:38 -04:00
# glm should work with long formula
training <- suppressWarnings(createDataFrame(sqlContext, iris))
training$LongLongLongLongLongName <- training$Sepal_Width
training$VeryLongLongLongLonLongName <- training$Sepal_Length
training$AnotherLongLongLongLongName <- training$Species
model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})
test_that("glm and predict", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
# gaussian family
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
[SPARK-9391] [ML] Support minus, dot, and intercept operators in SparkR RFormula Adds '.', '-', and intercept parsing to RFormula. Also splits RFormulaParser into a separate file. Umbrella design doc here: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit?usp=sharing mengxr Author: Eric Liang <ekl@databricks.com> Closes #7707 from ericl/string-features-2 and squashes the following commits: 8588625 [Eric Liang] exclude complex types for . 8106ffe [Eric Liang] comments a9350bb [Eric Liang] s/var/val 9c50d4d [Eric Liang] Merge branch 'string-features' into string-features-2 581afb2 [Eric Liang] Merge branch 'master' into string-features 08ae539 [Eric Liang] Merge branch 'string-features' into string-features-2 f99131a [Eric Liang] comments cecec43 [Eric Liang] Merge branch 'string-features' into string-features-2 0bf3c26 [Eric Liang] update docs 4592df2 [Eric Liang] intercept supports 7412a2e [Eric Liang] Fri Jul 24 14:56:51 PDT 2015 3cf848e [Eric Liang] fix the parser 0556c2b [Eric Liang] Merge branch 'string-features' into string-features-2 c302a2c [Eric Liang] fix tests 9d1ac82 [Eric Liang] Merge remote-tracking branch 'upstream/master' into string-features e713da3 [Eric Liang] comments cd231a9 [Eric Liang] Wed Jul 22 17:18:44 PDT 2015 4d79193 [Eric Liang] revert to seq + distinct 169a085 [Eric Liang] tweak functional test a230a47 [Eric Liang] Merge branch 'master' into string-features 72bd6f3 [Eric Liang] fix merge d841cec [Eric Liang] Merge branch 'master' into string-features 5b2c4a2 [Eric Liang] Mon Jul 20 18:45:33 PDT 2015 b01c7c5 [Eric Liang] add test 8a637db [Eric Liang] encoder wip a1d03f4 [Eric Liang] refactor into estimator
2015-07-28 17:16:57 -04:00
# poisson family
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
family = poisson(link = identity))
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
data = iris, family = poisson(link = identity)), iris))
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
# Test stats::predict is working
x <- rnorm(15)
y <- x + rnorm(15)
expect_equal(length(predict(lm(y ~ x))), 15)
})
test_that("kmeans", {
newIris <- iris
newIris$Species <- NULL
training <- suppressWarnings(createDataFrame(sqlContext, newIris))
# Cache the DataFrame here to work around the bug SPARK-13178.
cache(training)
take(training, 1)
model <- kmeans(x = training, centers = 2)
sample <- take(select(predict(model, training), "prediction"), 1)
expect_equal(typeof(sample$prediction), "integer")
expect_equal(sample$prediction, 1)
# Test stats::kmeans is working
statsModel <- kmeans(x = newIris, centers = 2)
expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
# Test fitted works on KMeans
fitted.model <- fitted(model)
expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
# Test summary works on KMeans
summary.model <- summary(model)
cluster <- summary.model$cluster
expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
})
test_that("naiveBayes", {
# R code to reproduce the result.
# We do not support instance weights yet. So we ignore the frequencies.
#
#' library(e1071)
#' t <- as.data.frame(Titanic)
#' t1 <- t[t$Freq > 0, -5]
#' m <- naiveBayes(Survived ~ ., data = t1)
#' m
#' predict(m, t1)
#
# -- output of 'm'
#
# A-priori probabilities:
# Y
# No Yes
# 0.4166667 0.5833333
#
# Conditional probabilities:
# Class
# Y 1st 2nd 3rd Crew
# No 0.2000000 0.2000000 0.4000000 0.2000000
# Yes 0.2857143 0.2857143 0.2857143 0.1428571
#
# Sex
# Y Male Female
# No 0.5 0.5
# Yes 0.5 0.5
#
# Age
# Y Child Adult
# No 0.2000000 0.8000000
# Yes 0.4285714 0.5714286
#
# -- output of 'predict(m, t1)'
#
# Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No
#
t <- as.data.frame(Titanic)
t1 <- t[t$Freq > 0, -5]
df <- suppressWarnings(createDataFrame(sqlContext, t1))
m <- naiveBayes(Survived ~ ., data = df)
s <- summary(m)
expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
expect_equal(sum(s$apriori), 1)
expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
p <- collect(select(predict(m, df), "prediction"))
expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
"Yes", "Yes", "No", "No"))
# Test e1071::naiveBayes
if (requireNamespace("e1071", quietly = TRUE)) {
expect_that(m <- e1071::naiveBayes(Survived ~ ., data = t1), not(throws_error()))
expect_equal(as.character(predict(m, t1[1, ])), "Yes")
}
})
test_that("survreg", {
# R code to reproduce the result.
#
#' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
#' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
#' library(survival)
#' model <- survreg(Surv(time, status) ~ x + sex, rData)
#' summary(model)
#' predict(model, data)
#
# -- output of 'summary(model)'
#
# Value Std. Error z p
# (Intercept) 1.315 0.270 4.88 1.07e-06
# x -0.190 0.173 -1.10 2.72e-01
# sex -0.253 0.329 -0.77 4.42e-01
# Log(scale) -1.160 0.396 -2.93 3.41e-03
#
# -- output of 'predict(model, data)'
#
# 1 2 3 4 5 6 7
# 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
#
data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
df <- createDataFrame(sqlContext, data, c("time", "status", "x", "sex"))
model <- survreg(Surv(time, status) ~ x + sex, df)
stats <- summary(model)
coefs <- as.vector(stats$coefficients[, 1])
rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
expect_equal(coefs, rCoefs, tolerance = 1e-4)
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "x", "sex", "Log(scale)")))
p <- collect(select(predict(model, df), "prediction"))
expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035,
2.390146, 2.891269, 2.891269), tolerance = 1e-4)
# Test survival::survreg
if (requireNamespace("survival", quietly = TRUE)) {
rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
expect_that(
model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
not(throws_error()))
expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
}
})