spark-instrumented-optimizer/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
HyukjinKwon 0f48aafab8 [SPARK-29339][R] Support Arrow 0.14 in vectorized dapply and gapply (test it in AppVeyor build)
### What changes were proposed in this pull request?

This PR proposes:

1. Use `is.data.frame` to check whether a deserialized result is a data.frame (see the sketch below).
2. Install Arrow and test Arrow optimization in the AppVeyor build, since this is currently not tested in CI.
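
For context, the check in (1) looks roughly like the sketch below. `readArrowBatch` is a hypothetical helper used only for illustration, not SparkR's actual internal function; the point is that newer Arrow versions may hand back a tibble, which subclasses `data.frame`, so `is.data.frame()` is the robust test:

```r
# Hedged sketch only: 'readArrowBatch' is a hypothetical name, and the
# surrounding worker code in SparkR differs.
batch <- readArrowBatch(conn)

# A tibble is also a data.frame, so is.data.frame() accepts both, while an
# exact class comparison such as identical(class(batch), "data.frame")
# would reject it.
stopifnot(is.data.frame(batch))
```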

### Why are the changes needed?

1. To support SparkR with Arrow 0.14.
2. To check for regressions and verify that the Arrow optimization works correctly in CI.

### Does this PR introduce any user-facing change?

Yes. The example below previously failed with Arrow 0.14 and now returns the expected result:

```r
df <- createDataFrame(mtcars)
collect(dapply(df, function(rdf) { data.frame(rdf$gear + 1) }, structType("gear double")))
```

**Before:**

```
Error in readBin(con, raw(), as.integer(dataLen), endian = "big") :
  invalid 'n' argument
```

**After:**

```
   gear
1     5
2     5
3     5
4     4
5     4
6     4
7     4
8     5
9     5
...
```

### How was this patch tested?

AppVeyor
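
For reference, "AppVeyor" here means the Windows CI build now installs Arrow before running the SparkR tests; without it, every test in this file skips via `skip_if_not_installed("arrow")`. A minimal sketch of the idea in R (the actual AppVeyor configuration in this PR may differ):

```r
# Hedged sketch: install Arrow so skip_if_not_installed("arrow") does not
# skip the tests; the real CI step and repository options may differ.
if (!requireNamespace("arrow", quietly = TRUE)) {
  install.packages("arrow", repos = "https://cloud.r-project.org")
}
```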

Closes #25993 from HyukjinKwon/arrow-r-appveyor.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2019-10-04 08:56:45 +09:00

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
library(testthat)

context("SparkSQL Arrow optimization")

sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
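
# Every test below follows the same pattern: save the current value of
# spark.sql.execution.arrow.sparkr.enabled, toggle it for the scenario under
# test, and restore it in a `finally` block so that a failing expectation
# cannot leak the setting into later tests.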
test_that("createDataFrame/collect Arrow optimization", {
skip_if_not_installed("arrow")
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
expected <- collect(createDataFrame(mtcars))
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
expect_equal(collect(createDataFrame(mtcars)), expected)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("createDataFrame/collect Arrow optimization - many partitions (partition order test)", {
skip_if_not_installed("arrow")
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
expect_equal(collect(createDataFrame(mtcars, numPartitions = 32)),
collect(createDataFrame(mtcars, numPartitions = 1)))
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("createDataFrame/collect Arrow optimization - type specification", {
skip_if_not_installed("arrow")
rdf <- data.frame(list(list(a = 1,
b = "a",
c = TRUE,
d = 1.1,
e = 1L,
f = as.Date("1990-02-24"),
g = as.POSIXct("1990-02-24 12:34:56"))))
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
conf <- callJMethod(sparkSession, "conf")
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
expected <- collect(createDataFrame(rdf))
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
expect_equal(collect(createDataFrame(rdf)), expected)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("dapply() Arrow optimization", {
skip_if_not_installed("arrow")
df <- createDataFrame(mtcars)
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
ret <- dapply(df,
function(rdf) {
stopifnot(is.data.frame(rdf))
rdf
},
schema(df))
expected <- collect(ret)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- dapply(df,
function(rdf) {
stopifnot(is.data.frame(rdf))
# mtcars' hp is more then 50.
stopifnot(all(rdf$hp > 50))
rdf
},
schema(df))
actual <- collect(ret)
expect_equal(actual, expected)
expect_equal(count(ret), nrow(mtcars))
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("dapply() Arrow optimization - type specification", {
skip_if_not_installed("arrow")
# Note that regular dapply() seems not supporting date and timestamps
# whereas Arrow-optimized dapply() does.
rdf <- data.frame(list(list(a = 1,
b = "a",
c = TRUE,
d = 1.1,
e = 1L)))
# numPartitions are set to 8 intentionally to test empty partitions as well.
df <- createDataFrame(rdf, numPartitions = 8)
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
ret <- dapply(df, function(rdf) { rdf }, schema(df))
expected <- collect(ret)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- dapply(df, function(rdf) { rdf }, schema(df))
actual <- collect(ret)
expect_equal(actual, expected)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("dapply() Arrow optimization - type specification (date and timestamp)", {
skip_if_not_installed("arrow")
rdf <- data.frame(list(list(a = as.Date("1990-02-24"),
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- dapply(df, function(rdf) { rdf }, schema(df))
expect_equal(collect(ret), rdf)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("gapply() Arrow optimization", {
skip_if_not_installed("arrow")
df <- createDataFrame(mtcars)
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
ret <- gapply(df,
"gear",
function(key, grouped) {
if (length(key) > 0) {
stopifnot(is.numeric(key[[1]]))
}
stopifnot(is.data.frame(grouped))
grouped
},
schema(df))
expected <- collect(ret)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- gapply(df,
"gear",
function(key, grouped) {
if (length(key) > 0) {
stopifnot(is.numeric(key[[1]]))
}
stopifnot(is.data.frame(grouped))
stopifnot(length(colnames(grouped)) == 11)
# mtcars' hp is more then 50.
stopifnot(all(grouped$hp > 50))
grouped
},
schema(df))
actual <- collect(ret)
expect_equal(actual, expected)
expect_equal(count(ret), nrow(mtcars))
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("gapply() Arrow optimization - type specification", {
skip_if_not_installed("arrow")
# Note that regular gapply() seems not supporting date and timestamps
# whereas Arrow-optimized gapply() does.
rdf <- data.frame(list(list(a = 1,
b = "a",
c = TRUE,
d = 1.1,
e = 1L)))
df <- createDataFrame(rdf)
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false")
tryCatch({
ret <- gapply(df,
"a",
function(key, grouped) { grouped }, schema(df))
expected <- collect(ret)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- gapply(df,
"a",
function(key, grouped) { grouped }, schema(df))
actual <- collect(ret)
expect_equal(actual, expected)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("gapply() Arrow optimization - type specification (date and timestamp)", {
skip_if_not_installed("arrow")
rdf <- data.frame(list(list(a = as.Date("1990-02-24"),
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
ret <- gapply(df,
"a",
function(key, grouped) { grouped }, schema(df))
expect_equal(collect(ret), rdf)
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
test_that("Arrow optimization - unsupported types", {
skip_if_not_installed("arrow")
conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
expect_error(checkSchemaInArrow(structType("a FLOAT")), "not support float type")
expect_error(checkSchemaInArrow(structType("a BINARY")), "not support binary type")
expect_error(checkSchemaInArrow(structType("a ARRAY<INT>")), "not support array type")
expect_error(checkSchemaInArrow(structType("a MAP<INT, INT>")), "not support map type")
expect_error(checkSchemaInArrow(structType("a STRUCT<a: INT>")),
"not support nested struct type")
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})
sparkR.session.stop()