[SPARK-20438][R] SparkR wrappers for split and repeat

## What changes were proposed in this pull request?

Add SparkR wrappers for the following functions in `org.apache.spark.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <zero323@users.noreply.github.com>

Closes #17729 from zero323/SPARK-20438.
This commit is contained in:
zero323 2017-04-24 10:56:57 -07:00 committed by Felix Cheung
parent 90264aced7
commit 8a272ddc9d
4 changed files with 102 additions and 0 deletions

View file

@ -300,6 +300,7 @@ exportMethods("%in%",
"rank", "rank",
"regexp_extract", "regexp_extract",
"regexp_replace", "regexp_replace",
"repeat_string",
"reverse", "reverse",
"rint", "rint",
"rlike", "rlike",
@ -323,6 +324,7 @@ exportMethods("%in%",
"sort_array", "sort_array",
"soundex", "soundex",
"spark_partition_id", "spark_partition_id",
"split_string",
"stddev", "stddev",
"stddev_pop", "stddev_pop",
"stddev_samp", "stddev_samp",

View file

@ -3745,3 +3745,61 @@ setMethod("collect_set",
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc) jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
column(jc) column(jc)
}) })
#' split_string
#'
#' Splits the strings of a column around matches of a regular expression.
#'
#' Equivalent to the \code{split} SQL function.
#'
#' @param x Column to compute on.
#' @param pattern Java regular expression used as the delimiter.
#'
#' @rdname split_string
#' @family string_funcs
#' @aliases split_string,Column-method
#' @export
#' @examples \dontrun{
#' df <- read.text("README.md")
#'
#' head(select(df, split_string(df$value, "\\s+")))
#'
#' # This is equivalent to the following SQL expression
#' head(selectExpr(df, "split(value, '\\\\s+')"))
#' }
#' @note split_string 2.3.0
setMethod(
  "split_string",
  signature(x = "Column", pattern = "character"),
  function(x, pattern) {
    # Call the static `split` on org.apache.spark.sql.functions with the
    # column's underlying reference and hand the result to column().
    column(
      callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern)
    )
  }
)
#' repeat_string
#'
#' Repeats the strings of a column n times.
#'
#' Equivalent to the \code{repeat} SQL function.
#'
#' @param x Column to compute on.
#' @param n Number of repetitions.
#'
#' @rdname repeat_string
#' @family string_funcs
#' @aliases repeat_string,Column-method
#' @export
#' @examples \dontrun{
#' df <- read.text("README.md")
#'
#' first(select(df, repeat_string(df$value, 3)))
#'
#' # This is equivalent to the following SQL expression
#' first(selectExpr(df, "repeat(value, 3)"))
#' }
#' @note repeat_string 2.3.0
setMethod(
  "repeat_string",
  signature(x = "Column", n = "numeric"),
  function(x, n) {
    # `n` is passed through numToInt() before it reaches the static `repeat`
    # on org.apache.spark.sql.functions; the result is handed to column().
    column(
      callJStatic("org.apache.spark.sql.functions", "repeat", x@jc, numToInt(n))
    )
  }
)

View file

@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
setGeneric("regexp_replace", setGeneric("regexp_replace",
function(x, pattern, replacement) { standardGeneric("regexp_replace") }) function(x, pattern, replacement) { standardGeneric("regexp_replace") })
# S4 generic for repeat_string; dispatches on the arguments `x` and `n`.
#' @rdname repeat_string
#' @export
setGeneric("repeat_string", function(x, n) {
  standardGeneric("repeat_string")
})
#' @rdname reverse #' @rdname reverse
#' @export #' @export
setGeneric("reverse", function(x) { standardGeneric("reverse") }) setGeneric("reverse", function(x) { standardGeneric("reverse") })
@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
#' @export #' @export
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") }) setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
# S4 generic for split_string; dispatches on the arguments `x` and `pattern`.
#' @rdname split_string
#' @export
setGeneric("split_string", function(x, pattern) {
  standardGeneric("split_string")
})
#' @rdname soundex #' @rdname soundex
#' @export #' @export
setGeneric("soundex", function(x) { standardGeneric("soundex") }) setGeneric("soundex", function(x) { standardGeneric("soundex") })

View file

@ -1546,6 +1546,40 @@ test_that("string operators", {
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b") expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d") expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d") expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
# Fixture for split_string: a single row whose string contains ".", "@",
# a space and a literal backslash, so each pattern below exercises a
# different delimiter.
l4 <- list(list(a = "a.b@c.d 1\\b"))
df4 <- createDataFrame(l4)
# Split on a run of whitespace.
expect_equal(
collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
list(list("a.b@c.d", "1\\b"))
)
# Escaped dot: split on literal "." characters, not on "any character".
expect_equal(
collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
list(list("a", "b@c", "d 1\\b"))
)
# Plain, non-special character as the delimiter.
expect_equal(
collect(select(df4, split_string(df4$a, "@")))[1, 1],
list(list("a.b", "c.d 1\\b"))
)
# "\\\\" is the R literal for the regex \\, i.e. split on the single
# literal backslash in the input.
expect_equal(
collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
list(list("a.b@c.d 1", "b"))
)
# Fixture for repeat_string: a single row holding the string "abc".
l5 <- list(list(a = "abc"))
df5 <- createDataFrame(l5)
# Integer count of one returns the input unchanged.
expect_equal(
collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
"abc"
)
# A double count is accepted as well and repeats the string three times.
expect_equal(
collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
"abcabcabc"
)
# A negative count is expected to yield an empty string.
expect_equal(
collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
""
)
}) })
test_that("date functions on a DataFrame", { test_that("date functions on a DataFrame", {