[SPARK-20438][R] SparkR wrappers for split and repeat
## What changes were proposed in this pull request?

Add wrappers for `o.a.s.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <zero323@users.noreply.github.com>

Closes #17729 from zero323/SPARK-20438.
This commit is contained in:
parent
90264aced7
commit
8a272ddc9d
|
@ -300,6 +300,7 @@ exportMethods("%in%",
|
||||||
"rank",
|
"rank",
|
||||||
"regexp_extract",
|
"regexp_extract",
|
||||||
"regexp_replace",
|
"regexp_replace",
|
||||||
|
"repeat_string",
|
||||||
"reverse",
|
"reverse",
|
||||||
"rint",
|
"rint",
|
||||||
"rlike",
|
"rlike",
|
||||||
|
@ -323,6 +324,7 @@ exportMethods("%in%",
|
||||||
"sort_array",
|
"sort_array",
|
||||||
"soundex",
|
"soundex",
|
||||||
"spark_partition_id",
|
"spark_partition_id",
|
||||||
|
"split_string",
|
||||||
"stddev",
|
"stddev",
|
||||||
"stddev_pop",
|
"stddev_pop",
|
||||||
"stddev_samp",
|
"stddev_samp",
|
||||||
|
|
|
@ -3745,3 +3745,61 @@ setMethod("collect_set",
|
||||||
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
|
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
|
||||||
column(jc)
|
column(jc)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
#' split_string
|
||||||
|
#'
|
||||||
|
#' Splits a string on a regular expression.
|
||||||
|
#'
|
||||||
|
#' Equivalent to the \code{split} SQL function.
|
||||||
|
#'
|
||||||
|
#' @param x Column to compute on
|
||||||
|
#' @param pattern Java regular expression to split on, given as a character string
|
||||||
|
#'
|
||||||
|
#' @rdname split_string
|
||||||
|
#' @family string_funcs
|
||||||
|
#' @aliases split_string,Column-method
|
||||||
|
#' @export
|
||||||
|
#' @examples \dontrun{
|
||||||
|
#' df <- read.text("README.md")
|
||||||
|
#'
|
||||||
|
#' head(select(df, split_string(df$value, "\\s+")))
|
||||||
|
#'
|
||||||
|
#' # This is equivalent to the following SQL expression
|
||||||
|
#' head(selectExpr(df, "split(value, '\\\\s+')"))
|
||||||
|
#' }
|
||||||
|
#' @note split_string 2.3.0
|
||||||
|
setMethod("split_string",
|
||||||
|
signature(x = "Column", pattern = "character"),
|
||||||
|
function(x, pattern) {
|
||||||
|
# Call org.apache.spark.sql.functions.split through callJStatic, passing the
# column's Java reference (x@jc) and the pattern string unchanged.
jc <- callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern)
|
||||||
|
# Hand the returned reference to column(); presumably this wraps it as a
# SparkR Column -- confirm against column()'s definition.
column(jc)
|
||||||
|
})
|
||||||
|
|
||||||
|
#' repeat_string
|
||||||
|
#'
|
||||||
|
#' Repeats a string \code{n} times.
|
||||||
|
#'
|
||||||
|
#' Equivalent to the \code{repeat} SQL function.
|
||||||
|
#'
|
||||||
|
#' @param x Column to compute on
|
||||||
|
#' @param n Number of repetitions, given as a numeric value
|
||||||
|
#'
|
||||||
|
#' @rdname repeat_string
|
||||||
|
#' @family string_funcs
|
||||||
|
#' @aliases repeat_string,Column-method
|
||||||
|
#' @export
|
||||||
|
#' @examples \dontrun{
|
||||||
|
#' df <- read.text("README.md")
|
||||||
|
#'
|
||||||
|
#' first(select(df, repeat_string(df$value, 3)))
|
||||||
|
#'
|
||||||
|
#' # This is equivalent to the following SQL expression
|
||||||
|
#' first(selectExpr(df, "repeat(value, 3)"))
|
||||||
|
#' }
|
||||||
|
#' @note repeat_string 2.3.0
|
||||||
|
setMethod("repeat_string",
|
||||||
|
signature(x = "Column", n = "numeric"),
|
||||||
|
function(x, n) {
|
||||||
|
# n is passed through numToInt() before the call to
# org.apache.spark.sql.functions.repeat -- presumably because the JVM method
# expects an integer count (TODO confirm against numToInt's definition).
jc <- callJStatic("org.apache.spark.sql.functions", "repeat", x@jc, numToInt(n))
|
||||||
|
# Hand the returned reference to column(); presumably this wraps it as a
# SparkR Column -- confirm against column()'s definition.
column(jc)
|
||||||
|
})
|
||||||
|
|
|
@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
|
||||||
setGeneric("regexp_replace",
|
setGeneric("regexp_replace",
|
||||||
function(x, pattern, replacement) { standardGeneric("regexp_replace") })
|
function(x, pattern, replacement) { standardGeneric("regexp_replace") })
|
||||||
|
|
||||||
|
#' @rdname repeat_string
|
||||||
|
#' @export
|
||||||
|
# Declares the S4 generic repeat_string(x, n); the body only forwards to standardGeneric().
setGeneric("repeat_string", function(x, n) { standardGeneric("repeat_string") })
|
||||||
|
|
||||||
#' @rdname reverse
|
#' @rdname reverse
|
||||||
#' @export
|
#' @export
|
||||||
setGeneric("reverse", function(x) { standardGeneric("reverse") })
|
setGeneric("reverse", function(x) { standardGeneric("reverse") })
|
||||||
|
@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
|
||||||
#' @export
|
#' @export
|
||||||
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
|
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
|
||||||
|
|
||||||
|
#' @rdname split_string
|
||||||
|
#' @export
|
||||||
|
# Declares the S4 generic split_string(x, pattern); the body only forwards to standardGeneric().
setGeneric("split_string", function(x, pattern) { standardGeneric("split_string") })
|
||||||
|
|
||||||
#' @rdname soundex
|
#' @rdname soundex
|
||||||
#' @export
|
#' @export
|
||||||
setGeneric("soundex", function(x) { standardGeneric("soundex") })
|
setGeneric("soundex", function(x) { standardGeneric("soundex") })
|
||||||
|
|
|
@ -1546,6 +1546,40 @@ test_that("string operators", {
|
||||||
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
|
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
|
||||||
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
|
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
|
||||||
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
|
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
|
||||||
|
|
||||||
|
l4 <- list(list(a = "a.b@c.d 1\\b"))
|
||||||
|
df4 <- createDataFrame(l4)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
|
||||||
|
list(list("a.b@c.d", "1\\b"))
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
|
||||||
|
list(list("a", "b@c", "d 1\\b"))
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df4, split_string(df4$a, "@")))[1, 1],
|
||||||
|
list(list("a.b", "c.d 1\\b"))
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
|
||||||
|
list(list("a.b@c.d 1", "b"))
|
||||||
|
)
|
||||||
|
|
||||||
|
l5 <- list(list(a = "abc"))
|
||||||
|
df5 <- createDataFrame(l5)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
|
||||||
|
"abc"
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
|
||||||
|
"abcabcabc"
|
||||||
|
)
|
||||||
|
expect_equal(
|
||||||
|
collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
|
||||||
|
""
|
||||||
|
)
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("date functions on a DataFrame", {
|
test_that("date functions on a DataFrame", {
|
||||||
|
|
Loading…
Reference in a new issue