[SPARK-20438][R] SparkR wrappers for split and repeat
## What changes were proposed in this pull request?

Add wrappers for `o.a.s.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <zero323@users.noreply.github.com>

Closes #17729 from zero323/SPARK-20438.
This commit is contained in:
parent
90264aced7
commit
8a272ddc9d
|
@ -300,6 +300,7 @@ exportMethods("%in%",
|
|||
"rank",
|
||||
"regexp_extract",
|
||||
"regexp_replace",
|
||||
"repeat_string",
|
||||
"reverse",
|
||||
"rint",
|
||||
"rlike",
|
||||
|
@ -323,6 +324,7 @@ exportMethods("%in%",
|
|||
"sort_array",
|
||||
"soundex",
|
||||
"spark_partition_id",
|
||||
"split_string",
|
||||
"stddev",
|
||||
"stddev_pop",
|
||||
"stddev_samp",
|
||||
|
|
|
@ -3745,3 +3745,61 @@ setMethod("collect_set",
|
|||
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
|
||||
column(jc)
|
||||
})
|
||||
|
||||
#' split_string
|
||||
#'
|
||||
#' Splits a string on a regular expression.
|
||||
#'
|
||||
#' Equivalent to \code{split} SQL function.
|
||||
#'
|
||||
#' @param x Column to compute on
|
||||
#' @param pattern Java regular expression
|
||||
#'
|
||||
#' @rdname split_string
|
||||
#' @family string_funcs
|
||||
#' @aliases split_string,Column-method
|
||||
#' @export
|
||||
#' @examples \dontrun{
|
||||
#' df <- read.text("README.md")
|
||||
#'
|
||||
#' head(select(df, split_string(df$value, "\\s+")))
|
||||
#'
|
||||
#' # This is equivalent to the following SQL expression
|
||||
#' head(selectExpr(df, "split(value, '\\\\s+')"))
|
||||
#' }
|
||||
#' @note split_string 2.3.0
|
||||
setMethod("split_string",
          signature(x = "Column", pattern = "character"),
          function(x, pattern) {
            # Forward the column's Java handle and the pattern to the static
            # `split` in org.apache.spark.sql.functions, then wrap the result
            # with column().
            column(callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern))
          })
|
||||
|
||||
#' repeat_string
|
||||
#'
|
||||
#' Repeats a string n times.
|
||||
#'
|
||||
#' Equivalent to \code{repeat} SQL function.
|
||||
#'
|
||||
#' @param x Column to compute on
|
||||
#' @param n Number of repetitions
|
||||
#'
|
||||
#' @rdname repeat_string
|
||||
#' @family string_funcs
|
||||
#' @aliases repeat_string,Column-method
|
||||
#' @export
|
||||
#' @examples \dontrun{
|
||||
#' df <- read.text("README.md")
|
||||
#'
|
||||
#' first(select(df, repeat_string(df$value, 3)))
|
||||
#'
|
||||
#' # This is equivalent to the following SQL expression
|
||||
#' first(selectExpr(df, "repeat(value, 3)"))
|
||||
#' }
|
||||
#' @note repeat_string 2.3.0
|
||||
setMethod("repeat_string",
          signature(x = "Column", n = "numeric"),
          function(x, n) {
            # The repetition count goes through numToInt() before it is passed
            # to the JVM (presumably an integer coercion -- confirm against the
            # helper's definition).
            times <- numToInt(n)
            # Call the static `repeat` in org.apache.spark.sql.functions and
            # wrap the result with column().
            column(callJStatic("org.apache.spark.sql.functions", "repeat", x@jc, times))
          })
|
||||
|
|
|
@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
|
|||
setGeneric("regexp_replace",
|
||||
function(x, pattern, replacement) { standardGeneric("regexp_replace") })
|
||||
|
||||
#' @rdname repeat_string
|
||||
#' @export
|
||||
setGeneric("repeat_string",
           function(x, n) {
             # S4 generic with formals (x, n); methods are attached with
             # setMethod().
             standardGeneric("repeat_string")
           })
|
||||
|
||||
#' @rdname reverse
|
||||
#' @export
|
||||
setGeneric("reverse", function(x) { standardGeneric("reverse") })
|
||||
|
@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
|
|||
#' @export
|
||||
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
|
||||
|
||||
#' @rdname split_string
|
||||
#' @export
|
||||
setGeneric("split_string",
           function(x, pattern) {
             # S4 generic with formals (x, pattern); methods are attached with
             # setMethod().
             standardGeneric("split_string")
           })
|
||||
|
||||
#' @rdname soundex
|
||||
#' @export
|
||||
setGeneric("soundex", function(x) { standardGeneric("soundex") })
|
||||
|
|
|
@ -1546,6 +1546,40 @@ test_that("string operators", {
|
|||
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
|
||||
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
|
||||
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
|
||||
|
||||
l4 <- list(list(a = "a.b@c.d 1\\b"))
|
||||
df4 <- createDataFrame(l4)
|
||||
expect_equal(
|
||||
collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
|
||||
list(list("a.b@c.d", "1\\b"))
|
||||
)
|
||||
expect_equal(
|
||||
collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
|
||||
list(list("a", "b@c", "d 1\\b"))
|
||||
)
|
||||
expect_equal(
|
||||
collect(select(df4, split_string(df4$a, "@")))[1, 1],
|
||||
list(list("a.b", "c.d 1\\b"))
|
||||
)
|
||||
expect_equal(
|
||||
collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
|
||||
list(list("a.b@c.d 1", "b"))
|
||||
)
|
||||
|
||||
l5 <- list(list(a = "abc"))
|
||||
df5 <- createDataFrame(l5)
|
||||
expect_equal(
|
||||
collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
|
||||
"abc"
|
||||
)
|
||||
expect_equal(
|
||||
collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
|
||||
"abcabcabc"
|
||||
)
|
||||
expect_equal(
|
||||
collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
|
||||
""
|
||||
)
|
||||
})
|
||||
|
||||
test_that("date functions on a DataFrame", {
|
||||
|
|
Loading…
Reference in a new issue