[SPARK-20438][R] SparkR wrappers for split and repeat

## What changes were proposed in this pull request?

Add wrappers for `o.a.s.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <zero323@users.noreply.github.com>

Closes #17729 from zero323/SPARK-20438.
2017-04-24 10:56:57 -07:00 · 2017-04-24 10:56:57 -07:00 · 8a272ddc9d
parent 90264aced7
commit 8a272ddc9d
4 changed files with 102 additions and 0 deletions
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@ -300,6 +300,7 @@ exportMethods("%in%",
              "rank",
              "regexp_extract",
              "regexp_replace",
              "repeat_string",
              "reverse",
              "rint",
              "rlike",
@ -323,6 +324,7 @@ exportMethods("%in%",
              "sort_array",
              "soundex",
              "spark_partition_id",
              "split_string",
              "stddev",
              "stddev_pop",
              "stddev_samp",
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@ -3745,3 +3745,61 @@ setMethod("collect_set",
            jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
            column(jc)
          })
 #' split_string
 #'
 #' Splits the string values of a Column around matches of a regular expression.
 #'
 #' Equivalent to the \code{split} SQL function.
 #'
 #' @param x Column to compute on
 #' @param pattern Java regular expression, supplied as a character value
 #' @return A Column wrapping the result of the JVM \code{split} function.
 #'
 #' @rdname split_string
 #' @family string_funcs
 #' @aliases split_string,Column-method
 #' @export
 #' @examples \dontrun{
 #' df <- read.text("README.md")
 #'
 #' head(select(df, split_string(df$value, "\\s+")))
 #'
 #' # This is equivalent to the following SQL expression
 #' head(selectExpr(df, "split(value, '\\\\s+')"))
 #' }
 #' @note split_string 2.3.0
 setMethod("split_string",
           signature(x = "Column", pattern = "character"),
           function(x, pattern) {
             # Delegate to the static JVM function and wrap the returned
             # Java column reference back into an R Column.
             sqlFunctions <- "org.apache.spark.sql.functions"
             column(callJStatic(sqlFunctions, "split", x@jc, pattern))
           })
 #' repeat_string
 #'
 #' Repeats the string values of a Column \code{n} times.
 #'
 #' Equivalent to the \code{repeat} SQL function.
 #'
 #' @param x Column to compute on
 #' @param n Number of repetitions, supplied as a numeric value
 #' @return A Column wrapping the result of the JVM \code{repeat} function.
 #'
 #' @rdname repeat_string
 #' @family string_funcs
 #' @aliases repeat_string,Column-method
 #' @export
 #' @examples \dontrun{
 #' df <- read.text("README.md")
 #'
 #' first(select(df, repeat_string(df$value, 3)))
 #'
 #' # This is equivalent to the following SQL expression
 #' first(selectExpr(df, "repeat(value, 3)"))
 #' }
 #' @note repeat_string 2.3.0
 setMethod("repeat_string",
           signature(x = "Column", n = "numeric"),
           function(x, n) {
             # n is passed through numToInt() before it crosses into the JVM.
             repetitions <- numToInt(n)
             sqlFunctions <- "org.apache.spark.sql.functions"
             column(callJStatic(sqlFunctions, "repeat", x@jc, repetitions))
           })
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
 setGeneric("regexp_replace",
           function(x, pattern, replacement) { standardGeneric("regexp_replace") })
 # S4 generic for repeat_string; dispatches on the arguments x and n.
 #' @rdname repeat_string
 #' @export
 setGeneric("repeat_string",
            function(x, n) {
              standardGeneric("repeat_string")
            })
 # S4 generic for reverse; dispatches on the single argument x.
 #' @rdname reverse
 #' @export
 setGeneric("reverse",
            function(x) {
              standardGeneric("reverse")
            })
@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
 #' @export
 setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
 # S4 generic for split_string; dispatches on the arguments x and pattern.
 #' @rdname split_string
 #' @export
 setGeneric("split_string",
            function(x, pattern) {
              standardGeneric("split_string")
            })
 # S4 generic for soundex; dispatches on the single argument x.
 #' @rdname soundex
 #' @export
 setGeneric("soundex",
            function(x) {
              standardGeneric("soundex")
            })
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@ -1546,6 +1546,40 @@ test_that("string operators", {
  expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
  expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
  expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
  l4 <- list(list(a = "a.b@c.d   1\\b"))
  df4 <- createDataFrame(l4)
  expect_equal(
    collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
    list(list("a.b@c.d", "1\\b"))
  )
  expect_equal(
    collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
    list(list("a", "b@c", "d   1\\b"))
  )
  expect_equal(
    collect(select(df4, split_string(df4$a, "@")))[1, 1],
    list(list("a.b", "c.d   1\\b"))
  )
  expect_equal(
    collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
    list(list("a.b@c.d   1", "b"))
  )
  l5 <- list(list(a = "abc"))
  df5 <- createDataFrame(l5)
  expect_equal(
    collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
    "abc"
  )
  expect_equal(
    collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
    "abcabcabc"
  )
  expect_equal(
    collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
    ""
  )
 })
 test_that("date functions on a DataFrame", {