[SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR
## What changes were proposed in this pull request?
This PR adds a varargs-type `dropDuplicates` function to SparkR for API parity. See also https://issues.apache.org/jira/browse/SPARK-15807.
## How was this patch tested?
Passes the Jenkins tests with new test cases.
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #13684 from dongjoon-hyun/SPARK-15908.
This commit is contained in:
parent
5fd20b66ff
commit
513a03e41e
|
@ -1936,10 +1936,11 @@ setMethod("where",
|
|||
#' the subset of columns.
|
||||
#'
|
||||
#' @param x A SparkDataFrame.
|
||||
#' @param colnames A character vector of column names.
|
||||
#' @param ... Either a single character vector of column names, or column names given as separate strings.
|
||||
#' If the first argument is a character vector with more than one element, the following arguments are ignored.
|
||||
#' @return A SparkDataFrame with duplicate rows removed.
|
||||
#' @family SparkDataFrame functions
|
||||
#' @rdname dropduplicates
|
||||
#' @rdname dropDuplicates
|
||||
#' @name dropDuplicates
|
||||
#' @export
|
||||
#' @examples
|
||||
|
@ -1949,14 +1950,26 @@ setMethod("where",
|
|||
#' path <- "path/to/file.json"
|
||||
#' df <- read.json(path)
|
||||
#' dropDuplicates(df)
|
||||
#' dropDuplicates(df, "col1", "col2")
|
||||
#' dropDuplicates(df, c("col1", "col2"))
|
||||
#' }
|
||||
setMethod("dropDuplicates",
|
||||
signature(x = "SparkDataFrame"),
|
||||
function(x, colNames = columns(x)) {
|
||||
stopifnot(class(colNames) == "character")
|
||||
|
||||
sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(colNames))
|
||||
function(x, ...) {
|
||||
cols <- list(...)
|
||||
if (length(cols) == 0) {
|
||||
sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(columns(x)))
|
||||
} else {
|
||||
if (!all(sapply(cols, function(c) { is.character(c) }))) {
|
||||
stop("all columns names should be characters")
|
||||
}
|
||||
col <- cols[[1]]
|
||||
if (length(col) > 1) {
|
||||
sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(col))
|
||||
} else {
|
||||
sdf <- callJMethod(x@sdf, "dropDuplicates", cols)
|
||||
}
|
||||
}
|
||||
dataFrame(sdf)
|
||||
})
|
||||
|
||||
|
|
|
@ -466,12 +466,9 @@ setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
|
|||
#' @export
|
||||
setGeneric("drop", function(x, ...) { standardGeneric("drop") })
|
||||
|
||||
#' @rdname dropduplicates
|
||||
#' @rdname dropDuplicates
|
||||
#' @export
|
||||
setGeneric("dropDuplicates",
|
||||
function(x, colNames = columns(x)) {
|
||||
standardGeneric("dropDuplicates")
|
||||
})
|
||||
setGeneric("dropDuplicates", function(x, ...) { standardGeneric("dropDuplicates") })
|
||||
|
||||
#' @rdname nafunctions
|
||||
#' @export
|
||||
|
|
|
@ -796,6 +796,14 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
|
|||
result[order(result$key, result$value1, result$value2), ],
|
||||
expected)
|
||||
|
||||
result <- collect(dropDuplicates(df, "key", "value1"))
|
||||
expected <- rbind.data.frame(
|
||||
c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
|
||||
names(expected) <- c("key", "value1", "value2")
|
||||
expect_equivalent(
|
||||
result[order(result$key, result$value1, result$value2), ],
|
||||
expected)
|
||||
|
||||
result <- collect(dropDuplicates(df, "key"))
|
||||
expected <- rbind.data.frame(
|
||||
c(1, 1, 1), c(2, 1, 2))
|
||||
|
|
Loading…
Reference in a new issue