[SPARK-18285][SPARKR] SparkR approxQuantile supports multiple input columns
## What changes were proposed in this pull request? SparkR ```approxQuantile``` now accepts multiple input columns. ## How was this patch tested? Unit test. Author: Yanbo Liang <ybliang8@gmail.com> Closes #16951 from yanboliang/spark-19619.
This commit is contained in:
parent
1a3f5f8c55
commit
b406598382
|
@ -66,7 +66,7 @@ setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("fre
|
||||||
# @rdname approxQuantile
|
# @rdname approxQuantile
|
||||||
# @export
|
# @export
|
||||||
setGeneric("approxQuantile",
|
setGeneric("approxQuantile",
|
||||||
function(x, col, probabilities, relativeError) {
|
function(x, cols, probabilities, relativeError) {
|
||||||
standardGeneric("approxQuantile")
|
standardGeneric("approxQuantile")
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -138,9 +138,9 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
|
||||||
collect(dataFrame(sct))
|
collect(dataFrame(sct))
|
||||||
})
|
})
|
||||||
|
|
||||||
#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame
|
#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame
|
||||||
#'
|
#'
|
||||||
#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame.
|
#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame.
|
||||||
#' The result of this algorithm has the following deterministic bound:
|
#' The result of this algorithm has the following deterministic bound:
|
||||||
#' If the SparkDataFrame has N elements and if we request the quantile at probability p up to
|
#' If the SparkDataFrame has N elements and if we request the quantile at probability p up to
|
||||||
#' error err, then the algorithm will return a sample x from the SparkDataFrame so that the
|
#' error err, then the algorithm will return a sample x from the SparkDataFrame so that the
|
||||||
|
@ -149,15 +149,19 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
|
||||||
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
|
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
|
||||||
#' optimizations). The algorithm was first presented in [[http://dx.doi.org/10.1145/375663.375670
|
#' optimizations). The algorithm was first presented in [[http://dx.doi.org/10.1145/375663.375670
|
||||||
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
|
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
|
||||||
|
#' Note that rows containing any NA values will be removed before calculation.
|
||||||
#'
|
#'
|
||||||
#' @param x A SparkDataFrame.
|
#' @param x A SparkDataFrame.
|
||||||
#' @param col The name of the numerical column.
|
#' @param cols A single column name, or a list of names for multiple columns.
|
||||||
#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
|
#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
|
||||||
#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
|
#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
|
||||||
#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
|
#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
|
||||||
#' the exact quantiles are computed, which could be very expensive.
|
#' the exact quantiles are computed, which could be very expensive.
|
||||||
#' Note that values greater than 1 are accepted but give the same result as 1.
|
#' Note that values greater than 1 are accepted but give the same result as 1.
|
||||||
#' @return The approximate quantiles at the given probabilities.
|
#' @return The approximate quantiles at the given probabilities. If the input is a single column name,
|
||||||
|
#' the output is a list of approximate quantiles in that column; if the input is
|
||||||
|
#' multiple column names, the output is a list, and each element in it is a list of
|
||||||
|
#' numeric values which represent the approximate quantiles in the corresponding column.
|
||||||
#'
|
#'
|
||||||
#' @rdname approxQuantile
|
#' @rdname approxQuantile
|
||||||
#' @name approxQuantile
|
#' @name approxQuantile
|
||||||
|
@ -171,12 +175,17 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
|
||||||
#' }
|
#' }
|
||||||
#' @note approxQuantile since 2.0.0
|
#' @note approxQuantile since 2.0.0
|
||||||
setMethod("approxQuantile",
|
setMethod("approxQuantile",
|
||||||
signature(x = "SparkDataFrame", col = "character",
|
signature(x = "SparkDataFrame", cols = "character",
|
||||||
probabilities = "numeric", relativeError = "numeric"),
|
probabilities = "numeric", relativeError = "numeric"),
|
||||||
function(x, col, probabilities, relativeError) {
|
function(x, cols, probabilities, relativeError) {
|
||||||
statFunctions <- callJMethod(x@sdf, "stat")
|
statFunctions <- callJMethod(x@sdf, "stat")
|
||||||
callJMethod(statFunctions, "approxQuantile", col,
|
quantiles <- callJMethod(statFunctions, "approxQuantile", as.list(cols),
|
||||||
as.list(probabilities), relativeError)
|
as.list(probabilities), relativeError)
|
||||||
|
if (length(cols) == 1) {
|
||||||
|
quantiles[[1]]
|
||||||
|
} else {
|
||||||
|
quantiles
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
#' Returns a stratified sample without replacement
|
#' Returns a stratified sample without replacement
|
||||||
|
|
|
@ -2222,11 +2222,19 @@ test_that("sampleBy() on a DataFrame", {
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("approxQuantile() on a DataFrame", {
|
test_that("approxQuantile() on a DataFrame", {
|
||||||
l <- lapply(c(0:99), function(i) { i })
|
l <- lapply(c(0:99), function(i) { list(i, 99 - i) })
|
||||||
df <- createDataFrame(l, "key")
|
df <- createDataFrame(l, list("a", "b"))
|
||||||
quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
|
quantiles <- approxQuantile(df, "a", c(0.5, 0.8), 0.0)
|
||||||
expect_equal(quantiles[[1]], 50)
|
expect_equal(quantiles, list(50, 80))
|
||||||
expect_equal(quantiles[[2]], 80)
|
quantiles2 <- approxQuantile(df, c("a", "b"), c(0.5, 0.8), 0.0)
|
||||||
|
expect_equal(quantiles2[[1]], list(50, 80))
|
||||||
|
expect_equal(quantiles2[[2]], list(50, 80))
|
||||||
|
|
||||||
|
dfWithNA <- createDataFrame(data.frame(a = c(NA, 30, 19, 11, 28, 15),
|
||||||
|
b = c(-30, -19, NA, -11, -28, -15)))
|
||||||
|
quantiles3 <- approxQuantile(dfWithNA, c("a", "b"), c(0.5), 0.0)
|
||||||
|
expect_equal(quantiles3[[1]], list(28))
|
||||||
|
expect_equal(quantiles3[[2]], list(-15))
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("SQL error message is returned from JVM", {
|
test_that("SQL error message is returned from JVM", {
|
||||||
|
|
Loading…
Reference in a new issue