[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.
Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241
sum() has already been implemented (https://github.com/amplab-extras/SparkR-pkg/pull/242).
Phase 1 is now done: mean, sd and var have been implemented, but some things still need to be improved following the suggestions in https://issues.apache.org/jira/browse/SPARK-6841
Author: qhuang <qian.huang@intel.com>
Closes #5446 from hqzizania/R and squashes the following commits:
f283572 [qhuang] add test unit for describe()
2e74d5a [qhuang] add describe() DataFrame API
This commit is contained in:
parent
51b3d41e16
commit
a466944399
|
@ -13,6 +13,7 @@ exportMethods("cache",
|
|||
"collect",
|
||||
"columns",
|
||||
"count",
|
||||
"describe",
|
||||
"distinct",
|
||||
"dtypes",
|
||||
"except",
|
||||
|
|
|
@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
|
|||
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
|
||||
})
|
||||
|
||||
#' describe
#'
#' Computes statistics for the given columns of a DataFrame.
#' If no columns are given, every column of \code{x} is passed on to be
#' described.
#'
#' @param x A DataFrame whose columns are to be described.
#' @param col A character string naming the first column to describe.
#' @param ... Further column names; they are appended after \code{col}.
#' @return A DataFrame holding the computed statistics.
#' @rdname describe
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' path <- "path/to/file.json"
#' df <- jsonFile(sqlCtx, path)
#' describe(df)
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
setMethod("describe",
          signature(x = "DataFrame", col = "character"),
          function(x, col, ...) {
            # Gather the mandatory first column and any extra ones into a
            # single list before converting it for the JVM-side call.
            requested <- c(list(col), list(...))
            jdf <- callJMethod(x@sdf, "describe", listToSeq(requested))
            dataFrame(jdf)
          })
|
||||
|
||||
#' @rdname describe
setMethod("describe",
          # `col` is declared as "missing" explicitly: the method drops `col`
          # from its formals, so leaving it out of the signature makes R
          # expand the signature implicitly (with a Note) when the method is
          # registered.
          signature(x = "DataFrame", col = "missing"),
          function(x) {
            # No column names were supplied, so describe every column of x.
            allCols <- as.list(columns(x))
            sdf <- callJMethod(x@sdf, "describe", listToSeq(allCols))
            dataFrame(sdf)
          })
|
||||
|
|
|
@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
|
|||
#' @export
setGeneric("columns", function(x) {
  standardGeneric("columns")
})
|
||||
|
||||
#' @rdname describe
#' @export
setGeneric("describe", function(x, col, ...) {
  standardGeneric("describe")
})
|
||||
|
||||
#' @rdname schema
#' @export
setGeneric("dtypes", function(x) {
  standardGeneric("dtypes")
})
|
||||
|
|
|
@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
|
|||
expect_true(count(parquetDF) == count(df)*2)
|
||||
})
|
||||
|
||||
test_that("describe() on a DataFrame", {
  df <- jsonFile(sqlCtx, jsonPath)

  # Statistics for a single, explicitly named column. The result is
  # collected once and reused instead of calling collect() per expectation.
  ageStats <- collect(describe(df, "age"))
  expect_true(ageStats[1, "summary"] == "count")
  expect_true(ageStats[2, "age"] == 24.5)
  expect_true(ageStats[3, "age"] == 5.5)

  # Without column names, every column of df is described.
  allStats <- collect(describe(df))
  expect_true(allStats[4, "name"] == "Andy")
  expect_true(allStats[5, "age"] == 30.0)
})
|
||||
|
||||
# Remove the temporary fixtures used by the tests above.
# NOTE(review): parquetPath is presumably a directory written by Spark;
# unlink() leaves directories in place unless recursive = TRUE, so the flag
# is set here. Confirm against where parquetPath is created.
unlink(parquetPath, recursive = TRUE)
unlink(jsonPath)
|
||||
|
|
Loading…
Reference in a new issue