[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.

Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241
sum() has been implemented. (https://github.com/amplab-extras/SparkR-pkg/pull/242)

Now Phase 1: mean, sd, var have been implemented, but some things still need to be improved with the suggestions in https://issues.apache.org/jira/browse/SPARK-6841

Author: qhuang <qian.huang@intel.com>

Closes #5446 from hqzizania/R and squashes the following commits:

f283572 [qhuang] add test unit for describe()
2e74d5a [qhuang] add describe() DataFrame API
This commit is contained in:
qhuang 2015-05-05 20:39:56 -07:00 committed by Reynold Xin
parent 51b3d41e16
commit a466944399
4 changed files with 53 additions and 0 deletions

View file

@ -13,6 +13,7 @@ exportMethods("cache",
"collect",
"columns",
"count",
"describe",
"distinct",
"dtypes",
"except",

View file

@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
})
#' describe
#'
#' Computes statistics for numeric columns.
#' If no columns are given, this function computes statistics for all numerical columns.
#'
#' @param x A DataFrame to be computed.
#' @param col A string of name
#' @param ... Additional expressions
#' @return A DataFrame
#' @rdname describe
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' path <- "path/to/file.json"
#' df <- jsonFile(sqlCtx, path)
#' describe(df)
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
setMethod("describe",
signature(x = "DataFrame", col = "character"),
function(x, col, ...) {
colList <- list(col, ...)
sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
dataFrame(sdf)
})
#' @rdname describe
setMethod("describe",
signature(x = "DataFrame"),
function(x) {
colList <- as.list(c(columns(x)))
sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
dataFrame(sdf)
})

View file

@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
#' @export
setGeneric("columns", function(x) {standardGeneric("columns") })
#' @rdname describe
#' @export
setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
#' @rdname schema
#' @export
setGeneric("dtypes", function(x) { standardGeneric("dtypes") })

View file

@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
expect_true(count(parquetDF) == count(df)*2)
})
test_that("describe() on a DataFrame", {
df <- jsonFile(sqlCtx, jsonPath)
stats <- describe(df, "age")
expect_true(collect(stats)[1, "summary"] == "count")
expect_true(collect(stats)[2, "age"] == 24.5)
expect_true(collect(stats)[3, "age"] == 5.5)
stats <- describe(df)
expect_true(collect(stats)[4, "name"] == "Andy")
expect_true(collect(stats)[5, "age"] == 30.0)
})
unlink(parquetPath)
unlink(jsonPath)