[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.

Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241 sum() has been implemented. (https://github.com/amplab-extras/SparkR-pkg/pull/242) Now Phase 1: mean, sd, var have been implemented, but some things still need to be improved with the suggestions in https://issues.apache.org/jira/browse/SPARK-6841 Author: qhuang <qian.huang@intel.com> Closes #5446 from hqzizania/R and squashes the following commits: f283572 [qhuang] add test unit for describe() 2e74d5a [qhuang] add describe() DataFrame API
2015-05-05 20:39:56 -07:00 · 2015-05-05 20:39:56 -07:00 · a466944399
parent 51b3d41e16
commit a466944399
4 changed files with 53 additions and 0 deletions
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@ -13,6 +13,7 @@ exportMethods("cache",
              "collect",
              "columns",
              "count",
+              "describe",
              "distinct",
              "dtypes",
              "except",
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
            callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
          })

+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all numerical columns.
+#'
+#' @param x A DataFrame to be computed.
+#' @param col A string of name
+#' @param ... Additional expressions
+#' @return A DataFrame
+#' @rdname describe 
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+          signature(x = "DataFrame", col = "character"),
+          function(x, col, ...) {
+            colList <- list(col, ...)
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
+
+#' @rdname describe
+setMethod("describe",
+          signature(x = "DataFrame"),
+          function(x) {
+            colList <- as.list(c(columns(x)))
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
 #' @export
 setGeneric("columns", function(x) {standardGeneric("columns") })

+#' @rdname describe
+#' @export
+setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
  expect_true(count(parquetDF) == count(df)*2)
 })

+test_that("describe() on a DataFrame", {
+  df <- jsonFile(sqlCtx, jsonPath)
+  stats <- describe(df, "age")
+  expect_true(collect(stats)[1, "summary"] == "count")
+  expect_true(collect(stats)[2, "age"] == 24.5)
+  expect_true(collect(stats)[3, "age"] == 5.5)
+  stats <- describe(df)
+  expect_true(collect(stats)[4, "name"] == "Andy")
+  expect_true(collect(stats)[5, "age"] == 30.0)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)