[SPARK-8364] [SPARKR] Add crosstab to SparkR DataFrames
Add `crosstab` to SparkR DataFrames, which takes two column names and returns a local R data.frame. This is similar to `table` in R. However, `table` in SparkR is used for loading SQL tables as DataFrames. The return type is data.frame instead of table for `crosstab` to be compatible with Scala/Python. I couldn't run R tests successfully on my local machine. Many unit tests failed. So let's try Jenkins. Author: Xiangrui Meng <meng@databricks.com> Closes #7318 from mengxr/SPARK-8364 and squashes the following commits: d75e894 [Xiangrui Meng] fix tests 53f6ddd [Xiangrui Meng] fix tests f1348d6 [Xiangrui Meng] update test 47cb088 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-8364 5621262 [Xiangrui Meng] first version without test
This commit is contained in:
parent
b217230f2a
commit
2f5cbd860e
|
@ -26,6 +26,7 @@ exportMethods("arrange",
|
|||
"collect",
|
||||
"columns",
|
||||
"count",
|
||||
"crosstab",
|
||||
"describe",
|
||||
"distinct",
|
||||
"dropna",
|
||||
|
|
|
@ -1554,3 +1554,31 @@ setMethod("fillna",
|
|||
}
|
||||
dataFrame(sdf)
|
||||
})
|
||||
|
||||
#' crosstab
|
||||
#'
|
||||
#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
|
||||
#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
|
||||
#' non-zero pair frequencies will be returned.
|
||||
#'
|
||||
#' @param col1 name of the first column. Distinct items will make the first item of each row.
|
||||
#' @param col2 name of the second column. Distinct items will make the column names of the output.
|
||||
#' @return a local R data.frame representing the contingency table. The first column of each row
|
||||
#' will be the distinct values of `col1` and the column names will be the distinct values
|
||||
#' of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
|
||||
#' occurrences will have zero as their counts.
|
||||
#'
|
||||
#' @rdname statfunctions
|
||||
#' @export
|
||||
#' @examples
|
||||
#' \dontrun{
|
||||
#' df <- jsonFile(sqlCtx, "/path/to/file.json")
|
||||
#' ct <- crosstab(df, "title", "gender")
|
||||
#' }
|
||||
setMethod("crosstab",
          signature(x = "DataFrame", col1 = "character", col2 = "character"),
          function(x, col1, col2) {
            # Invoke `stat` on the object held in x@sdf, then `crosstab` on what it
            # returns, forwarding the two column names unchanged.
            jTable <- callJMethod(callJMethod(x@sdf, "stat"), "crosstab", col1, col2)
            # Wrap the result with dataFrame() and collect() it, which is what is
            # handed back to the caller.
            collect(dataFrame(jTable))
          })
|
||||
|
|
|
@ -59,6 +59,10 @@ setGeneric("count", function(x) { standardGeneric("count") })
|
|||
# @export
|
||||
# S4 generic with a single argument `x`; dispatches via standardGeneric().
setGeneric("countByValue",
           function(x) {
             standardGeneric("countByValue")
           })
|
||||
|
||||
# @rdname statfunctions
|
||||
# @export
|
||||
# S4 generic taking `x`, `col1` and `col2`; dispatches via standardGeneric().
setGeneric("crosstab",
           function(x, col1, col2) {
             standardGeneric("crosstab")
           })
|
||||
|
||||
# @rdname distinct
|
||||
# @export
|
||||
# S4 generic taking `x` and `numPartitions` (default 1); dispatches via
# standardGeneric().
setGeneric("distinct",
           function(x, numPartitions = 1) {
             standardGeneric("distinct")
           })
|
||||
|
|
|
@ -987,6 +987,19 @@ test_that("fillna() on a DataFrame", {
|
|||
expect_identical(expected, actual)
|
||||
})
|
||||
|
||||
test_that("crosstab() on a DataFrame", {
  # Inputs 0..3 map to the pairs (a0, b0), (a1, b1), (a2, b0), (a0, b1),
  # so the pair (a1, b0) and the pair (a2, b1) never occur.
  pairs <- lapply(parallelize(sc, 0:3), function(x) {
    list(paste0("a", x %% 3), paste0("b", x %% 2))
  })
  df <- toDF(pairs, list("a", "b"))

  # Sort by the key column and reset row names so the comparison below does
  # not depend on the row order of the returned data.frame.
  ct <- crosstab(df, "a", "b")
  ordered <- ct[order(ct[["a_b"]]), ]
  row.names(ordered) <- NULL

  expected <- data.frame(
    "a_b" = c("a0", "a1", "a2"),
    "b0" = c(1, 0, 1),
    "b1" = c(1, 1, 0),
    stringsAsFactors = FALSE,
    row.names = NULL
  )
  expect_identical(expected, ordered)
})
|
||||
|
||||
# Remove the temporary files used by the tests in one vectorised call.
# NOTE(review): unlink() leaves directories in place unless recursive = TRUE;
# confirm that none of these paths refers to a directory.
unlink(c(parquetPath, jsonPath, jsonPathNa))
|
||||
|
|
Loading…
Reference in a new issue