448ff333fa
Changes include
1. Rename sortDF to arrange
2. Add new aliases `group_by` and `sample_frac`, `summarize`
3. Add more user friendly column addition (mutate), rename
4. Support mean as an alias for avg in Scala and also support n_distinct, n as in dplyr
Using these changes we can pretty much run the examples as described in http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html with the same syntax
The only thing missing in SparkR is auto-resolving column names when used in an expression, i.e. making something like `select(flights, delay)` work as it does in dplyr; right now we need `select(flights, flights$delay)` or `select(flights, "delay")`. But this is a complicated change and I'll file a new issue for it
cc sun-rui rxin
Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Closes #6005 from shivaram/sparkr-df-api and squashes the following commits:
5e0716a [Shivaram Venkataraman] Fix some roxygen bugs
1254953 [Shivaram Venkataraman] Merge branch 'master' of https://github.com/apache/spark into sparkr-df-api
0521149 [Shivaram Venkataraman] Changes to make SparkR DataFrame dplyr friendly. Changes include 1. Rename sortDF to arrange 2. Add new aliases `group_by` and `sample_frac`, `summarize` 3. Add more user friendly column addition (mutate), rename 4. Support mean as an alias for avg in Scala and also support n_distinct, n as in dplyr
(cherry picked from commit 0a901dd3a1)
Signed-off-by: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
118 lines
2.6 KiB
Plaintext
# Imports from base R
importFrom(methods, setGeneric, setMethod, setOldClass)

# Native routines registered via the SparkR shared library
useDynLib(SparkR, stringHashCode)
# S3 methods exported
export("sparkR.init")
export("sparkR.stop")
export("print.jobj")
exportClasses("DataFrame")

# Generics available on DataFrame objects
exportMethods("arrange",
              "cache",
              "collect",
              "columns",
              "count",
              "describe",
              "distinct",
              "dtypes",
              "except",
              "explain",
              "filter",
              "first",
              "group_by",
              "groupBy",
              "head",
              "insertInto",
              "intersect",
              "isLocal",
              "join",
              "limit",
              "orderBy",
              "mutate",
              "names",
              "persist",
              "printSchema",
              "registerTempTable",
              "rename",
              "repartition",
              "sampleDF",
              "sample_frac",
              "saveAsParquetFile",
              "saveAsTable",
              "saveDF",
              "schema",
              "select",
              "selectExpr",
              "show",
              "showDF",
              "summarize",
              "take",
              "unionAll",
              "unpersist",
              "where",
              "withColumn",
              "withColumnRenamed")
exportClasses("Column")

# Generics available on Column objects
exportMethods("abs",
              "alias",
              "approxCountDistinct",
              "asc",
              "avg",
              "cast",
              "contains",
              "countDistinct",
              "desc",
              "endsWith",
              "getField",
              "getItem",
              "isNotNull",
              "isNull",
              "last",
              "like",
              "lower",
              "max",
              "mean",
              "min",
              "n",
              "n_distinct",
              "rlike",
              "sqrt",
              "startsWith",
              "substr",
              "sum",
              "sumDistinct",
              "upper")
exportClasses("GroupedData")
exportMethods("agg")
# SQL / Hive context constructors
export("sparkRSQL.init",
       "sparkRHive.init")
# Functions operating on an SQLContext
export("cacheTable",
       "clearCache",
       "createDataFrame",
       "createExternalTable",
       "dropTempTable",
       "jsonFile",
       "loadDF",
       "parquetFile",
       "sql",
       "table",
       "tableNames",
       "tables",
       "uncacheTable")
# Schema construction helpers
export("structField",
       "structField.jobj",
       "structField.character",
       "print.structField",
       "structType",
       "structType.jobj",
       "structType.structField",
       "print.structType")