diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 36d7a9b38b..5a5dc20ff3 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -420,13 +420,11 @@ export("as.DataFrame",
        "currentDatabase",
        "dropTempTable",
        "dropTempView",
-       "jsonFile",
        "listColumns",
        "listDatabases",
        "listFunctions",
        "listTables",
        "loadDF",
-       "parquetFile",
        "read.df",
        "read.jdbc",
        "read.json",
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index c819a7d14a..3f89ee99e2 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -343,7 +343,6 @@ setMethod("toDF", signature(x = "RDD"),
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
 #' df <- read.json(path, multiLine = TRUE)
-#' df <- jsonFile(path)
 #' }
 #' @name read.json
 #' @method read.json default
@@ -363,51 +362,6 @@ read.json <- function(x, ...) {
   dispatchFunc("read.json(path)", x, ...)
 }
 
-#' @rdname read.json
-#' @name jsonFile
-#' @method jsonFile default
-#' @note jsonFile since 1.4.0
-jsonFile.default <- function(path) {
-  .Deprecated("read.json")
-  read.json(path)
-}
-
-jsonFile <- function(x, ...) {
-  dispatchFunc("jsonFile(path)", x, ...)
-}
-
-#' JSON RDD
-#'
-#' Loads an RDD storing one JSON object per string as a SparkDataFrame.
-#'
-#' @param sqlContext SQLContext to use
-#' @param rdd An RDD of JSON string
-#' @param schema A StructType object to use as schema
-#' @param samplingRatio The ratio of simpling used to infer the schema
-#' @return A SparkDataFrame
-#' @noRd
-#' @examples
-#'\dontrun{
-#' sparkR.session()
-#' rdd <- texFile(sc, "path/to/json")
-#' df <- jsonRDD(sqlContext, rdd)
-#'}
-
-# TODO: remove - this method is no longer exported
-# TODO: support schema
-jsonRDD <- function(sqlContext, rdd, schema = NULL, samplingRatio = 1.0) {
-  .Deprecated("read.json")
-  rdd <- serializeToString(rdd)
-  if (is.null(schema)) {
-    read <- callJMethod(sqlContext, "read")
-    # samplingRatio is deprecated
-    sdf <- callJMethod(read, "json", callJMethod(getJRDD(rdd), "rdd"))
-    dataFrame(sdf)
-  } else {
-    stop("not implemented")
-  }
-}
-
 #' Create a SparkDataFrame from an ORC file.
 #'
 #' Loads an ORC file, returning the result as a SparkDataFrame.
@@ -434,6 +388,7 @@ read.orc <- function(path, ...) {
 #' Loads a Parquet file, returning the result as a SparkDataFrame.
 #'
 #' @param path path of file to read. A vector of multiple paths is allowed.
+#' @param ... additional external data source specific named properties.
 #' @return SparkDataFrame
 #' @rdname read.parquet
 #' @name read.parquet
@@ -454,20 +409,6 @@ read.parquet <- function(x, ...) {
   dispatchFunc("read.parquet(...)", x, ...)
 }
 
-#' @param ... argument(s) passed to the method.
-#' @rdname read.parquet
-#' @name parquetFile
-#' @method parquetFile default
-#' @note parquetFile since 1.4.0
-parquetFile.default <- function(...) {
-  .Deprecated("read.parquet")
-  read.parquet(unlist(list(...)))
-}
-
-parquetFile <- function(x, ...) {
-  dispatchFunc("parquetFile(...)", x, ...)
-}
-
 #' Create a SparkDataFrame from a text file.
 #'
 #' Loads text files and returns a SparkDataFrame whose schema starts with
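
With `jsonFile`, `parquetFile` and `jsonRDD` removed from SQLContext.R, `read.json` and `read.parquet` are the remaining SparkR entry points for these formats. A minimal sketch of the supported calls, assuming a running Spark session; the file paths are placeholders and `mergeSchema` is just one illustrative data source property passed through the new `...` parameter:

```r
library(SparkR)
sparkR.session()

# read.json() replaces jsonFile()/jsonRDD(); a vector of paths is allowed
df  <- read.json(c("path/to/part1.json", "path/to/part2.json"))
df2 <- read.json("path/to/file.json", multiLine = TRUE)

# read.parquet() replaces parquetFile(); extra named properties go through `...`
# (mergeSchema is an illustrative Parquet option, not part of this patch)
pdf <- read.parquet("path/to/file.parquet", mergeSchema = "true")
```
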
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 509f689ac5..68bf5eac98 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -628,14 +628,10 @@ test_that("read/write json files", {
   jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json")
   write.json(df, jsonPath3)
 
-  # Test read.json()/jsonFile() works with multiple input paths
+  # Test read.json() works with multiple input paths
   jsonDF1 <- read.json(c(jsonPath2, jsonPath3))
   expect_is(jsonDF1, "SparkDataFrame")
   expect_equal(count(jsonDF1), 6)
-  # Suppress warnings because jsonFile is deprecated
-  jsonDF2 <- suppressWarnings(jsonFile(c(jsonPath2, jsonPath3)))
-  expect_is(jsonDF2, "SparkDataFrame")
-  expect_equal(count(jsonDF2), 6)
 
   unlink(jsonPath2)
   unlink(jsonPath3)
@@ -655,20 +651,6 @@ test_that("read/write json files - compression option", {
   unlink(jsonPath)
 })
 
-test_that("jsonRDD() on a RDD with json string", {
-  sqlContext <- suppressWarnings(sparkRSQL.init(sc))
-  rdd <- parallelize(sc, mockLines)
-  expect_equal(countRDD(rdd), 3)
-  df <- suppressWarnings(jsonRDD(sqlContext, rdd))
-  expect_is(df, "SparkDataFrame")
-  expect_equal(count(df), 3)
-
-  rdd2 <- flatMap(rdd, function(x) c(x, x))
-  df <- suppressWarnings(jsonRDD(sqlContext, rdd2))
-  expect_is(df, "SparkDataFrame")
-  expect_equal(count(df), 6)
-})
-
 test_that("test tableNames and tables", {
   count <- count(listTables())
 
@@ -2658,7 +2640,7 @@ test_that("read/write Parquet files", {
   expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 3)
 
-  # Test write.parquet/saveAsParquetFile and read.parquet/parquetFile
+  # Test write.parquet/saveAsParquetFile and read.parquet
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
   write.parquet(df, parquetPath2)
   parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet")
@@ -2666,9 +2648,6 @@ test_that("read/write Parquet files", {
   parquetDF <- read.parquet(c(parquetPath2, parquetPath3))
   expect_is(parquetDF, "SparkDataFrame")
   expect_equal(count(parquetDF), count(df) * 2)
-  parquetDF2 <- suppressWarnings(parquetFile(parquetPath2, parquetPath3))
-  expect_is(parquetDF2, "SparkDataFrame")
-  expect_equal(count(parquetDF2), count(df) * 2)
 
   # Test if varargs works with variables
   saveMode <- "overwrite"
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 79f8ab8134..5882ed7923 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -709,8 +709,12 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma
 
 ## Upgrading to SparkR 2.3.1 and above
 
-  - In SparkR 2.3.0 and earlier, the `start` parameter of `substr` method was wrongly subtracted by one and considered as 0-based. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. In version 2.3.1 and later, it has been fixed so the `start` parameter of `substr` method is now 1-base. As an example, `substr(lit('abcdef'), 2, 4))` would result to `abc` in SparkR 2.3.0, and the result would be `bcd` in SparkR 2.3.1.
+  - In SparkR 2.3.0 and earlier, the `start` parameter of `substr` method was wrongly subtracted by one and considered as 0-based. This can lead to inconsistent substring results and also does not match with the behaviour with `substr` in R. In version 2.3.1 and later, it has been fixed so the `start` parameter of `substr` method is now 1-based. As an example, `substr(lit('abcdef'), 2, 4))` would result to `abc` in SparkR 2.3.0, and the result would be `bcd` in SparkR 2.3.1.
 
 ## Upgrading to SparkR 2.4.0
 
   - Previously, we don't check the validity of the size of the last layer in `spark.mlp`. For example, if the training data only has two labels, a `layers` param like `c(1, 3)` doesn't cause an error previously, now it does.
+
+## Upgrading to SparkR 3.0.0
+
+ - The deprecated methods `parquetFile`, `jsonRDD` and `jsonFile` in `SQLContext` have been removed. Use `read.parquet` and `read.json`.
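
The `substr` and deprecated-reader notes in the migration guide are easiest to check against a live session. A rough SparkR sketch, assuming a local session; the data frame and column below are made up for illustration:

```r
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(s = "abcdef", stringsAsFactors = FALSE))

# Since SparkR 2.3.1, substr() on a Column is 1-based, matching base R:
# this yields "bcd" (it returned "abc" in 2.3.0 and earlier)
head(select(df, substr(df$s, 2, 4)))

# Since SparkR 3.0.0 the deprecated readers are gone; migrate as follows:
# jsonFile(path)      -> read.json(path)
# parquetFile(path)   -> read.parquet(path)
```
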
- */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonFile(path: String, schema: StructType): DataFrame = { - read.schema(schema).json(path) - } - - /** - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonFile(path: String, samplingRatio: Double): DataFrame = { - read.option("samplingRatio", samplingRatio.toString).json(path) - } - - /** - * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * `DataFrame`. - * It goes through the entire dataset once to determine the schema. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: RDD[String]): DataFrame = read.json(json) - - /** - * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * `DataFrame`. - * It goes through the entire dataset once to determine the schema. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json) - - /** - * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, - * returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: RDD[String], schema: StructType): DataFrame = { - read.schema(schema).json(json) - } - - /** - * Loads an JavaRDD[String] storing JSON objects (one object per record) and applies the given - * schema, returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = { - read.schema(schema).json(json) - } - - /** - * Loads an RDD[String] storing JSON objects (one object per record) inferring the - * schema, returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = { - read.option("samplingRatio", samplingRatio.toString).json(json) - } - - /** - * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the - * schema, returning the result as a `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().json()`. - */ - @deprecated("Use read.json() instead.", "1.4.0") - def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = { - read.option("samplingRatio", samplingRatio.toString).json(json) - } - - /** - * Returns the dataset stored at path as a DataFrame, - * using the default data source configured by spark.sql.sources.default. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().load(path)`. - */ - @deprecated("Use read.load(path) instead.", "1.4.0") - def load(path: String): DataFrame = { - read.load(path) - } - - /** - * Returns the dataset stored at path as a DataFrame, using the given data source. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().format(source).load(path)`. 
- */ - @deprecated("Use read.format(source).load(path) instead.", "1.4.0") - def load(path: String, source: String): DataFrame = { - read.format(source).load(path) - } - - /** - * (Java-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`. - */ - @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") - def load(source: String, options: java.util.Map[String, String]): DataFrame = { - read.options(options).format(source).load() - } - - /** - * (Scala-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`. - */ - @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") - def load(source: String, options: Map[String, String]): DataFrame = { - read.options(options).format(source).load() - } - - /** - * (Java-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by - * `read().format(source).schema(schema).options(options).load()`. - */ - @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") - def load( - source: String, - schema: StructType, - options: java.util.Map[String, String]): DataFrame = { - read.format(source).schema(schema).options(options).load() - } - - /** - * (Scala-specific) Returns the dataset specified by the given data source and - * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. - * - * @group genericdata - * @deprecated As of 1.4.0, replaced by - * `read().format(source).schema(schema).options(options).load()`. - */ - @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") - def load(source: String, schema: StructType, options: Map[String, String]): DataFrame = { - read.format(source).schema(schema).options(options).load() - } - - /** - * Construct a `DataFrame` representing the database table accessible via JDBC URL - * url named table. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().jdbc()`. - */ - @deprecated("Use read.jdbc() instead.", "1.4.0") - def jdbc(url: String, table: String): DataFrame = { - read.jdbc(url, table, new Properties) - } - - /** - * Construct a `DataFrame` representing the database table accessible via JDBC URL - * url named table. Partitions of the table will be retrieved in parallel based on the parameters - * passed to this function. - * - * @param columnName the name of a column of integral type that will be used for partitioning. - * @param lowerBound the minimum value of `columnName` used to decide partition stride - * @param upperBound the maximum value of `columnName` used to decide partition stride - * @param numPartitions the number of partitions. the range `minValue`-`maxValue` will be split - * evenly into this many partitions - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().jdbc()`. 
- */ - @deprecated("Use read.jdbc() instead.", "1.4.0") - def jdbc( - url: String, - table: String, - columnName: String, - lowerBound: Long, - upperBound: Long, - numPartitions: Int): DataFrame = { - read.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, new Properties) - } - - /** - * Construct a `DataFrame` representing the database table accessible via JDBC URL - * url named table. The theParts parameter gives a list expressions - * suitable for inclusion in WHERE clauses; each one defines one partition - * of the `DataFrame`. - * - * @group specificdata - * @deprecated As of 1.4.0, replaced by `read().jdbc()`. - */ - @deprecated("Use read.jdbc() instead.", "1.4.0") - def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = { - read.jdbc(url, table, theParts, new Properties) - } } /**