[SPARK-14883][DOCS] Fix wrong R examples and make them up-to-date
## What changes were proposed in this pull request?

This issue aims to fix some errors in R examples and make them up-to-date in docs and example modules.

- Remove the wrong usage of `map`. We need to use `lapply` in `sparkR` if needed. However, `lapply` is private so far. The corrected example will be added later.
- Fix the wrong example in Section `Generic Load/Save Functions` of `docs/sql-programming-guide.md` for consistency.
- Fix datatypes in `sparkr.md`.
- Update a data result in `sparkr.md`.
- Replace deprecated functions to remove warnings: `jsonFile` -> `read.json`, `parquetFile` -> `read.parquet`.
- Use up-to-date R-like functions: `loadDF` -> `read.df`, `saveDF` -> `write.df`, `saveAsParquetFile` -> `write.parquet`.
- Replace `SparkR DataFrame` with `SparkDataFrame` in `dataframe.R` and `data-manipulation.R`.
- Other minor syntax fixes and a typo.

## How was this patch tested?

Manual.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12649 from dongjoon-hyun/SPARK-14883.
This commit is contained in:
parent
35319d3264
commit
6ab4d9e0c7
|
@ -845,7 +845,7 @@ setMethod("ncol",
|
|||
length(columns(x))
|
||||
})
|
||||
|
||||
#' Returns the dimentions (number of rows and columns) of a SparkDataFrame
|
||||
#' Returns the dimensions (number of rows and columns) of a SparkDataFrame
|
||||
#' @param x a SparkDataFrame
|
||||
#'
|
||||
#' @family SparkDataFrame functions
|
||||
|
|
|
@ -141,7 +141,7 @@ head(people)
|
|||
# SparkR automatically infers the schema from the JSON file
|
||||
printSchema(people)
|
||||
# root
|
||||
# |-- age: integer (nullable = true)
|
||||
# |-- age: long (nullable = true)
|
||||
# |-- name: string (nullable = true)
|
||||
|
||||
{% endhighlight %}
|
||||
|
@ -195,7 +195,7 @@ df <- createDataFrame(sqlContext, faithful)
|
|||
|
||||
# Get basic information about the DataFrame
|
||||
df
|
||||
## DataFrame[eruptions:double, waiting:double]
|
||||
## SparkDataFrame[eruptions:double, waiting:double]
|
||||
|
||||
# Select only the "eruptions" column
|
||||
head(select(df, df$eruptions))
|
||||
|
@ -228,14 +228,13 @@ SparkR data frames support a number of commonly used functions to aggregate data
|
|||
# We use the `n` operator to count the number of times each waiting time appears
|
||||
head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
|
||||
## waiting count
|
||||
##1 81 13
|
||||
##2 60 6
|
||||
##3 68 1
|
||||
##1 70 4
|
||||
##2 67 1
|
||||
##3 69 2
|
||||
|
||||
# We can also sort the output from the aggregation to get the most common waiting times
|
||||
waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
|
||||
head(arrange(waiting_counts, desc(waiting_counts$count)))
|
||||
|
||||
## waiting count
|
||||
##1 78 15
|
||||
##2 83 14
|
||||
|
|
|
@ -173,7 +173,7 @@ df.show()
|
|||
{% highlight r %}
|
||||
sqlContext <- SQLContext(sc)
|
||||
|
||||
df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
|
||||
df <- read.json(sqlContext, "examples/src/main/resources/people.json")
|
||||
|
||||
# Displays the content of the DataFrame to stdout
|
||||
showDF(df)
|
||||
|
@ -366,7 +366,7 @@ In addition to simple column references and expressions, DataFrames also have a
|
|||
sqlContext <- sparkRSQL.init(sc)
|
||||
|
||||
# Create the DataFrame
|
||||
df <- jsonFile(sqlContext, "examples/src/main/resources/people.json")
|
||||
df <- read.json(sqlContext, "examples/src/main/resources/people.json")
|
||||
|
||||
# Show the content of the DataFrame
|
||||
showDF(df)
|
||||
|
@ -889,8 +889,8 @@ df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
|
|||
<div data-lang="r" markdown="1">
|
||||
|
||||
{% highlight r %}
|
||||
df <- loadDF(sqlContext, "people.parquet")
|
||||
saveDF(select(df, "name", "age"), "namesAndAges.parquet")
|
||||
df <- read.df(sqlContext, "examples/src/main/resources/users.parquet")
|
||||
write.df(select(df, "name", "favorite_color"), "namesAndFavColors.parquet")
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
|
@ -939,8 +939,8 @@ df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
|
|||
|
||||
{% highlight r %}
|
||||
|
||||
df <- loadDF(sqlContext, "people.json", "json")
|
||||
saveDF(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
|
||||
df <- read.df(sqlContext, "examples/src/main/resources/people.json", "json")
|
||||
write.df(select(df, "name", "age"), "namesAndAges.parquet", "parquet")
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
|
@ -1138,19 +1138,15 @@ for teenName in teenNames.collect():
|
|||
schemaPeople # The DataFrame from the previous example.
|
||||
|
||||
# DataFrames can be saved as Parquet files, maintaining the schema information.
|
||||
saveAsParquetFile(schemaPeople, "people.parquet")
|
||||
write.parquet(schemaPeople, "people.parquet")
|
||||
|
||||
# Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
|
||||
# The result of loading a parquet file is also a DataFrame.
|
||||
parquetFile <- parquetFile(sqlContext, "people.parquet")
|
||||
parquetFile <- read.parquet(sqlContext, "people.parquet")
|
||||
|
||||
# Parquet files can also be registered as tables and then used in SQL statements.
|
||||
registerTempTable(parquetFile, "parquetFile");
|
||||
registerTempTable(parquetFile, "parquetFile")
|
||||
teenagers <- sql(sqlContext, "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
|
||||
teenNames <- map(teenagers, function(p) { paste("Name:", p$name)})
|
||||
for (teenName in collect(teenNames)) {
|
||||
cat(teenName, "\n")
|
||||
}
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
|
@ -1318,14 +1314,14 @@ df3.printSchema()
|
|||
# sqlContext from the previous example is used in this example.
|
||||
|
||||
# Create a simple DataFrame, stored into a partition directory
|
||||
saveDF(df1, "data/test_table/key=1", "parquet", "overwrite")
|
||||
write.df(df1, "data/test_table/key=1", "parquet", "overwrite")
|
||||
|
||||
# Create another DataFrame in a new partition directory,
|
||||
# adding a new column and dropping an existing column
|
||||
saveDF(df2, "data/test_table/key=2", "parquet", "overwrite")
|
||||
write.df(df2, "data/test_table/key=2", "parquet", "overwrite")
|
||||
|
||||
# Read the partitioned table
|
||||
df3 <- loadDF(sqlContext, "data/test_table", "parquet", mergeSchema="true")
|
||||
df3 <- read.df(sqlContext, "data/test_table", "parquet", mergeSchema="true")
|
||||
printSchema(df3)
|
||||
|
||||
# The final schema consists of all 3 columns in the Parquet files together
|
||||
|
@ -1612,7 +1608,7 @@ sqlContext <- sparkRSQL.init(sc)
|
|||
# The path can be either a single text file or a directory storing text files.
|
||||
path <- "examples/src/main/resources/people.json"
|
||||
# Create a DataFrame from the file(s) pointed to by path
|
||||
people <- jsonFile(sqlContext, path)
|
||||
people <- read.json(sqlContext, path)
|
||||
|
||||
# The inferred schema can be visualized using the printSchema() method.
|
||||
printSchema(people)
|
||||
|
|
|
@ -30,7 +30,7 @@ args <- commandArgs(trailing = TRUE)
|
|||
|
||||
if (length(args) != 1) {
|
||||
print("Usage: data-manipulation.R <path-to-flights.csv")
|
||||
print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
|
||||
print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv")
|
||||
q("no")
|
||||
}
|
||||
|
||||
|
@ -49,33 +49,33 @@ flights_df$date <- as.Date(flights_df$date)
|
|||
## Filter flights whose destination is San Francisco and write to a local data frame
|
||||
SFO_df <- flights_df[flights_df$dest == "SFO", ]
|
||||
|
||||
# Convert the local data frame into a SparkR DataFrame
|
||||
# Convert the local data frame into a SparkDataFrame
|
||||
SFO_DF <- createDataFrame(sqlContext, SFO_df)
|
||||
|
||||
# Directly create a SparkR DataFrame from the source data
|
||||
# Directly create a SparkDataFrame from the source data
|
||||
flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
|
||||
|
||||
# Print the schema of this Spark DataFrame
|
||||
# Print the schema of this SparkDataFrame
|
||||
printSchema(flightsDF)
|
||||
|
||||
# Cache the DataFrame
|
||||
# Cache the SparkDataFrame
|
||||
cache(flightsDF)
|
||||
|
||||
# Print the first 6 rows of the DataFrame
|
||||
# Print the first 6 rows of the SparkDataFrame
|
||||
showDF(flightsDF, numRows = 6) ## Or
|
||||
head(flightsDF)
|
||||
|
||||
# Show the column names in the DataFrame
|
||||
# Show the column names in the SparkDataFrame
|
||||
columns(flightsDF)
|
||||
|
||||
# Show the number of rows in the DataFrame
|
||||
# Show the number of rows in the SparkDataFrame
|
||||
count(flightsDF)
|
||||
|
||||
# Select specific columns
|
||||
destDF <- select(flightsDF, "dest", "cancelled")
|
||||
|
||||
# Using SQL to select columns of data
|
||||
# First, register the flights DataFrame as a table
|
||||
# First, register the flights SparkDataFrame as a table
|
||||
registerTempTable(flightsDF, "flightsTable")
|
||||
destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
|
||||
|
||||
|
@ -95,11 +95,11 @@ if("magrittr" %in% rownames(installed.packages())) {
|
|||
library(magrittr)
|
||||
|
||||
# Group the flights by date and then find the average daily delay
|
||||
# Write the result into a DataFrame
|
||||
# Write the result into a SparkDataFrame
|
||||
groupBy(flightsDF, flightsDF$date) %>%
|
||||
summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF
|
||||
|
||||
# Print the computed data frame
|
||||
# Print the computed SparkDataFrame
|
||||
head(dailyDelayDF)
|
||||
}
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ sqlContext <- sparkRSQL.init(sc)
|
|||
# Create a simple local data.frame
|
||||
localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18))
|
||||
|
||||
# Convert local data frame to a SparkR DataFrame
|
||||
# Convert local data frame to a SparkDataFrame
|
||||
df <- createDataFrame(sqlContext, localDF)
|
||||
|
||||
# Print its schema
|
||||
|
|
Loading…
Reference in a new issue