From e9f2e342612a01d33bdf814d87a98ad341ab475b Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon
Date: Thu, 2 Sep 2021 13:27:43 +0900
Subject: [PATCH] [SPARK-36631][R] Ask users if they want to download and install SparkR in non Spark scripts

### What changes were proposed in this pull request?

This PR proposes to ask users whether they want to download and install the Spark distribution when SparkR was installed from CRAN and Spark cannot be found. A `SPARKR_ASK_INSTALLATION` environment variable was added in case other notebook projects are affected.

### Why are the changes needed?

This is required for CRAN. SparkR is currently removed from CRAN: https://cran.r-project.org/web/packages/SparkR/index.html.
See also https://lists.apache.org/thread.html/r02b9046273a518e347dfe85f864d23d63d3502c6c1edd33df17a3b86%40%3Cdev.spark.apache.org%3E

### Does this PR introduce _any_ user-facing change?

Yes, `sparkR.session(...)` now asks whether users want to download and install the Spark package when running in a plain R shell or `Rscript`.

### How was this patch tested?

**R shell**

Valid input (`n`):

```
> sparkR.session(master="local")
Spark not found in SPARK_HOME:
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n): n
```
```
Error in sparkCheckInstall(sparkHome, master, deployMode) :
  Please make sure Spark package is installed in this machine.
  - If there is one, set the path in sparkHome parameter or environment variable SPARK_HOME.
  - If not, you may run install.spark function to do the job.
```

Invalid input:

```
> sparkR.session(master="local")
Spark not found in SPARK_HOME:
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n): abc
```
```
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n):
```

Valid input (`y`):

```
> sparkR.session(master="local")
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n): y
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
Preferred mirror site found: https://ftp.riken.jp/net/apache/spark
Downloading spark-3.3.0 for Hadoop 2.7 from:
- https://ftp.riken.jp/net/apache/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.7.tgz
trying URL 'https://ftp.riken.jp/net/apache/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.7.tgz'
...
```

**Rscript**

```
cat tmp.R
```
```
library(SparkR, lib.loc = c(file.path(".", "R", "lib")))
sparkR.session(master="local")
```
```
Rscript tmp.R
```

Valid input (`n`):

```
Spark not found in SPARK_HOME:
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n): n
```
```
Error in sparkCheckInstall(sparkHome, master, deployMode) :
  Please make sure Spark package is installed in this machine.
  - If there is one, set the path in sparkHome parameter or environment variable SPARK_HOME.
  - If not, you may run install.spark function to do the job.
Calls: sparkR.session -> sparkCheckInstall
```

Invalid input:

```
Spark not found in SPARK_HOME:
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n): abc
```
```
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n):
```

Valid input (`y`):

```
...
Spark not found in SPARK_HOME:
Will you download and install (or reuse if it exists) Spark package under the cache [/.../Caches/spark]? (y/n): y
Spark not found in the cache directory. Installation will start.
MirrorUrl not provided.
Looking for preferred site from apache website...
Preferred mirror site found: https://ftp.riken.jp/net/apache/spark
Downloading spark-3.3.0 for Hadoop 2.7 from:
...
```

`bin/sparkR` and `bin/spark-submit *.R` are not affected (tested).

Closes #33887 from HyukjinKwon/SPARK-36631.

Authored-by: Hyukjin Kwon
Signed-off-by: Hyukjin Kwon
(cherry picked from commit e983ba8fce2b41f0c398fa279f376090376ab1f4)
Signed-off-by: Hyukjin Kwon
---
 R/pkg/R/sparkR.R               | 34 ++++++++++++++++++++++++++++++++++
 docs/sparkr-migration-guide.md |  4 ++++
 2 files changed, 38 insertions(+)

diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index e4a11a5f78..924ebbf37e 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -655,6 +655,40 @@ sparkCheckInstall <- function(sparkHome, master, deployMode) {
     } else {
       if (interactive() || isMasterLocal(master)) {
         message("Spark not found in SPARK_HOME: ", sparkHome)
+        # If EXISTING_SPARKR_BACKEND_PORT environment variable is set, assume
+        # that we're in Spark submit. spark-submit always sets Spark home
+        # so this case should not happen. This is just a safeguard.
+        isSparkRSubmit <- Sys.getenv("EXISTING_SPARKR_BACKEND_PORT", "") != ""
+
+        # SPARKR_ASK_INSTALLATION is an internal environment variable in case
+        # users want to disable this behavior. This environment variable should
+        # be removed if no user complains. This environment variable was added
+        # in case other notebook projects are affected.
+        if (!isSparkRSubmit && Sys.getenv("SPARKR_ASK_INSTALLATION", "TRUE") == "TRUE") {
+          # Finally, we're either plain R shell or Rscript.
+          msg <- paste0(
+            "Will you download and install (or reuse if it exists) Spark package ",
+            "under the cache [", sparkCachePath(), "]? (y/n): ")
+
+          answer <- NA
+          while (is.na(answer) || (answer != "y" && answer != "n")) {
+            # Dispatch on R shell in case readLines does not work in RStudio
+            # See https://stackoverflow.com/questions/30191232/use-stdin-from-within-r-studio
+            if (interactive()) {
+              answer <- readline(prompt = msg)
+            } else {
+              cat(msg)
+              answer <- readLines("stdin", n = 1)
+            }
+          }
+          if (answer == "n") {
+            stop(paste0(
+              "Please make sure Spark package is installed in this machine.\n",
+              " - If there is one, set the path in sparkHome parameter or ",
+              "environment variable SPARK_HOME.\n",
+              " - If not, you may run install.spark function to do the job."))
+          }
+        }
         packageLocalDir <- install.spark()
         packageLocalDir
       } else if (isClientMode(master) || deployMode == "client") {
diff --git a/docs/sparkr-migration-guide.md b/docs/sparkr-migration-guide.md
index 32836cdac5..3a937b729a 100644
--- a/docs/sparkr-migration-guide.md
+++ b/docs/sparkr-migration-guide.md
@@ -26,6 +26,10 @@ Note that this migration guide describes the items specific to SparkR.
 Many items of SQL migration can be applied when migrating SparkR to higher versions.
 Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide.html).
 
+## Upgrading from SparkR 3.1 to 3.2
+
+ - Previously, SparkR automatically downloaded and installed the Spark distribution in the user's cache directory to complete the SparkR installation when SparkR runs in a plain R shell or Rscript and the Spark distribution cannot be found. Now, it asks whether users want to download and install it. To restore the previous behavior, set the `SPARKR_ASK_INSTALLATION` environment variable to `FALSE`.
+
 ## Upgrading from SparkR 2.4 to 3.0
 
  - The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, `jsonRDD` have been removed. Use `read.parquet`, `write.parquet`, `read.json` instead.
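
For reference, a minimal user-side sketch (not part of the patch itself) of how the new prompt can be avoided, assuming a SparkR build that includes this change; `/path/to/spark` is a placeholder:

```
# Sketch only: ways to avoid the new y/n prompt introduced by sparkCheckInstall,
# assuming SPARKR_ASK_INSTALLATION behaves as described in the migration note above.
library(SparkR)

# Option 1: restore the pre-3.2 behavior and let SparkR download/install
# (or reuse) the Spark distribution in the cache without asking.
Sys.setenv(SPARKR_ASK_INSTALLATION = "FALSE")
sparkR.session(master = "local")

# Option 2: skip the check entirely by pointing SparkR at an existing Spark
# installation, as the error message suggests (sparkHome parameter or the
# SPARK_HOME environment variable).
# sparkR.session(master = "local", sparkHome = "/path/to/spark")
```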