From 7892f88f84acc8c061aaa3d2987f2c8b71e41963 Mon Sep 17 00:00:00 2001
From: Nicholas Chammas
Date: Sat, 7 Mar 2020 11:43:32 -0600
Subject: [PATCH] [SPARK-30879][DOCS] Refine workflow for building docs

### What changes were proposed in this pull request?

This PR makes the following refinements to the workflow for building docs:
* Install Python and Ruby consistently using pyenv and rbenv across both the docs README and the release Dockerfile.
* Pin the Python and Ruby versions we use.
* Pin all direct Python and Ruby dependency versions.
* Eliminate any use of `sudo pip`, which the Python community discourages, or `sudo gem`.

### Why are the changes needed?

This PR should increase the consistency and reproducibility of the doc-building process by managing Python and Ruby in a more consistent way, and by eliminating unused or outdated code.

Here's a possible example of an issue building the docs that would be addressed by the changes in this PR: https://github.com/apache/spark/pull/27459#discussion_r376135719

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Manual tests:
* I was able to build the Docker image successfully, minus the final part about `RUN useradd`.
* I am unable to run `do-release-docker.sh` because I am not a committer and don't have the required GPG key.
* I built the docs locally and viewed them in the browser.

I think I need a committer to more fully test out these changes.

Closes #27534 from nchammas/SPARK-30731-building-docs.

Authored-by: Nicholas Chammas
Signed-off-by: Sean Owen
---
 .gitignore                              |  2 +
 dev/create-release/do-release-docker.sh |  2 +-
 dev/create-release/spark-rm/Dockerfile  | 61 +++++++++++++------------
 docs/README.md                          | 44 +++++++++++++++---
 4 files changed, 72 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 198fdee39b..60a12e3d7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,8 @@
 .idea_modules/
 .project
 .pydevproject
+.python-version
+.ruby-version
 .scala_dependencies
 .settings
 /lib/
diff --git a/dev/create-release/do-release-docker.sh b/dev/create-release/do-release-docker.sh
index 694a87bf78..cda21ebdcd 100755
--- a/dev/create-release/do-release-docker.sh
+++ b/dev/create-release/do-release-docker.sh
@@ -96,7 +96,7 @@ fcreate_secure "$GPG_KEY_FILE"
 $GPG --export-secret-key --armor "$GPG_KEY" > "$GPG_KEY_FILE"
 
 run_silent "Building spark-rm image with tag $IMGTAG..." "docker-build.log" \
-  docker build -t "spark-rm:$IMGTAG" --build-arg UID=$UID "$SELF/spark-rm"
+  docker build --no-cache -t "spark-rm:$IMGTAG" --build-arg UID=$UID "$SELF/spark-rm"
 
 # Write the release information to a file with environment variables to be used when running the
 # image.
diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile
index 63451687ee..d310aaf988 100644
--- a/dev/create-release/spark-rm/Dockerfile
+++ b/dev/create-release/spark-rm/Dockerfile
@@ -20,9 +20,9 @@
 # Includes:
 # * Java 8
 # * Ivy
-# * Python (2.7.15/3.6.7)
+# * Python 3.7
+# * Ruby 2.7
 # * R-base/R-base-dev (3.6.1)
-# * Ruby 2.3 build utilities
 
 FROM ubuntu:18.04
 
@@ -33,15 +33,11 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true
 # These arguments are just for reuse and not really meant to be customized.
 ARG APT_INSTALL="apt-get install --no-install-recommends -y"
 
-ARG BASE_PIP_PKGS="setuptools wheel"
-ARG PIP_PKGS="pyopenssl numpy sphinx"
+ARG PIP_PKGS="sphinx==2.3.1 mkdocs==1.0.4 numpy==1.18.1"
+ARG GEM_PKGS="jekyll:4.0.0 jekyll-redirect-from:0.16.0 rouge:3.15.0"
 
 # Install extra needed repos and refresh.
 # - CRAN repo
-# - Ruby repo (for doc generation)
-#
-# This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch
-# the most current package versions (instead of potentially using old versions cached by docker).
 RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
   echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' >> /etc/apt/sources.list && \
   gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
@@ -50,36 +46,43 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
   rm -rf /var/lib/apt/lists/* && \
   apt-get clean && \
   apt-get update && \
-  $APT_INSTALL software-properties-common && \
-  apt-add-repository -y ppa:brightbox/ruby-ng && \
-  apt-get update && \
   # Install openjdk 8.
   $APT_INSTALL openjdk-8-jdk && \
   update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java && \
   # Install build / source control tools
   $APT_INSTALL curl wget git maven ivy subversion make gcc lsof libffi-dev \
-    pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev && \
+    pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev
+
+ENV PATH "$PATH:/root/.pyenv/bin:/root/.pyenv/shims"
+RUN curl -L https://github.com/pyenv/pyenv-installer/raw/dd3f7d0914c5b4a416ca71ffabdf2954f2021596/bin/pyenv-installer | bash
+RUN $APT_INSTALL libbz2-dev libreadline-dev libsqlite3-dev
+RUN pyenv install 3.7.6
+RUN pyenv global 3.7.6
+RUN python --version
+RUN pip install --upgrade pip
+RUN pip --version
+RUN pip install $PIP_PKGS
+
+ENV PATH "$PATH:/root/.rbenv/bin:/root/.rbenv/shims"
+RUN curl -fsSL https://github.com/rbenv/rbenv-installer/raw/108c12307621a0aa06f19799641848dde1987deb/bin/rbenv-installer | bash
+RUN rbenv install 2.7.0
+RUN rbenv global 2.7.0
+RUN ruby --version
+RUN $APT_INSTALL g++
+RUN gem --version
+RUN gem install --no-document $GEM_PKGS
+
+RUN \
   curl -sL https://deb.nodesource.com/setup_11.x | bash && \
-  $APT_INSTALL nodejs && \
-  # Install needed python packages. Use pip for installing packages (for consistency).
-  $APT_INSTALL libpython3-dev python3-pip && \
-  # Change default python version to python3.
-  update-alternatives --install /usr/bin/python python /usr/bin/python2.7 1 && \
-  update-alternatives --install /usr/bin/python python /usr/bin/python3.6 2 && \
-  update-alternatives --set python /usr/bin/python3.6 && \
-  pip3 install $BASE_PIP_PKGS && \
-  pip3 install $PIP_PKGS && \
-  # Install R packages and dependencies used when building.
-  # R depends on pandoc*, libssl (which are installed above).
+  $APT_INSTALL nodejs
+
+# Install R packages and dependencies used when building.
+# R depends on pandoc*, libssl (which are installed above).
+RUN \
   $APT_INSTALL r-base r-base-dev && \
   $APT_INSTALL texlive-latex-base texlive texlive-fonts-extra texinfo qpdf && \
   Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" && \
-  Rscript -e "devtools::install_github('jimhester/lintr')" && \
-  # Install tools needed to build the documentation.
-  $APT_INSTALL ruby2.3 ruby2.3-dev mkdocs && \
-  gem install jekyll --no-rdoc --no-ri -v 3.8.6 && \
-  gem install jekyll-redirect-from -v 0.15.0 && \
-  gem install rouge
+  Rscript -e "devtools::install_github('jimhester/lintr')"
 
 WORKDIR /opt/spark-rm/output
 
diff --git a/docs/README.md b/docs/README.md
index 22039871cf..c16f67c2c8 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -31,19 +31,49 @@ whichever version of Spark you currently have checked out of revision control.
 The Spark documentation build uses a number of tools to build HTML docs and API docs in Scala, Java,
 Python, R and SQL.
 
-You need to have [Ruby](https://www.ruby-lang.org/en/documentation/installation/) and
-[Python](https://docs.python.org/2/using/unix.html#getting-and-installing-the-latest-version-of-python)
-installed. Also install the following libraries:
+You need to have Ruby 2 (preferably Ruby 2.6+) and Python 3 (preferably Python 3.7+) installed.
+
+You'll also need to install the following libraries:
 
 ```sh
-$ sudo gem install jekyll jekyll-redirect-from rouge
+gem install jekyll:4.0.0 jekyll-redirect-from:0.16.0 rouge:3.15.0
 ```
 
-Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0.
+### Using rbenv and pyenv
+
+A handy way to install and manage various versions of Ruby and Python is with [`rbenv`] and [`pyenv`].
+
+[`rbenv`]: https://github.com/rbenv/rbenv
+[`pyenv`]: https://github.com/pyenv/pyenv
+
+On macOS you can install them with Homebrew:
+
+```sh
+brew install rbenv pyenv
+```
+
+To activate them, you'll need to run these commands or add them to the end of your `.bash_profile`:
+
+```sh
+eval "$(rbenv init -)"
+eval "$(pyenv init -)"
+```
+
+You can now use them to install specific versions of Ruby and Python and associate them with
+the Spark home directory. Whenever you navigate to this directory or any of its subdirectories, these versions of Ruby and Python will be automatically activated.
+
+```sh
+rbenv install 2.7.0
+pyenv install 3.7.6
+
+cd /path/to/spark/root
+rbenv local 2.7.0
+pyenv local 3.7.6
+```
 
 ### R Documentation
 
-If you'd like to generate R documentation, you'll need to [install Pandoc](https://pandoc.org/installing.html)
+If you'd like to generate R documentation, you'll need to install R, [install Pandoc](https://pandoc.org/installing.html),
 and install these libraries:
 
 ```sh
@@ -58,7 +88,7 @@ Note: Other versions of roxygen2 might work in SparkR documentation generation b
 To generate API docs for any language, you'll need to install these libraries:
 
 ```sh
-$ sudo pip install sphinx mkdocs numpy
+pip install sphinx==2.3.1 mkdocs==1.0.4 numpy==1.18.1
 ```
 
 ## Generating the Documentation HTML