name: Build and test

on:
  push:
    branches:
    - '**'
    - '!branch-*.*'

jobs:
  # Build: build Spark and run the tests for the specified modules.
  build:
    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        java:
          - 8
        hadoop:
          - hadoop3.2
        hive:
          - hive2.3
        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
        # Kinesis tests depend on the external Amazon Kinesis service.
        # Note that the modules below are from sparktestsupport/modules.py.
        modules:
          - >-
            core, unsafe, kvstore, avro,
            network-common, network-shuffle, repl, launcher,
            examples, sketch, graphx
          - >-
            catalyst, hive-thriftserver
          - >-
            streaming, sql-kafka-0-10, streaming-kafka-0-10,
            mllib-local, mllib,
            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
        # Here, we split the Hive and SQL tests into the slow ones and the rest.
        included-tags: [""]
        excluded-tags: [""]
        comment: [""]
        include:
          # Hive tests
          - modules: hive
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            included-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- slow tests"
          - modules: hive
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- other tests"
          # SQL tests
          - modules: sql
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            included-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- slow tests"
          - modules: sql
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- other tests"
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
      INCLUDED_TAGS: ${{ matrix.included-tags }}
      HADOOP_PROFILE: ${{ matrix.hadoop }}
      HIVE_PROFILE: ${{ matrix.hive }}
      # GitHub Actions' default miniconda, used in the pip packaging test.
      CONDA_PREFIX: /usr/share/miniconda
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: master
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      id: sync-branch
      run: |
        apache_spark_ref=`git rev-parse HEAD`
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
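    # Note: with actions/cache, a key built from hashFiles(...) is reused until any of the hashed
    # build files change; 'restore-keys' then falls back to the newest cache entry whose key starts
    # with the given prefix (e.g. 'build-'), so a slightly stale cache can still seed the build.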
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
    - name: Install Java ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    - name: Install Python 3.8
      uses: actions/setup-python@v2
      # We should install a Python 3 interpreter for the SQL and Yarn modules because:
      # - the SQL component also has Python-related tests, for example, IntegratedUDFTestUtils.
      # - Yarn has a Python-specific test too, for example, YarnClusterSuite.
      if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      with:
        python-version: 3.8
        architecture: x64
    - name: Install Python packages (Python 3.8)
      if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      run: |
        python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
        python3.8 -m pip list
    # Run the tests.
    - name: Run tests
      run: |
        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
        # Hive and SQL tests become flaky when run in parallel because they are too resource-intensive.
        if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
        path: "**/target/unit-tests.log"

  pyspark:
    name: "Build modules: ${{ matrix.modules }}"
    runs-on: ubuntu-20.04
    container:
      image: dongjoon/apache-spark-github-action-image:20201025
    strategy:
      fail-fast: false
      matrix:
        modules:
          - >-
            pyspark-sql, pyspark-mllib, pyspark-resource
          - >-
            pyspark-core, pyspark-streaming, pyspark-ml
          - >-
            pyspark-pandas
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      HADOOP_PROFILE: hadoop3.2
      HIVE_PROFILE: hive2.3
      # GitHub Actions' default miniconda, used in the pip packaging test.
      CONDA_PREFIX: /usr/share/miniconda
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: master
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      id: sync-branch
      run: |
        apache_spark_ref=`git rev-parse HEAD`
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          pyspark-coursier-
    - name: Install Python 3.6
      uses: actions/setup-python@v2
      with:
        python-version: 3.6
        architecture: x64
      # This step takes much less time (~30s) than it does for other Python versions, so Python 3.6
      # is not baked into the Docker image being used. There is also a technical issue with
      # installing Python 3.6 on Ubuntu 20.04. See also SPARK-33162.
    - name: Install Python packages (Python 3.6)
      run: |
        python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
        python3.6 -m pip list
    # Run the tests.
    - name: Run tests
      run: |
        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
        path: "**/target/unit-tests.log"

  sparkr:
    name: "Build modules: sparkr"
    runs-on: ubuntu-20.04
    container:
      image: dongjoon/apache-spark-github-action-image:20201025
    env:
      HADOOP_PROFILE: hadoop3.2
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: master
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      id: sync-branch
      run: |
        apache_spark_ref=`git rev-parse HEAD`
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
        echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
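    # Note: the 'build-' cache key below is identical across jobs, so the Scala/SBT/Maven tooling
    # cache is effectively shared, while the Coursier cache is scoped per job by its key prefix
    # (here 'sparkr-coursier-').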
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          sparkr-coursier-
    - name: Run tests
      run: |
        # The following are also used by `r-lib/actions/setup-r` to avoid
        # R issues in the Docker environment.
        export TZ=UTC
        export _R_CHECK_SYSTEM_CLOCK_=FALSE
        export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
        ./dev/run-tests --parallelism 2 --modules sparkr
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-sparkr--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"

  # Static analysis and documentation build
  lint:
    name: Linters, licenses, dependencies and documentation generation
    runs-on: ubuntu-20.04
    env:
      LC_ALL: C.UTF-8
      LANG: C.UTF-8
    container:
      image: dongjoon/apache-spark-github-action-image:20201025
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          docs-coursier-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: docs-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          docs-maven-
    - name: Install Python 3.6
      uses: actions/setup-python@v2
      with:
        python-version: 3.6
        architecture: x64
    - name: Install Python linter dependencies
      run: |
        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
        python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
    - name: Install R linter dependencies and SparkR
      run: |
        apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
        Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
        Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')"
        ./R/install-dev.sh
    - name: Install dependencies for documentation generation
      run: |
        # pandoc is required by nbsphinx to generate the PySpark API docs as well.
        apt-get install -y libcurl4-openssl-dev pandoc
        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
        python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
        apt-get update -y
        apt-get install -y ruby ruby-dev
        Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
        gem install bundler
        cd docs
        bundle install
    - name: Scala linter
      run: ./dev/lint-scala
    - name: Java linter
      run: ./dev/lint-java
    - name: Python linter
      run: ./dev/lint-python
    - name: R linter
      run: ./dev/lint-r
    - name: License test
      run: ./dev/check-license
    - name: Dependencies test
      run: ./dev/test-dependencies.sh
    - name: Run documentation build
      run: |
        cd docs
        bundle exec jekyll build

  java-11:
    name: Java 11 build with Maven
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: java11-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          java11-maven-
    - name: Install Java 11
      uses: actions/setup-java@v1
      with:
        java-version: 11
    - name: Build with Maven
      run: |
        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
        export MAVEN_CLI_OPTS="--no-transfer-progress"
        # Maven's 'install' is used intentionally, see https://github.com/apache/spark/pull/26414.
        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
        rm -rf ~/.m2/repository/org/apache/spark

  scala-213:
    name: Scala 2.13 build with SBT
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          scala-213-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Build with SBT
      run: |
        ./dev/change-scala-version.sh 2.13
        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile

  hadoop-2:
    name: Hadoop 2 build with SBT
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          hadoop-2-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Build with SBT
      run: |
        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive \
          -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile

  tpcds-1g:
    name: Run TPC-DS queries with SF=1
    runs-on: ubuntu-20.04
    env:
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    - name: Cache TPC-DS generated data
      id: cache-tpcds-sf-1
      uses: actions/cache@v2
      with:
        path: ./tpcds-sf-1
        key: tpcds-556111e35d400f56cb0625dc16e9063d54628320
    - name: Checkout TPC-DS (SF=1) generated data repository
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      uses: actions/checkout@v2
      with:
        repository: maropu/spark-tpcds-sf-1
        ref: 556111e35d400f56cb0625dc16e9063d54628320
        path: ./tpcds-sf-1
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          tpcds-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Run TPC-DS queries
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-tpcds--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
        path: "**/target/unit-tests.log"