spark-instrumented-optimizer/.github/workflows/build_and_test.yml

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Build and test

on:
  push:
    branches:
      - '**'

jobs:
  # Build: build Spark and run the tests for specified modules.
  build:
    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        java:
          - 8
        hadoop:
          - hadoop3.2
        hive:
          - hive2.3
        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
        # Kinesis tests depends on external Amazon kinesis service.
        # Note that the modules below are from sparktestsupport/modules.py.
        modules:
          - >-
            core, unsafe, kvstore, avro,
            network-common, network-shuffle, repl, launcher,
            examples, sketch, graphx
          - >-
            catalyst, hive-thriftserver
          - >-
            streaming, sql-kafka-0-10, streaming-kafka-0-10,
            mllib-local, mllib,
            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
        included-tags: [""]
        excluded-tags: [""]
        comment: [""]
        include:
          # Hive tests
          - modules: hive
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            included-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- slow tests"
          - modules: hive
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- other tests"
          # SQL tests
          - modules: sql
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            included-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- slow tests"
          - modules: sql
            java: 8
            hadoop: hadoop3.2
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- other tests"
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
      INCLUDED_TAGS: ${{ matrix.included-tags }}
      HADOOP_PROFILE: ${{ matrix.hadoop }}
      HIVE_PROFILE: ${{ matrix.hive }}
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
    - name: Install Java ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    - name: Install Python 3.8
      uses: actions/setup-python@v2
      # We should install one Python that is higher then 3+ for SQL and Yarn because:
      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
      if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      with:
        python-version: 3.8
        architecture: x64
    - name: Install Python packages (Python 3.8)
      if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      run: |
        python3.8 -m pip install numpy 'pyarrow<5.0.0' pandas scipy xmlrunner
        python3.8 -m pip list
    # Run the tests.
    - name: Run tests
      run: |
        # Hive "other tests" test needs larger metaspace size based on experiment.
        if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
        export SERIAL_SBT_TESTS=1
        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
        path: "**/target/unit-tests.log"

  pyspark:
    name: "Build modules: ${{ matrix.modules }}"
    runs-on: ubuntu-20.04
    container:
      image: dongjoon/apache-spark-github-action-image:20210730
    strategy:
      fail-fast: false
      matrix:
        modules:
          - >-
            pyspark-sql, pyspark-mllib, pyspark-resource
          - >-
            pyspark-core, pyspark-streaming, pyspark-ml
          - >-
            pyspark-pandas
          - >-
            pyspark-pandas-slow
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      HADOOP_PROFILE: hadoop3.2
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      SKIP_UNIDOC: true
      SKIP_MIMA: true
      METASPACE_SIZE: 1g
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          pyspark-coursier-
    - name: Install Python 3.6
      uses: actions/setup-python@v2
      with:
        python-version: 3.6
        architecture: x64
    # This step takes much less time (~30s) than other Python versions so it is not included
    # in the Docker image being used. There is also a technical issue to install Python 3.6 on
    # Ubuntu 20.04. See also SPARK-33162.
    - name: Install Python packages (Python 3.6)
      run: |
        python3.6 -m pip install numpy 'pyarrow<4.0.0' pandas scipy xmlrunner 'plotly>=4.8'
        python3.6 -m pip list
    - name: List Python packages (Python 3.9)
      run: |
        python3.9 -m pip list
    - name: Install Conda for pip packaging test
      run: |
        curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
        bash miniconda.sh -b -p $HOME/miniconda
    # Run the tests.
    - name: Run tests
      run: |
        export PATH=$PATH:$HOME/miniconda/bin
        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
        path: "**/target/unit-tests.log"

  sparkr:
    name: "Build modules: sparkr"
    runs-on: ubuntu-20.04
    container:
      image: dongjoon/apache-spark-github-action-image:20210602
    env:
      HADOOP_PROFILE: hadoop3.2
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      SKIP_MIMA: true
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          sparkr-coursier-
    - name: Run tests
      run: |
        # The followings are also used by `r-lib/actions/setup-r` to avoid
        # R issues at docker environment
        export TZ=UTC
        export _R_CHECK_SYSTEM_CLOCK_=FALSE
        ./dev/run-tests --parallelism 1 --modules sparkr
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-sparkr--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"

  # Static analysis, and documentation build
  lint:
    name: Linters, licenses, dependencies and documentation generation
    runs-on: ubuntu-20.04
    env:
      LC_ALL: C.UTF-8
      LANG: C.UTF-8
      PYSPARK_DRIVER_PYTHON: python3.9
    container:
      image: dongjoon/apache-spark-github-action-image:20210602
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          docs-coursier-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: docs-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          docs-maven-
    - name: Install Python linter dependencies
      run: |
        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        #   See also https://github.com/sphinx-doc/sphinx/issues/7551.
        # Jinja2 3.0.0+ causes error when building with Sphinx.
        #   See also https://issues.apache.org/jira/browse/SPARK-35375.
        python3.9 -m pip install flake8 pydata_sphinx_theme 'mypy==0.910' numpydoc 'jinja2<3.0.0' 'black==21.5b2'
    - name: Install R linter dependencies and SparkR
      run: |
        apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
        Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
        Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')"
        ./R/install-dev.sh
    - name: Instll JavaScript linter dependencies
      run: |
        apt update
        apt-get install -y nodejs npm
    - name: Install dependencies for documentation generation
      run: |
        # pandoc is required to generate PySpark APIs as well in nbsphinx.
        apt-get install -y libcurl4-openssl-dev pandoc
        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        #   See also https://github.com/sphinx-doc/sphinx/issues/7551.
        # Jinja2 3.0.0+ causes error when building with Sphinx.
        #   See also https://issues.apache.org/jira/browse/SPARK-35375.
        python3.9 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
        python3.9 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8'
        apt-get update -y
        apt-get install -y ruby ruby-dev
        Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
        gem install bundler
        cd docs
        bundle install
    - name: Scala linter
      run: ./dev/lint-scala
    - name: Java linter
      run: ./dev/lint-java
    - name: Python linter
      run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
    - name: R linter
      run: ./dev/lint-r
    - name: JS linter
      run: ./dev/lint-js
    - name: License test
      run: ./dev/check-license
    - name: Dependencies test
      run: ./dev/test-dependencies.sh
    - name: Run documentation build
      run: |
        cd docs
        bundle exec jekyll build

  java-11-17:
    name: Java ${{ matrix.java }} build with Maven
    strategy:
      fail-fast: false
      matrix:
        java:
          - 11
          - 17
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          java${{ matrix.java }}-maven-
    - name: Install Java ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    - name: Build with Maven
      run: |
        export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
        export MAVEN_CLI_OPTS="--no-transfer-progress"
        export JAVA_VERSION=${{ matrix.java }}
        # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
        rm -rf ~/.m2/repository/org/apache/spark

  scala-213:
    name: Scala 2.13 build with SBT
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          scala-213-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Build with SBT
      run: |
        ./dev/change-scala-version.sh 2.13
        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile

  hadoop-2:
    name: Hadoop 2 build with SBT
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          hadoop-2-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Build with SBT
      run: |
        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile

  tpcds-1g:
    name: Run TPC-DS queries with SF=1
    runs-on: ubuntu-20.04
    env:
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          tpcds-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Cache TPC-DS generated data
      id: cache-tpcds-sf-1
      uses: actions/cache@v2
      with:
        path: ./tpcds-sf-1
        key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
    - name: Checkout tpcds-kit repository
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      uses: actions/checkout@v2
      with:
        repository: databricks/tpcds-kit
        ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
        path: ./tpcds-kit
    - name: Build tpcds-kit
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: cd tpcds-kit/tools && make OS=LINUX
    - name: Generate TPC-DS (SF=1) table data
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
    - name: Run TPC-DS queries
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-tpcds--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
        path: "**/target/unit-tests.log"

  docker-integration-tests:
    name: Run docker integration tests
    runs-on: ubuntu-20.04
    env:
      HADOOP_PROFILE: hadoop3.2
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      ORACLE_DOCKER_IMAGE_NAME: oracle/database:18.4.0-xe
      SKIP_MIMA: true
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.2
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          docker-integration-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Cache Oracle docker-images repository
      id: cache-oracle-docker-images
      uses: actions/cache@v2
      with:
        path: ./oracle/docker-images
        # key should contains the commit hash of the Oracle docker images to be checkout.
        key: oracle-docker-images-3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1
    - name: Checkout Oracle docker-images repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: oracle/docker-images
        ref: 3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1
        path: ./oracle/docker-images
    - name: Install Oracle Docker image
      run: |
        cd oracle/docker-images/OracleDatabase/SingleInstance/dockerfiles
        ./buildContainerImage.sh -v 18.4.0 -x
    - name: Run tests
      run: |
        ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-docker-integration--8-hadoop3.2-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-docker-integration--8-hadoop3.2-hive2.3
        path: "**/target/unit-tests.log"