#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Stop on error
set -e
# Set nullglob for when we are checking existence based on globs
shopt -s nullglob

# Resolve the directory one level above this script and work from there.
# `&&` (rather than `;`) makes a failed `cd` fail the whole substitution, so
# `set -e` aborts instead of silently recording the caller's working directory.
FWDIR="$(cd "$(dirname "$0")"/.. && pwd)"
cd "$FWDIR"

echo "Constructing virtual env for testing"
# Single temporary directory under which the test environments are created.
VIRTUALENV_BASE=$(mktemp -d)

# Clean up the virtual env environment used if we created one.
# Globals:
#   VIRTUALENV_BASE (read) - directory that is removed recursively.
# Outputs:
#   Writes a one-line notice to stdout before removing the directory.
function delete_virtualenv() {
  # Nothing to remove when no temporary directory was recorded.
  if [ -z "${VIRTUALENV_BASE:-}" ]; then
    return 0
  fi
  echo "Cleaning up temporary directory - $VIRTUALENV_BASE"
  # `--` keeps a path that starts with `-` from being parsed as an option.
  rm -rf -- "$VIRTUALENV_BASE"
}
# Run the cleanup whenever the script exits.
trap delete_virtualenv EXIT

# Interpreters to test against. With virtualenv this holds an executable name
# ('python3'); with conda it holds a Python version string ('3.6').
PYTHON_EXECS=()
# Some systems don't have pip or virtualenv - in those cases our tests won't work.
# virtualenv is preferred unless the caller forces conda by setting USE_CONDA.
if hash virtualenv 2>/dev/null && [ -z "$USE_CONDA" ]; then
  echo "virtualenv installed - using. Note if this is a conda virtual env you may wish to set USE_CONDA"
  # test only against python3
  if hash python3 2>/dev/null; then
    PYTHON_EXECS=('python3')
  else
    # Skipping is reported as success (exit 0), not as a test failure.
    echo "Python3 not installed on system, skipping pip installability tests"
    exit 0
  fi
elif hash conda 2>/dev/null; then
  echo "Using conda virtual environments"
  PYTHON_EXECS=('3.6')
  USE_CONDA=1
else
  echo "Missing virtualenv & conda, skipping pip installability tests"
  exit 0
fi
if ! hash pip 2>/dev/null; then
  echo "Missing pip, skipping pip installability tests."
  exit 0
fi

# Determine which version of PySpark we are building for archive name
# The relative path is resolved against the current working directory.
PYSPARK_VERSION=$(python3 -c "exec(open('python/pyspark/version.py').read());print(__version__)")
PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz"
# The pip install options we use for all the pip commands
PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall"
# Test both regular user and edit/dev install modes.
# Each entry is a complete command line stored as one string; it is meant to be
# expanded unquoted so the shell splits it into words. A path containing
# whitespace in PYSPARK_DIST would therefore be split incorrectly.
PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST"
              "pip install $PIP_OPTIONS -e python/")

# Jenkins has PySpark installed under user sitepackages shared for some reasons.
# In this test, explicitly exclude user sitepackages to prevent side effects
export PYTHONNOUSERSITE=1

# For every interpreter and every pip install mode: build a fresh environment,
# create the PySpark sdist, install it, run the sanity checks, then leave the
# environment again. Any failing command aborts the run when `set -e` is active.
for python in "${PYTHON_EXECS[@]}"; do
  for install_command in "${PIP_COMMANDS[@]}"; do
    echo "Testing pip installation with python $python"
    # The environment lives under the shared temporary base directory and is
    # recreated from scratch for each install mode.
    echo "Using $VIRTUALENV_BASE for virtualenv"
    # Abort rather than build a root-level path if the base directory is unset.
    VIRTUALENV_PATH="${VIRTUALENV_BASE:?VIRTUALENV_BASE must be set}/$python"
    rm -rf -- "$VIRTUALENV_PATH"
    if [ -n "$USE_CONDA" ]; then
      if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then
        # See also https://github.com/conda/conda/issues/7980
        source "$CONDA_PREFIX/etc/profile.d/conda.sh"
      fi
      conda create -y -p "$VIRTUALENV_PATH" "python=$python" numpy pandas pip setuptools
      # The fallback is grouped with braces, not parentheses: a ( ... ) subshell
      # would activate the environment in a child process only, leaving this
      # shell unchanged.
      source activate "$VIRTUALENV_PATH" || { echo "Falling back to 'conda activate'" && conda activate "$VIRTUALENV_PATH"; }
    else
      mkdir -p "$VIRTUALENV_PATH"
      virtualenv "--python=$python" "$VIRTUALENV_PATH"
      source "$VIRTUALENV_PATH"/bin/activate
    fi
    # Upgrade pip & friends if using virtual env
    if [ -z "$USE_CONDA" ]; then
      pip install --upgrade pip wheel numpy
    fi

    echo "Creating pip installable source dist"
    cd "$FWDIR"/python
    # Delete the egg info file if it exists, this can cache the setup file.
    rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
    python3 setup.py sdist


    echo "Installing dist into virtual env"
    cd dist
    # Verify that the dist directory only contains one thing to install
    # (with nullglob enabled, an unmatched pattern yields an empty array).
    sdists=(*.tar.gz)
    if [ ${#sdists[@]} -ne 1 ]; then
      echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first."
      # Exit statuses must be in 0-255; `exit -1` is not a valid status.
      exit 1
    fi
    # Do the actual installation
    cd "$FWDIR"
    # Intentionally unquoted: the command line is stored as a single string and
    # relies on word splitting to become a command plus arguments.
    $install_command

    # Run the checks from outside the source tree.
    cd /

    echo "Run basic sanity check on pip installed version with spark-submit"
    spark-submit "$FWDIR"/dev/pip-sanity-check.py
    echo "Run basic sanity check with import based"
    python3 "$FWDIR"/dev/pip-sanity-check.py
    echo "Run the tests for context.py"
    python3 "$FWDIR"/python/pyspark/context.py

    cd "$FWDIR"

    # conda / virtualenv environments need to be deactivated differently
    if [ -n "$USE_CONDA" ]; then
      # Braces again, so the fallback deactivates this shell, not a subshell.
      source deactivate || { echo "Falling back to 'conda deactivate'" && conda deactivate; }
    else
      deactivate
    fi

  done
done

exit 0