eb509968a7
### What changes were proposed in this pull request? This PR switches python to python3 in `make-distribution.sh`. ### Why are the changes needed? SPARK-29672 changed this - https://github.com/apache/spark/pull/26330/files#diff-8cf6167d58ce775a08acafcfe6f40966 ### Does this PR introduce any user-facing change? No ### How was this patch tested? N/A Closes #26844 from wangyum/SPARK-30211. Authored-by: Yuming Wang <yumwang@ebay.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
292 lines
8.7 KiB
Bash
Executable file
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Script to create a binary distribution for easy deploys of Spark.
# The distribution directory defaults to dist/ but can be overridden below.
# The distribution contains fat (assembly) jars that include the Scala library,
# so it is completely self contained.
# It does not contain source or *.class files.
# Fail a pipeline if any stage fails, abort on the first error, and trace
# every command for easier build-log debugging.
set -o pipefail
set -e
set -x

# Figure out where the Spark framework is installed (parent of this script's dir).
SPARK_HOME="$(cd "$(dirname "$0")/.."; pwd)"
DISTDIR="$SPARK_HOME/dist"

# Defaults; overridden by the command-line flags parsed below.
MAKE_TGZ=false
MAKE_PIP=false
MAKE_R=false
NAME=none
MVN="$SPARK_HOME/build/mvn"
# Print usage information and terminate with a non-zero status.
exit_with_usage() {
  cat <<'USAGE'
make-distribution.sh - tool for making binary distributions of Spark

usage:
make-distribution.sh [--name] [--tgz] [--pip] [--r] [--mvn <mvn-command>] <maven build options>
See Spark's "Building Spark" doc for correct Maven options.

USAGE
  exit 1
}
# Parse command-line flags. Flag parsing stops at the first bare `-D...`
# style option (the `-*)` arm), which is handed through to Maven via "$@".
while [[ $# -gt 0 ]]; do
  case "$1" in
    --tgz) MAKE_TGZ=true ;;
    --pip) MAKE_PIP=true ;;
    --r) MAKE_R=true ;;
    --mvn)
      MVN="$2"
      shift
      ;;
    --name)
      NAME="$2"
      shift
      ;;
    --help) exit_with_usage ;;
    --*)
      echo "Error: $1 is not supported"
      exit_with_usage
      ;;
    -*)
      # Remaining args are Maven build options; leave them in "$@".
      break
      ;;
    *)
      echo "Error: $1 is not supported"
      exit_with_usage
      ;;
  esac
  shift
done
# Resolve JAVA_HOME when the caller has not set it, trying rpm metadata
# first and then the `java` binary on the PATH.
if [ -z "$JAVA_HOME" ]; then
  # Fall back on JAVA_HOME from rpm, if found
  if command -v rpm > /dev/null 2>&1; then
    RPM_JAVA_HOME="$(rpm -E %java_home 2>/dev/null)"
    # rpm echoes the macro name back verbatim when it is undefined.
    if [ "$RPM_JAVA_HOME" != "%java_home" ]; then
      JAVA_HOME="$RPM_JAVA_HOME"
      echo "No JAVA_HOME set, proceeding with '$JAVA_HOME' learned from rpm"
    fi
  fi

  if [ -z "$JAVA_HOME" ]; then
    if command -v java > /dev/null 2>&1; then
      # If java is in /usr/bin/java, we want /usr
      JAVA_HOME="$(dirname "$(dirname "$(command -v java)")")"
    fi
  fi
fi

if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set, cannot proceed." >&2
  exit 1
fi
# Capture the short git revision (if this is a checkout) for the RELEASE
# file; GITREVSTRING stays empty otherwise.
if command -v git > /dev/null 2>&1; then
  GITREV=$(git rev-parse --short HEAD 2>/dev/null || :)
  if [ -n "$GITREV" ]; then
    GITREVSTRING=" (git revision $GITREV)"
  fi
  unset GITREV
fi
# Abort early when the Maven launcher cannot be located; everything below
# depends on being able to run "$MVN".
if ! command -v "$MVN" > /dev/null 2>&1; then
  echo "Could not locate Maven command: '$MVN'." >&2
  echo "Specify the Maven command with the --mvn flag" >&2
  exit 1
fi
# Ask Maven for the effective versions so the distribution name and the
# RELEASE metadata match what is actually built. Maven's INFO/WARNING log
# lines are filtered out so only the evaluated value remains.
# NOTE: "$@" must be quoted so Maven options containing spaces survive.
VERSION=$("$MVN" help:evaluate -Dexpression=project.version "$@" 2>/dev/null\
    | grep -v "INFO"\
    | grep -v "WARNING"\
    | tail -n 1)
SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version "$@" 2>/dev/null\
    | grep -v "INFO"\
    | grep -v "WARNING"\
    | tail -n 1)
SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version "$@" 2>/dev/null\
    | grep -v "INFO"\
    | grep -v "WARNING"\
    | tail -n 1)
# Count occurrences of the active "hive" profile to detect Hive support.
SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive "$@" 2>/dev/null\
    | grep -v "INFO"\
    | grep -v "WARNING"\
    | fgrep --count "<id>hive</id>";\
    # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing
    # because we use "set -o pipefail"
    echo -n)
# Default the distribution name to the Hadoop version it was built against.
if [ "$NAME" = "none" ]; then
  NAME="$SPARK_HADOOP_VERSION"
fi

echo "Spark version is $VERSION"

if [ "$MAKE_TGZ" = "true" ]; then
  echo "Making spark-$VERSION-bin-$NAME.tgz"
else
  echo "Making distribution for Spark $VERSION in '$DISTDIR'..."
fi
# Build uber fat JAR
cd "$SPARK_HOME"

# Give Maven a larger heap / code cache unless the caller already tuned it.
export MAVEN_OPTS="${MAVEN_OPTS:--Xmx2g -XX:ReservedCodeCacheSize=1g}"

# Store the command as an array because $MVN variable might have spaces in it.
# Normal quoting tricks don't work.
# See: http://mywiki.wooledge.org/BashFAQ/050
# "$@" is quoted so Maven options containing spaces stay single arguments.
BUILD_COMMAND=("$MVN" clean package -DskipTests -Dmaven.test.skip=true "$@")

# Actually build the jar
echo -e "\nBuilding with..."
echo -e "\$ ${BUILD_COMMAND[*]}\n"

"${BUILD_COMMAND[@]}"
# Make directories: start from a clean dist/ and record build provenance.
rm -rf "$DISTDIR"
mkdir -p "$DISTDIR/jars"
echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"
# "$*" joins the remaining Maven options into a single line for the RELEASE file.
echo "Build flags: $*" >> "$DISTDIR/RELEASE"

# Copy jars
cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/"
# Only create the yarn directory if the yarn artifacts were built.
# NOTE(review): the glob inside `[ -f ... ]` assumes at most one matching
# jar; multiple matches would make `[` fail with "too many arguments".
if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then
  mkdir "$DISTDIR/yarn"
  cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn"
fi

# Only create and copy the dockerfiles directory if the kubernetes artifacts were built.
# `cp -a` preserves permissions and timestamps of the Docker build context.
if [ -d "$SPARK_HOME"/resource-managers/kubernetes/core/target/ ]; then
  mkdir -p "$DISTDIR/kubernetes/"
  cp -a "$SPARK_HOME"/resource-managers/kubernetes/docker/src/main/dockerfiles "$DISTDIR/kubernetes/"
  cp -a "$SPARK_HOME"/resource-managers/kubernetes/integration-tests/tests "$DISTDIR/kubernetes/"
fi
# Copy examples and dependencies
mkdir -p "$DISTDIR/examples/jars"
cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars"

# Drop example jars that already ship with the main Spark dependencies,
# so nothing is packaged twice.
for example_jar in "$DISTDIR"/examples/jars/*; do
  jar_name="${example_jar##*/}"
  if [ -f "$DISTDIR/jars/$jar_name" ]; then
    rm "$example_jar"
  fi
done

# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/"
# Ship the binary-distribution license/notice set when present; builds
# without it just skip this step.
if [[ -e "$SPARK_HOME/LICENSE-binary" ]]; then
  cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE"
  cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses"
  cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE"
else
  echo "Skipping copying LICENSE files"
fi

# Include CHANGES.txt when the tree carries one.
if [[ -e "$SPARK_HOME/CHANGES.txt" ]]; then
  cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR"
fi

# Copy data files
cp -r "$SPARK_HOME/data" "$DISTDIR"
# Build the pip source distribution when --pip was given. The work runs in
# a subshell so the working directory of the main script is untouched.
if [ "$MAKE_PIP" = "true" ]; then
  echo "Building python distribution package"
  (
    cd "$SPARK_HOME/python"
    # Stale egg-info can cache older setup metadata; remove it first.
    rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
    python3 setup.py sdist
  )
else
  echo "Skipping building python distribution package"
fi
# Make R package - this is used for both CRAN release and packing R layout into distribution
if [ "$MAKE_R" = "true" ]; then
  echo "Building R source package"
  # The "Version:" field's last whitespace-separated token is the version.
  R_PACKAGE_VERSION=$(awk '/Version/ {print $NF}' "$SPARK_HOME/R/pkg/DESCRIPTION")
  pushd "$SPARK_HOME/R" > /dev/null
  # Build source package and run full checks
  # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME
  NO_TESTS=1 "$SPARK_HOME/R/check-cran.sh"

  # Move R source package to match the Spark release version if the versions are not the same.
  # NOTE(shivaram): `mv` throws an error on Linux if source and destination are same file
  if [ "$R_PACKAGE_VERSION" != "$VERSION" ]; then
    mv "$SPARK_HOME/R/SparkR_$R_PACKAGE_VERSION.tar.gz" "$SPARK_HOME/R/SparkR_$VERSION.tar.gz"
  fi

  # Install source package to get it to generate vignettes rds files, etc.
  VERSION=$VERSION "$SPARK_HOME/R/install-source-package.sh"
  popd > /dev/null
else
  echo "Skipping building R source package"
fi
# Ship configuration templates, the README, launcher scripts and Python sources.
mkdir "$DISTDIR/conf"
cp "$SPARK_HOME"/conf/*.template "$DISTDIR/conf"
cp "$SPARK_HOME/README.md" "$DISTDIR"
for dist_subdir in bin python; do
  cp -r "$SPARK_HOME/$dist_subdir" "$DISTDIR"
done

# Remove the python distribution from dist/ if we built it
if [ "$MAKE_PIP" = "true" ]; then
  rm -f "$DISTDIR"/python/dist/pyspark-*.tar.gz
fi

cp -r "$SPARK_HOME/sbin" "$DISTDIR"

# Copy SparkR if it exists
if [ -d "$SPARK_HOME/R/lib/SparkR" ]; then
  mkdir -p "$DISTDIR/R/lib"
  cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR/R/lib"
  cp "$SPARK_HOME/R/lib/sparkr.zip" "$DISTDIR/R/lib"
fi
# Optionally pack the finished dist/ into spark-$VERSION-bin-$NAME.tgz.
# dist/ is copied to a versioned staging dir so the tarball's top-level
# directory carries the release name, then the staging dir is removed.
if [ "$MAKE_TGZ" = "true" ]; then
  TARDIR_NAME="spark-$VERSION-bin-$NAME"
  TARDIR="$SPARK_HOME/$TARDIR_NAME"
  rm -rf "$TARDIR"
  cp -r "$DISTDIR" "$TARDIR"
  tar czf "$TARDIR_NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME"
  rm -rf "$TARDIR"
fi