#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Script to create a binary distribution for easy deploys of Spark.
# The distribution directory defaults to dist/ but can be overridden below.
# The distribution contains fat (assembly) jars that include the Scala library,
# so it is completely self contained.
# It does not contain source or *.class files.

# Fail a pipeline if any stage fails, abort on the first unhandled error,
# and trace every command for easier debugging of release builds.
set -o pipefail
set -e
set -x
# Figure out where the Spark framework is installed
SPARK_HOME="$(cd "$(dirname "$0")"; pwd)"
DISTDIR="$SPARK_HOME/dist"

# Tachyon is only bundled into the distribution when --with-tachyon is passed.
SPARK_TACHYON=false
TACHYON_VERSION="0.6.4"
TACHYON_TGZ="tachyon-${TACHYON_VERSION}-bin.tar.gz"
TACHYON_URL="https://github.com/amplab/tachyon/releases/download/v${TACHYON_VERSION}/${TACHYON_TGZ}"

MAKE_TGZ=false
NAME=none
# Default to the Maven wrapper bundled with Spark; override with --mvn.
MVN="$SPARK_HOME/build/mvn"
# Print usage information and terminate with a non-zero status.
# Outputs: usage text on stdout. Exits: 1 (always).
function exit_with_usage {
  echo "make-distribution.sh - tool for making binary distributions of Spark"
  echo ""
  echo "usage:"
  cl_options="[--name] [--tgz] [--mvn <mvn-command>] [--with-tachyon]"
  echo "./make-distribution.sh $cl_options <maven build options>"
  echo "See Spark's \"Building Spark\" doc for correct Maven options."
  echo ""
  exit 1
}
# Parse arguments. Unrecognized options terminate the loop and are passed
# through to Maven as build options (see the $@ uses further down).
while (( "$#" )); do
  case $1 in
    --hadoop)
      # Removed flag: Hadoop/YARN versions are now selected via Maven profiles.
      echo "Error: '--hadoop' is no longer supported:"
      echo "Error: use Maven profiles and options -Dhadoop.version and -Dyarn.version instead."
      echo "Error: Related profiles include hadoop-2.2, hadoop-2.3 and hadoop-2.4."
      exit_with_usage
      ;;
    --with-yarn)
      echo "Error: '--with-yarn' is no longer supported, use Maven option -Pyarn"
      exit_with_usage
      ;;
    --with-hive)
      echo "Error: '--with-hive' is no longer supported, use Maven options -Phive and -Phive-thriftserver"
      exit_with_usage
      ;;
    --skip-java-test)
      # Skip the interactive JDK 6 sanity prompt below.
      SKIP_JAVA_TEST=true
      ;;
    --with-tachyon)
      SPARK_TACHYON=true
      ;;
    --tgz)
      MAKE_TGZ=true
      ;;
    --mvn)
      MVN="$2"
      shift
      ;;
    --name)
      NAME="$2"
      shift
      ;;
    --help)
      exit_with_usage
      ;;
    *)
      break
      ;;
  esac
  shift
done
if [ -z "$JAVA_HOME" ]; then
  # Fall back on JAVA_HOME from rpm, if found
  if command -v rpm >/dev/null; then
    # rpm echoes the macro name back verbatim when %java_home is undefined.
    RPM_JAVA_HOME="$(rpm -E %java_home 2>/dev/null)"
    if [ "$RPM_JAVA_HOME" != "%java_home" ]; then
      JAVA_HOME="$RPM_JAVA_HOME"
      echo "No JAVA_HOME set, proceeding with '$JAVA_HOME' learned from rpm"
    fi
  fi
fi

if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set, cannot proceed."
  exit -1
fi
# Record the short git revision (if building from a checkout) for the RELEASE
# file; the "|| :" keeps "set -e" from aborting outside a git work tree.
if command -v git >/dev/null; then
  GITREV=$(git rev-parse --short HEAD 2>/dev/null || :)
  if [ -n "$GITREV" ]; then
    GITREVSTRING=" (git revision $GITREV)"
  fi
  unset GITREV
fi
# Bail out early if the Maven command (wrapper or --mvn override) is missing.
if ! command -v "$MVN" >/dev/null; then
  echo -e "Could not locate Maven command: '$MVN'."
  echo -e "Specify the Maven command with the --mvn flag"
  exit -1;
fi
# Ask Maven for the effective build properties; user-supplied build options
# ("$@") may change profiles/versions, so they are forwarded to each query.
# help:evaluate prints the value as the last non-INFO line.
VERSION=$("$MVN" help:evaluate -Dexpression=project.version "$@" 2>/dev/null | grep -v "INFO" | tail -n 1)
SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version "$@" 2>/dev/null \
  | grep -v "INFO" \
  | tail -n 1)
SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version "$@" 2>/dev/null \
  | grep -v "INFO" \
  | tail -n 1)
# Count occurrences of the hive profile id. fgrep exits non-zero when the
# count is 0, which would kill the script under "set -o pipefail", so force a
# zero status with "|| :" (the count itself has already been printed).
SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive "$@" 2>/dev/null \
  | grep -v "INFO" \
  | fgrep --count "<id>hive</id>" || :)
# Warn (interactively) when not building with JDK 6, since artifacts built
# with a newer JDK will not run on Java 6 (SPARK-1703, SPARK-1911).
JAVA_CMD="$JAVA_HOME"/bin/java
JAVA_VERSION=$("$JAVA_CMD" -version 2>&1)
if [[ ! "$JAVA_VERSION" =~ "1.6" && -z "$SKIP_JAVA_TEST" ]]; then
  echo "***NOTE***: JAVA_HOME is not set to a JDK 6 installation. The resulting"
  echo "            distribution may not work well with PySpark and will not run"
  echo "            with Java 6 (See SPARK-1703 and SPARK-1911)."
  echo "            This test can be disabled by adding --skip-java-test."
  echo "Output from 'java -version' was:"
  echo "$JAVA_VERSION"
  read -p "Would you like to continue anyways? [y,n]: " -r
  if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
    echo "Okay, exiting."
    exit 1
  fi
fi
# Default the distribution name to the Hadoop version when --name wasn't given.
if [ "$NAME" == "none" ]; then
  NAME=$SPARK_HADOOP_VERSION
fi

echo "Spark version is $VERSION"

if [ "$MAKE_TGZ" == "true" ]; then
  echo "Making spark-$VERSION-bin-$NAME.tgz"
else
  echo "Making distribution for Spark $VERSION in $DISTDIR..."
fi

if [ "$SPARK_TACHYON" == "true" ]; then
  echo "Tachyon Enabled"
else
  echo "Tachyon Disabled"
fi
# Build uber fat JAR
cd "$SPARK_HOME"

export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"

# Store the command as an array because $MVN variable might have spaces in it.
# Normal quoting tricks don't work.
# See: http://mywiki.wooledge.org/BashFAQ/050
BUILD_COMMAND=("$MVN" clean package -DskipTests "$@")

# Actually build the jar
echo -e "\nBuilding with..."
echo -e "\$ ${BUILD_COMMAND[@]}\n"

"${BUILD_COMMAND[@]}"
# Make directories
rm -rf "$DISTDIR"
mkdir -p "$DISTDIR/lib"
echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"
echo "Build flags: $@" >> "$DISTDIR/RELEASE"

# Copy jars
cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
cp "$SPARK_HOME"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
# This will fail if the -Pyarn profile is not provided
# In this case, silence the error and ignore the return code of this command
cp "$SPARK_HOME"/network/yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :

# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r "$SPARK_HOME"/examples/src/main "$DISTDIR/examples/src/"

# Datanucleus jars are required at runtime when Hive support is compiled in.
if [ "$SPARK_HIVE" == "1" ]; then
  cp "$SPARK_HOME"/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
fi

# Copy license and ASF files
cp "$SPARK_HOME/LICENSE" "$DISTDIR"
cp "$SPARK_HOME/NOTICE" "$DISTDIR"

# CHANGES.txt only exists in release tarballs, not in git checkouts.
if [ -e "$SPARK_HOME"/CHANGES.txt ]; then
  cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR"
fi

# Copy data files
cp -r "$SPARK_HOME/data" "$DISTDIR"

# Copy other things
mkdir "$DISTDIR"/conf
cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf
cp "$SPARK_HOME/README.md" "$DISTDIR"
cp -r "$SPARK_HOME/bin" "$DISTDIR"
cp -r "$SPARK_HOME/python" "$DISTDIR"
cp -r "$SPARK_HOME/sbin" "$DISTDIR"
cp -r "$SPARK_HOME/ec2" "$DISTDIR"
# Download and copy in tachyon, if requested
if [ "$SPARK_TACHYON" == "true" ]; then
  # The -t fallback covers BSD/macOS mktemp, which requires a template.
  TMPD=$(mktemp -d 2>/dev/null || mktemp -d -t 'disttmp')

  pushd "$TMPD" > /dev/null
  echo "Fetching tachyon tgz"

  # Download to a .part file first so an interrupted transfer never leaves a
  # truncated tarball under the final name.
  TACHYON_DL="${TACHYON_TGZ}.part"
  if command -v curl >/dev/null; then
    curl --silent -k -L "${TACHYON_URL}" > "${TACHYON_DL}" && mv "${TACHYON_DL}" "${TACHYON_TGZ}"
  elif command -v wget >/dev/null; then
    wget --quiet "${TACHYON_URL}" -O "${TACHYON_DL}" && mv "${TACHYON_DL}" "${TACHYON_TGZ}"
  else
    printf "You do not have curl or wget installed. please install Tachyon manually.\n"
    exit -1
  fi

  tar xzf "${TACHYON_TGZ}"
  cp "tachyon-${TACHYON_VERSION}/core/target/tachyon-${TACHYON_VERSION}-jar-with-dependencies.jar" "$DISTDIR/lib"
  mkdir -p "$DISTDIR/tachyon/src/main/java/tachyon/web"
  cp -r "tachyon-${TACHYON_VERSION}"/{bin,conf,libexec} "$DISTDIR/tachyon"
  cp -r "tachyon-${TACHYON_VERSION}"/core/src/main/java/tachyon/web "$DISTDIR/tachyon/src/main/java/tachyon/web"

  # Point TACHYON_JAR at the relocated jar under lib/ instead of target/.
  if [[ $(uname -a) == Darwin* ]]; then
    # need to run sed differently on osx: BSD sed takes -i '' and cannot embed
    # \n in the replacement, so splice in a literal newline via $nl.
    nl=$'\n'; sed -i "" -e "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\\$nl export TACHYON_JAR=\$TACHYON_HOME/../lib/\1|" "$DISTDIR/tachyon/libexec/tachyon-config.sh"
  else
    sed -i "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\n export TACHYON_JAR=\$TACHYON_HOME/../lib/\1|" "$DISTDIR/tachyon/libexec/tachyon-config.sh"
  fi

  popd > /dev/null
  rm -rf "$TMPD"
fi
# Optionally roll the dist/ directory into a versioned tarball.
if [ "$MAKE_TGZ" == "true" ]; then
  TARDIR_NAME=spark-$VERSION-bin-$NAME
  TARDIR="$SPARK_HOME/$TARDIR_NAME"
  rm -rf "$TARDIR"
  # Stage a copy under the final directory name so the tarball's top-level
  # entry is spark-$VERSION-bin-$NAME rather than dist/.
  cp -r "$DISTDIR" "$TARDIR"
  tar czf "spark-$VERSION-bin-$NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME"
  rm -rf "$TARDIR"
fi