#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Script to create a binary distribution for easy deploys of Spark.
# The distribution directory defaults to dist/ but can be overridden below.
# The distribution contains fat (assembly) jars that include the Scala library,
# so it is completely self contained.
# It does not contain source or *.class files.
#
# Optional Arguments
#      --tgz: Additionally creates spark-$VERSION-bin.tar.gz
#      --hadoop VERSION: Builds against specified version of Hadoop.
#      --with-yarn: Enables support for Hadoop YARN.
#      --with-hive: Enable support for reading Hive tables.
#      --with-tachyon: Enable support for Tachyon.
#      --skip-java-test: Skip the check that the build JDK is Java 6.
#      --name: A moniker for the release target. Defaults to the Hadoop version.
#
# Recommended deploy/testing procedure (standalone mode):
# 1) Rsync / deploy the dist/ dir to one host
# 2) cd to deploy dir; ./sbin/start-master.sh
# 3) Verify master is up by visiting web page, ie http://master-ip:8080. Note the spark:// URL.
# 4) ./sbin/start-slave.sh 1 <<spark:// URL>>
# 5) ./bin/spark-shell --master spark://my-master-ip:7077
#
set -o pipefail
# Abort on the first failing command. Combined with pipefail above, a
# pipeline's status reflects any failing stage, not just the last one.
set -e

# Figure out where the Spark framework is installed: the directory that
# contains this script, resolved to an absolute path via cd + pwd.
FWDIR="$(cd "$(dirname "$0")"; pwd)"
DISTDIR="$FWDIR/dist"
# Initialize defaults
SPARK_HADOOP_VERSION=1.0.4
SPARK_YARN=false
SPARK_HIVE=false
SPARK_TACHYON=false
MAKE_TGZ=false
NAME=none

# Parse arguments. Flags that take a value (--hadoop, --name) consume the
# following word with an extra 'shift'. Unrecognized arguments are silently
# ignored (no *) default arm), matching historical behavior.
while (( "$#" )); do
  case $1 in
    --hadoop)
      SPARK_HADOOP_VERSION="$2"
      shift
      ;;
    --with-yarn)
      SPARK_YARN=true
      ;;
    --with-hive)
      SPARK_HIVE=true
      ;;
    --skip-java-test)
      # Suppresses the interactive JDK-6 sanity prompt further below.
      SKIP_JAVA_TEST=true
      ;;
    --with-tachyon)
      SPARK_TACHYON=true
      ;;
    --tgz)
      MAKE_TGZ=true
      ;;
    --name)
      NAME="$2"
      shift
      ;;
  esac
  shift
done
# A JDK is required both for the Maven build and for the version probe below.
if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set, cannot proceed."
  exit -1
fi

# Ask Maven for the project version; drop its [INFO] log lines and keep the
# last remaining line (the bare version string).
# NOTE(review): under 'set -e' + pipefail a failing mvn aborts the script at
# the assignment itself, so the friendlier message below is rarely reached.
VERSION=$(mvn help:evaluate -Dexpression=project.version 2>/dev/null | grep -v "INFO" | tail -n 1)
if [ $? != 0 ]; then
    echo -e "You need Maven installed to build Spark."
    echo -e "Download Maven from https://maven.apache.org/"
    exit -1;
fi

JAVA_CMD="$JAVA_HOME"/bin/java
JAVA_VERSION=$("$JAVA_CMD" -version 2>&1)
# Warn interactively when building with a JDK other than 6: artifacts built
# by newer JDKs will not run on Java 6 (see SPARK-1703 and SPARK-1911).
# The prompt can be bypassed with --skip-java-test.
if [[ ! "$JAVA_VERSION" =~ "1.6" && -z "$SKIP_JAVA_TEST" ]]; then
  echo "***NOTE***: JAVA_HOME is not set to a JDK 6 installation. The resulting"
  echo "            distribution may not work well with PySpark and will not run"
  echo "            with Java 6 (See SPARK-1703 and SPARK-1911)."
  echo "            This test can be disabled by adding --skip-java-test."
  echo "Output from 'java -version' was:"
  echo "$JAVA_VERSION"
  read -p "Would you like to continue anyways? [y,n]: " -r
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Okay, exiting."
    exit 1
  fi
fi

# Default the release moniker to the Hadoop version unless --name was given.
if [ "$NAME" == "none" ]; then
  NAME=$SPARK_HADOOP_VERSION
fi
# Report the effective build configuration before the (long) Maven build.
echo "Spark version is $VERSION"

if [ "$MAKE_TGZ" == "true" ]; then
  echo "Making spark-$VERSION-bin-$NAME.tgz"
else
  echo "Making distribution for Spark $VERSION in $DISTDIR..."
fi

echo "Hadoop version set to $SPARK_HADOOP_VERSION"
echo "Release name set to $NAME"
if [ "$SPARK_YARN" == "true" ]; then
  echo "YARN enabled"
else
  echo "YARN disabled"
fi

if [ "$SPARK_TACHYON" == "true" ]; then
  echo "Tachyon Enabled"
else
  echo "Tachyon Disabled"
fi
# Build uber fat JAR
cd "$FWDIR"

export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"

BUILD_COMMAND="mvn clean package"

# Use special profiles for hadoop versions 0.23.x, 2.2.x, 2.3.x, 2.4.x
if [[ "$SPARK_HADOOP_VERSION" =~ ^0\.23\. ]]; then BUILD_COMMAND="$BUILD_COMMAND -Phadoop-0.23"; fi
if [[ "$SPARK_HADOOP_VERSION" =~ ^2\.2\. ]]; then BUILD_COMMAND="$BUILD_COMMAND -Phadoop-2.2"; fi
if [[ "$SPARK_HADOOP_VERSION" =~ ^2\.3\. ]]; then BUILD_COMMAND="$BUILD_COMMAND -Phadoop-2.3"; fi
if [[ "$SPARK_HADOOP_VERSION" =~ ^2\.4\. ]]; then BUILD_COMMAND="$BUILD_COMMAND -Phadoop-2.4"; fi
if [[ "$SPARK_HIVE" == "true" ]]; then BUILD_COMMAND="$BUILD_COMMAND -Phive"; fi
if [[ "$SPARK_YARN" == "true" ]]; then
  # For hadoop versions 0.23.x to 2.1.x, use the yarn-alpha profile
  if [[ "$SPARK_HADOOP_VERSION" =~ ^0\.2[3-9]\. ]] ||
     [[ "$SPARK_HADOOP_VERSION" =~ ^0\.[3-9][0-9]\. ]] ||
     [[ "$SPARK_HADOOP_VERSION" =~ ^1\.[0-9]\. ]] ||
     [[ "$SPARK_HADOOP_VERSION" =~ ^2\.[0-1]\. ]]; then
    BUILD_COMMAND="$BUILD_COMMAND -Pyarn-alpha"
  # For hadoop versions 2.2+, use the yarn profile
  elif [[ "$SPARK_HADOOP_VERSION" =~ ^2.[2-9]. ]]; then
    BUILD_COMMAND="$BUILD_COMMAND -Pyarn"
  fi
  BUILD_COMMAND="$BUILD_COMMAND -Dyarn.version=$SPARK_HADOOP_VERSION"
fi
BUILD_COMMAND="$BUILD_COMMAND -Dhadoop.version=$SPARK_HADOOP_VERSION"
BUILD_COMMAND="$BUILD_COMMAND -DskipTests"

# Actually build the jar
echo -e "\nBuilding with..."
echo -e "\$ $BUILD_COMMAND\n"
# Deliberately unquoted: the accumulated string must word-split into the
# mvn command and its arguments.
${BUILD_COMMAND}
# Make directories: start from a clean dist/ tree every run.
rm -rf "$DISTDIR"
mkdir -p "$DISTDIR/lib"
echo "Spark $VERSION built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"

# Copy jars (globs must stay unquoted to expand).
cp $FWDIR/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
cp $FWDIR/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"

# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r $FWDIR/examples/src/main "$DISTDIR/examples/src/"

# Hive support needs the datanucleus jars on the distribution classpath.
if [ "$SPARK_HIVE" == "true" ]; then
  cp $FWDIR/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
fi

# Copy license and ASF files
cp "$FWDIR/LICENSE" "$DISTDIR"
cp "$FWDIR/NOTICE" "$DISTDIR"
# CHANGES.txt only exists in release tarballs, not in git checkouts.
if [ -e "$FWDIR/CHANGES.txt" ]; then
  cp "$FWDIR/CHANGES.txt" "$DISTDIR"
fi

# Copy other things
mkdir "$DISTDIR"/conf
cp "$FWDIR"/conf/*.template "$DISTDIR"/conf
cp "$FWDIR"/conf/slaves "$DISTDIR"/conf
cp "$FWDIR/README.md" "$DISTDIR"
cp -r "$FWDIR/bin" "$DISTDIR"
cp -r "$FWDIR/python" "$DISTDIR"
cp -r "$FWDIR/sbin" "$DISTDIR"
cp -r "$FWDIR/ec2" "$DISTDIR"
2013-07-15 21:13:17 -04:00
2014-03-19 01:04:57 -04:00
# Download and copy in tachyon, if requested
if [ " $SPARK_TACHYON " = = "true" ] ; then
TACHYON_VERSION = "0.4.1"
TACHYON_URL = " https://github.com/amplab/tachyon/releases/download/v ${ TACHYON_VERSION } /tachyon- ${ TACHYON_VERSION } -bin.tar.gz "
2014-03-28 01:45:00 -04:00
TMPD = ` mktemp -d 2>/dev/null || mktemp -d -t 'disttmp' `
2014-03-19 01:04:57 -04:00
pushd $TMPD > /dev/null
2014-05-07 17:24:49 -04:00
echo "Fetching tachyon tgz"
2014-03-19 01:04:57 -04:00
wget " $TACHYON_URL "
tar xf " tachyon- ${ TACHYON_VERSION } -bin.tar.gz "
2014-04-23 13:19:32 -04:00
cp " tachyon- ${ TACHYON_VERSION } /target/tachyon- ${ TACHYON_VERSION } -jar-with-dependencies.jar " " $DISTDIR /lib "
2014-03-19 01:04:57 -04:00
mkdir -p " $DISTDIR /tachyon/src/main/java/tachyon/web "
cp -r " tachyon- ${ TACHYON_VERSION } " /{ bin,conf,libexec} " $DISTDIR /tachyon "
cp -r " tachyon- ${ TACHYON_VERSION } " /src/main/java/tachyon/web/resources " $DISTDIR /tachyon/src/main/java/tachyon/web "
2014-03-28 01:45:00 -04:00
if [ [ ` uname -a` = = Darwin* ] ] ; then
2014-03-28 16:33:35 -04:00
# need to run sed differently on osx
2014-04-23 13:19:32 -04:00
nl = $'\n' ; sed -i "" -e " s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\\ $nl export TACHYON_JAR=\$TACHYON_HOME/../lib/\1| " " $DISTDIR /tachyon/libexec/tachyon-config.sh "
2014-03-28 01:45:00 -04:00
else
2014-04-23 13:19:32 -04:00
sed -i "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\n export TACHYON_JAR=\$TACHYON_HOME/../lib/\1|" " $DISTDIR /tachyon/libexec/tachyon-config.sh "
2014-03-28 01:45:00 -04:00
fi
2014-03-19 01:04:57 -04:00
popd > /dev/null
rm -rf $TMPD
fi
# Optionally package the dist/ tree as spark-$VERSION-bin-$NAME.tgz. The
# tree is copied to a version-named staging dir so the tarball's top-level
# directory carries the release name, then the staging copy is removed.
if [ "$MAKE_TGZ" == "true" ]; then
  TARDIR_NAME=spark-$VERSION-bin-$NAME
  TARDIR="$FWDIR/$TARDIR_NAME"
  rm -rf "$TARDIR"
  cp -r "$DISTDIR" "$TARDIR"
  tar czf "spark-$VERSION-bin-$NAME.tgz" -C "$FWDIR" "$TARDIR_NAME"
  rm -rf "$TARDIR"
fi