SPARK-1314: Use SPARK_HIVE to determine if we include Hive in packaging
Previously, we based our decision about whether to include the datanucleus jars on the existence of a spark-hive-assembly jar, which was incidentally built whenever "sbt assembly" was run. This meant that a typical and previously supported pathway would start using hive jars. This patch has the following features/bug fixes:
- Use of SPARK_HIVE (default false) to determine if we should include Hive in the assembly jar.
- Analogous feature in Maven with -Phive (previously, there was no support for adding Hive to any of our jars produced by Maven).
- assemble-deps fixed, since we no longer use a different ASSEMBLY_DIR.
- Avoid adding the log message in compute-classpath.sh to the classpath :)

Still TODO before mergeable:
- We need to download the datanucleus jars outside of sbt. Perhaps we can have spark-class download them if SPARK_HIVE is set, similar to how sbt downloads itself.
- Spark SQL documentation updates.

Author: Aaron Davidson <aaron@databricks.com>

Closes #237 from aarondav/master and squashes the following commits:

5dc4329 [Aaron Davidson] Typo fixes
dd4f298 [Aaron Davidson] Doc update
dd1a365 [Aaron Davidson] Eliminate need for SPARK_HIVE at runtime by d/ling datanucleus from Maven
a9269b5 [Aaron Davidson] [WIP] Use SPARK_HIVE to determine if we include Hive in packaging
This commit is contained in:
parent 7ce52c4a7a
commit 4106558435
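
For reference, the two opt-in build paths this patch introduces can be exercised roughly as follows (an illustrative sketch: the sbt command is the one documented in the Spark SQL guide change below, while the Maven goals shown are only an assumed typical invocation of the new -Phive profile):

    # sbt: Hive is excluded unless SPARK_HIVE=true is set (default false)
    SPARK_HIVE=true sbt/sbt assembly/assembly

    # Maven: the new hive profile adds spark-hive to the build (goals here are illustrative)
    mvn -Phive -DskipTests clean package

With either route, the datanucleus jars end up under lib_managed/jars/, and compute-classpath.sh adds them to the classpath only when it detects that the assembly was actually built with Hive.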
assembly/pom.xml
@@ -163,6 +163,16 @@
       </dependency>
     </dependencies>
   </profile>
+  <profile>
+    <id>hive</id>
+    <dependencies>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-hive_${scala.binary.version}</artifactId>
+        <version>${project.version}</version>
+      </dependency>
+    </dependencies>
+  </profile>
   <profile>
     <id>spark-ganglia-lgpl</id>
     <dependencies>
bin/compute-classpath.sh
@@ -30,21 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

-# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
-# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
-# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
-# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
-# the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
-
-  # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
-  DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
-  CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
-fi
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"

 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then

@@ -59,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory

@@ -71,6 +57,23 @@ else
   CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi

+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
+if [ $num_datanucleus_jars -gt 0 ]; then
+  AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
+  num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
+  if [ $num_hive_files -gt 0 ]; then
+    echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
+    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
+    CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+  fi
+fi
+
 # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
 if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"
bin/spark-class
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi

 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-
dev/create-release/create-release.sh
@@ -49,14 +49,14 @@ mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl \
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   -Dtag=$GIT_TAG -DautoVersionSubmodules=true \
   --batch-mode release:prepare

 mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl\
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   release:perform

 rm -rf spark
docs/sql-programming-guide.md
@@ -264,8 +264,8 @@ evaluated by the SQL execution engine. A full list of the functions supported c

 Spark SQL also supports reading and writing data stored in [Apache Hive](http://hive.apache.org/).
 However, since Hive has a large number of dependencies, it is not included in the default Spark assembly.
-In order to use Hive you must first run '`SPARK_HIVE=true sbt/sbt assembly/assembly`'. This command builds a new assembly
-jar that includes Hive. Note that this Hive assembly jar must also be present
+In order to use Hive you must first run '`SPARK_HIVE=true sbt/sbt assembly/assembly`' (or use `-Phive` for maven).
+This command builds a new assembly jar that includes Hive. Note that this Hive assembly jar must also be present
 on all of the worker nodes, as they will need access to the Hive serialization and deserialization libraries
 (SerDes) in order to acccess data stored in Hive.

pom.xml (7 changed lines)
@@ -377,7 +377,6 @@
         <groupId>org.apache.derby</groupId>
         <artifactId>derby</artifactId>
         <version>10.4.2.0</version>
-        <scope>test</scope>
       </dependency>
       <dependency>
         <groupId>net.liftweb</groupId>

@@ -580,6 +579,12 @@
           </exclusion>
         </exclusions>
       </dependency>
+      <dependency>
+        <!-- Matches the version of jackson-core-asl pulled in by avro -->
+        <groupId>org.codehaus.jackson</groupId>
+        <artifactId>jackson-mapper-asl</artifactId>
+        <version>1.8.8</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>

project/SparkBuild.scala
@@ -43,6 +43,8 @@ object SparkBuild extends Build {

   val DEFAULT_YARN = false

+  val DEFAULT_HIVE = false
+
   // HBase version; set as appropriate.
   val HBASE_VERSION = "0.94.6"


@@ -67,15 +69,17 @@ object SparkBuild extends Build {

   lazy val sql = Project("sql", file("sql/core"), settings = sqlCoreSettings) dependsOn(core, catalyst)

-  // Since hive is its own assembly, it depends on all of the modules.
-  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql, graphx, bagel, mllib, streaming, repl)
+  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql)
+
+  lazy val maybeHive: Seq[ClasspathDependency] = if (isHiveEnabled) Seq(hive) else Seq()
+  lazy val maybeHiveRef: Seq[ProjectReference] = if (isHiveEnabled) Seq(hive) else Seq()

   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn(core)

   lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core)

   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
-    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*)
+    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)

   lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")


@@ -101,6 +105,11 @@ object SparkBuild extends Build {
   lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client"
   val maybeAvro = if (hadoopVersion.startsWith("0.23.") && isYarnEnabled) Seq("org.apache.avro" % "avro" % "1.7.4") else Seq()

+  lazy val isHiveEnabled = Properties.envOrNone("SPARK_HIVE") match {
+    case None => DEFAULT_HIVE
+    case Some(v) => v.toBoolean
+  }
+
   // Include Ganglia integration if the user has enabled Ganglia
   // This is isolated from the normal build due to LGPL-licensed code in the library
   lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined

@@ -141,13 +150,13 @@ object SparkBuild extends Build {
   lazy val allExternalRefs = Seq[ProjectReference](externalTwitter, externalKafka, externalFlume, externalZeromq, externalMqtt)

   lazy val examples = Project("examples", file("examples"), settings = examplesSettings)
-    .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter, hive) dependsOn(allExternal: _*)
+    .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*)

   // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects
-  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeGangliaRef
+  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef

   lazy val allProjects = packageProjects ++ allExternalRefs ++
-    Seq[ProjectReference](examples, tools, assemblyProj, hive) ++ maybeJava8Tests
+    Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests

   def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq(
     organization := "org.apache.spark",

@@ -417,10 +426,8 @@ object SparkBuild extends Build {

-  // Since we don't include hive in the main assembly this project also acts as an alternative
-  // assembly jar.
-  def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
+  def hiveSettings = sharedSettings ++ Seq(
     name := "spark-hive",
     jarName in assembly <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
     jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" },
     javaOptions += "-XX:MaxPermSize=1g",
     libraryDependencies ++= Seq(
       "org.apache.hive" % "hive-metastore" % hiveVersion,
sql/hive/pom.xml
@@ -63,6 +63,10 @@
       <artifactId>hive-exec</artifactId>
       <version>${hive.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.codehaus.jackson</groupId>
+      <artifactId>jackson-mapper-asl</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.hive</groupId>
       <artifactId>hive-serde</artifactId>

@@ -87,6 +91,30 @@
         <groupId>org.scalatest</groupId>
         <artifactId>scalatest-maven-plugin</artifactId>
       </plugin>
+
+      <!-- Deploy datanucleus jars to the spark/lib_managed/jars directory -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <version>2.4</version>
+        <executions>
+          <execution>
+            <id>copy-dependencies</id>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-dependencies</goal>
+            </goals>
+            <configuration>
+              <!-- basedir is spark/sql/hive/ -->
+              <outputDirectory>${basedir}/../../lib_managed/jars</outputDirectory>
+              <overWriteReleases>false</overWriteReleases>
+              <overWriteSnapshots>false</overWriteSnapshots>
+              <overWriteIfNewer>true</overWriteIfNewer>
+              <includeGroupIds>org.datanucleus</includeGroupIds>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
   </build>
 </project>