[SPARK-13579][BUILD] Stop building the main Spark assembly.

This change modifies the "assembly/" module to just copy needed
dependencies to its build directory, and modifies the packaging
script to pick those up (and to remove duplicate jars packaged in the
examples module).

I also made some minor dependency adjustments to remove some test jars
from the final packaging, and to drop jars that conflict with each other
when packaged separately (e.g. servlet-api).

Also note that this change restores Guava to applications' classpaths, even
though it is still shaded inside Spark. This is needed by the Hadoop
libraries that are packaged with Spark, which are no longer processed by
the shade plugin.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #11796 from vanzin/SPARK-13579.
Marcelo Vanzin 2016-04-04 16:52:21 -07:00 committed by Josh Rosen
parent 400b2f863f
commit 24d7d2e453
26 changed files with 231 additions and 319 deletions
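
For orientation, a rough sketch of the new layout follows (paths are taken from the changes below; the Scala version directory and the build profiles shown are illustrative, not required):

# Package Spark without building the uber assembly jar; the assembly module
# now just copies its runtime dependencies into a jars/ directory.
build/sbt -Pyarn -Phive assembly/package        # or: build/mvn -DskipTests package

# Development builds: the launcher scripts look for individual jars here
# instead of a single spark-assembly-*.jar.
ls assembly/target/scala-2.11/jars/

# Binary distributions (marked by a RELEASE file): jars now live under
# $SPARK_HOME/jars instead of $SPARK_HOME/lib.
ls "$SPARK_HOME"/jars/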


@ -33,9 +33,8 @@
<properties>
<sbt.project.name>assembly</sbt.project.name>
<spark.jar.dir>scala-${scala.binary.version}</spark.jar.dir>
<spark.jar.basename>spark-assembly-${project.version}-hadoop${hadoop.version}.jar</spark.jar.basename>
<spark.jar>${project.build.directory}/${spark.jar.dir}/${spark.jar.basename}</spark.jar>
<build.testJarPhase>none</build.testJarPhase>
<build.copyDependenciesPhase>package</build.copyDependenciesPhase>
</properties>
<dependencies>
@ -69,6 +68,17 @@
<artifactId>spark-repl_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<!--
Because we don't shade dependencies anymore, we need to restore Guava to compile scope so
that the libraries Spark depends on have it available. We'll package the version that Spark
uses (14.0.1), which is not the same as the one the Hadoop dependencies use, but it works.
-->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<scope>${hadoop.deps.scope}</scope>
</dependency>
</dependencies>
<build>
@ -87,75 +97,26 @@
<skip>true</skip>
</configuration>
</plugin>
<!-- zip pyspark archives to run python application on yarn mode -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
<configuration>
<target>
<delete dir="${basedir}/../python/lib/pyspark.zip"/>
<zip destfile="${basedir}/../python/lib/pyspark.zip">
<fileset dir="${basedir}/../python/" includes="pyspark/**/*"/>
</zip>
</target>
</configuration>
</plugin>
<!-- Use the shade plugin to create a big JAR with all the dependencies -->
<!-- zip pyspark archives to run python application on yarn mode -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<outputFile>${spark.jar}</outputFile>
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>org/datanucleus/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/services/org.apache.hadoop.fs.FileSystem</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
<resource>log4j.properties</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
</transformers>
</configuration>
</execution>
</executions>
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
<configuration>
<target>
<delete dir="${basedir}/../python/lib/pyspark.zip"/>
<zip destfile="${basedir}/../python/lib/pyspark.zip">
<fileset dir="${basedir}/../python/" includes="pyspark/**/*"/>
</zip>
</target>
</configuration>
</plugin>
</plugins>
</build>


@ -36,21 +36,20 @@ else
fi
# Find Spark jars.
# TODO: change the directory name when Spark jars move from "lib".
if [ -f "${SPARK_HOME}/RELEASE" ]; then
SPARK_JARS_DIR="${SPARK_HOME}/lib"
SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION"
SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
fi
if [ ! -d "$SPARK_JARS_DIR" ]; then
if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then
echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2
echo "You need to build Spark before running this program." 1>&2
exit 1
else
LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"
fi
LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"
# Add the launcher build dir to the classpath if requested.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"


@ -29,11 +29,10 @@ if "x%1"=="x" (
)
rem Find Spark jars.
rem TODO: change the directory name when Spark jars move from "lib".
if exist "%SPARK_HOME%\RELEASE" (
set SPARK_JARS_DIR="%SPARK_HOME%\lib"
set SPARK_JARS_DIR="%SPARK_HOME%\jars"
) else (
set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%"
set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%\jars"
)
if not exist "%SPARK_JARS_DIR%"\ (


@ -1121,9 +1121,9 @@ private[spark] object Utils extends Logging {
extraEnvironment: Map[String, String] = Map.empty,
redirectStderr: Boolean = true): String = {
val process = executeCommand(command, workingDir, extraEnvironment, redirectStderr)
val output = new StringBuffer
val output = new StringBuilder
val threadName = "read stdout for " + command(0)
def appendToOutput(s: String): Unit = output.append(s)
def appendToOutput(s: String): Unit = output.append(s).append("\n")
val stdoutThread = processStreamByLine(threadName, process.getInputStream, appendToOutput)
val exitCode = process.waitFor()
stdoutThread.join() // Wait for it to finish reading output


@ -201,24 +201,29 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging {
// Make sure only logging errors
val logger = Logger.getRootLogger
val oldLogLevel = logger.getLevel
logger.setLevel(Level.ERROR)
logger.addAppender(mockAppender)
try {
logger.addAppender(mockAppender)
val testOutputStream = new PipedOutputStream()
val testInputStream = new PipedInputStream(testOutputStream)
val testOutputStream = new PipedOutputStream()
val testInputStream = new PipedInputStream(testOutputStream)
// Close the stream before appender tries to read will cause an IOException
testInputStream.close()
testOutputStream.close()
val appender = FileAppender(testInputStream, testFile, new SparkConf)
// Close the stream before appender tries to read will cause an IOException
testInputStream.close()
testOutputStream.close()
val appender = FileAppender(testInputStream, testFile, new SparkConf)
appender.awaitTermination()
appender.awaitTermination()
// If InputStream was closed without first stopping the appender, an exception will be logged
verify(mockAppender, atLeast(1)).doAppend(loggingEventCaptor.capture)
val loggingEvent = loggingEventCaptor.getValue
assert(loggingEvent.getThrowableInformation !== null)
assert(loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
// If InputStream was closed without first stopping the appender, an exception will be logged
verify(mockAppender, atLeast(1)).doAppend(loggingEventCaptor.capture)
val loggingEvent = loggingEventCaptor.getValue
assert(loggingEvent.getThrowableInformation !== null)
assert(loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
} finally {
logger.setLevel(oldLogLevel)
}
}
test("file appender async close stream gracefully") {
@ -228,30 +233,35 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging {
// Make sure only logging errors
val logger = Logger.getRootLogger
val oldLogLevel = logger.getLevel
logger.setLevel(Level.ERROR)
logger.addAppender(mockAppender)
try {
logger.addAppender(mockAppender)
val testOutputStream = new PipedOutputStream()
val testInputStream = new PipedInputStream(testOutputStream) with LatchedInputStream
val testOutputStream = new PipedOutputStream()
val testInputStream = new PipedInputStream(testOutputStream) with LatchedInputStream
// Close the stream before appender tries to read will cause an IOException
testInputStream.close()
testOutputStream.close()
val appender = FileAppender(testInputStream, testFile, new SparkConf)
// Close the stream before appender tries to read will cause an IOException
testInputStream.close()
testOutputStream.close()
val appender = FileAppender(testInputStream, testFile, new SparkConf)
// Stop the appender before an IOException is called during read
testInputStream.latchReadStarted.await()
appender.stop()
testInputStream.latchReadProceed.countDown()
// Stop the appender before an IOException is called during read
testInputStream.latchReadStarted.await()
appender.stop()
testInputStream.latchReadProceed.countDown()
appender.awaitTermination()
appender.awaitTermination()
// Make sure no IOException errors have been logged as a result of appender closing gracefully
verify(mockAppender, atLeast(0)).doAppend(loggingEventCaptor.capture)
import scala.collection.JavaConverters._
loggingEventCaptor.getAllValues.asScala.foreach { loggingEvent =>
assert(loggingEvent.getThrowableInformation === null
|| !loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
// Make sure no IOException errors have been logged as a result of appender closing gracefully
verify(mockAppender, atLeast(0)).doAppend(loggingEventCaptor.capture)
import scala.collection.JavaConverters._
loggingEventCaptor.getAllValues.asScala.foreach { loggingEvent =>
assert(loggingEvent.getThrowableInformation === null
|| !loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
}
} finally {
logger.setLevel(oldLogLevel)
}
}


@ -12,7 +12,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
bonecp-0.8.0.RELEASE.jar
@ -61,6 +60,7 @@ grizzly-http-2.1.2.jar
grizzly-http-server-2.1.2.jar
grizzly-http-servlet-2.1.2.jar
grizzly-rcm-2.1.2.jar
guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.2.0.jar
@ -164,7 +164,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@ -177,7 +176,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar


@ -12,7 +12,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@ -56,6 +55,7 @@ eigenbase-properties-1.1.5.jar
geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.3.0.jar
@ -155,7 +155,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@ -168,7 +167,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar


@ -12,7 +12,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@ -56,6 +55,7 @@ eigenbase-properties-1.1.5.jar
geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.4.0.jar
@ -156,7 +156,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@ -169,7 +168,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar


@ -16,7 +16,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@ -61,6 +60,7 @@ geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
gson-2.2.4.jar
guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.6.0.jar
@ -162,7 +162,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@ -175,7 +174,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
xmlenc-0.52.jar


@ -16,7 +16,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@ -61,6 +60,7 @@ geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
gson-2.2.4.jar
guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.7.0.jar
@ -163,7 +163,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@ -176,7 +175,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
xmlenc-0.52.jar


@ -160,28 +160,35 @@ echo -e "\$ ${BUILD_COMMAND[@]}\n"
# Make directories
rm -rf "$DISTDIR"
mkdir -p "$DISTDIR/lib"
mkdir -p "$DISTDIR/jars"
echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"
echo "Build flags: $@" >> "$DISTDIR/RELEASE"
# Copy jars
cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
# This will fail if the -Pyarn profile is not provided
# In this case, silence the error and ignore the return code of this command
cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :
cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/"
# Only create the yarn directory if the yarn artifacts were built.
if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then
mkdir "$DISTDIR"/yarn
cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn"
fi
# Copy examples and dependencies
mkdir -p "$DISTDIR/examples/jars"
cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars"
# Deduplicate jars that have already been packaged as part of the main Spark dependencies.
for f in "$DISTDIR/examples/jars/"*; do
name=$(basename "$f")
if [ -f "$DISTDIR/jars/$name" ]; then
rm "$DISTDIR/examples/jars/$name"
fi
done
# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r "$SPARK_HOME"/examples/src/main "$DISTDIR/examples/src/"
if [ "$SPARK_HIVE" == "1" ]; then
cp "$SPARK_HOME"/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
fi
# Copy license and ASF files
cp "$SPARK_HOME/LICENSE" "$DISTDIR"
cp -r "$SPARK_HOME/licenses" "$DISTDIR"


@ -25,8 +25,8 @@ FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
cd "$FWDIR"
SPARK_PROFILES="-Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive"
TOOLS_CLASSPATH="$(build/sbt "export tools/fullClasspath" | tail -n1)"
OLD_DEPS_CLASSPATH="$(build/sbt $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)"
TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)"
OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)"
rm -f .generated-mima*
@ -36,7 +36,7 @@ java \
-cp "$TOOLS_CLASSPATH:$OLD_DEPS_CLASSPATH" \
org.apache.spark.tools.GenerateMIMAIgnore
echo -e "q\n" | build/sbt mimaReportBinaryIssues | grep -v -e "info.*Resolving"
echo -e "q\n" | build/sbt -DcopyDependencies=false "$@" mimaReportBinaryIssues | grep -v -e "info.*Resolving"
ret_val=$?
if [ $ret_val != 0 ]; then


@ -350,7 +350,7 @@ def build_spark_sbt(hadoop_version):
def build_spark_assembly_sbt(hadoop_version):
# Enable all of the profiles for the build:
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
sbt_goals = ["assembly/assembly"]
sbt_goals = ["assembly/package"]
profiles_and_goals = build_profiles + sbt_goals
print("[info] Building Spark assembly (w/Hive 1.2.1) using SBT with these arguments: ",
" ".join(profiles_and_goals))
@ -371,9 +371,10 @@ def build_apache_spark(build_tool, hadoop_version):
build_spark_sbt(hadoop_version)
def detect_binary_inop_with_mima():
def detect_binary_inop_with_mima(hadoop_version):
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
run_cmd([os.path.join(SPARK_HOME, "dev", "mima")] + build_profiles)
def run_scala_tests_maven(test_profiles):
@ -571,8 +572,8 @@ def main():
# backwards compatibility checks
if build_tool == "sbt":
# Note: compatibility tests only supported in sbt for now
detect_binary_inop_with_mima()
# Since we did not build assembly/assembly before running dev/mima, we need to
detect_binary_inop_with_mima(hadoop_version)
# Since we did not build assembly/package before running dev/mima, we need to
# do it here because the tests still rely on it; see SPARK-13294 for details.
build_spark_assembly_sbt(hadoop_version)
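
In other words, the sbt path of dev/run-tests now boils down to roughly the following two steps (the profile flags are examples; the real ones come from get_hadoop_profiles and the root module's build profile flags):

# MiMa is invoked with the Hadoop profiles passed through explicitly
dev/mima -Phadoop-2.6 -Pyarn -Phive -Phive-thriftserver

# The assembly module is packaged afterwards, since some tests still rely on it
build/sbt -Phadoop-2.6 -Pyarn -Phive -Phive-thriftserver assembly/package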


@ -1687,12 +1687,7 @@ on all of the worker nodes, as they will need access to the Hive serialization a
(SerDes) in order to access data stored in Hive.
Configuration of Hive is done by placing your `hive-site.xml`, `core-site.xml` (for security configuration),
`hdfs-site.xml` (for HDFS configuration) file in `conf/`. Please note when running
the query on a YARN cluster (`cluster` mode), the `datanucleus` jars under the `lib` directory
and `hive-site.xml` under `conf/` directory need to be available on the driver and all executors launched by the
YARN cluster. The convenient way to do this is adding them through the `--jars` option and `--file` option of the
`spark-submit` command.
`hdfs-site.xml` (for HDFS configuration) file in `conf/`.
<div class="codetabs">


@ -27,13 +27,16 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-examples_2.11</artifactId>
<properties>
<sbt.project.name>examples</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Project Examples</name>
<url>http://spark.apache.org/</url>
<properties>
<sbt.project.name>examples</sbt.project.name>
<build.testJarPhase>none</build.testJarPhase>
<build.copyDependenciesPhase>package</build.copyDependenciesPhase>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
@ -75,23 +78,6 @@
<artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-testing-util</artifactId>
<version>${hbase.version}</version>
<scope>${hbase.deps.scope}</scope>
<exclusions>
<exclusion>
<!-- SPARK-4455 -->
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>org.jruby</groupId>
<artifactId>jruby-complete</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-protocol</artifactId>
@ -139,6 +125,10 @@
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
@ -208,13 +198,6 @@
<version>${hbase.version}</version>
<scope>${hbase.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-hadoop-compat</artifactId>
<version>${hbase.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
@ -294,17 +277,6 @@
<artifactId>scopt_${scala.binary.version}</artifactId>
<version>3.3.0</version>
</dependency>
<!--
The following dependencies are already present in the Spark assembly, so we want to force
them to be provided.
-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
@ -325,38 +297,6 @@
<skip>true</skip>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>prepare-test-jar</id>
<phase>none</phase>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
<configuration>
<outputDirectory>${jars.target.dir}</outputDirectory>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<includeScope>runtime</includeScope>
<outputDirectory>${jars.target.dir}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<profiles>


@ -144,10 +144,26 @@ abstract class AbstractCommandBuilder {
boolean isTesting = "1".equals(getenv("SPARK_TESTING"));
if (prependClasses || isTesting) {
String scala = getScalaVersion();
List<String> projects = Arrays.asList("core", "repl", "mllib", "graphx",
"streaming", "tools", "sql/catalyst", "sql/core", "sql/hive", "sql/hive-thriftserver",
"yarn", "launcher",
"common/network-common", "common/network-shuffle", "common/network-yarn");
List<String> projects = Arrays.asList(
"common/network-common",
"common/network-shuffle",
"common/network-yarn",
"common/sketch",
"common/tags",
"common/unsafe",
"core",
"examples",
"graphx",
"launcher",
"mllib",
"repl",
"sql/catalyst",
"sql/core",
"sql/hive",
"sql/hive-thriftserver",
"streaming",
"yarn"
);
if (prependClasses) {
if (!isTesting) {
System.err.println(
@ -174,31 +190,12 @@ abstract class AbstractCommandBuilder {
// Add Spark jars to the classpath. For the testing case, we rely on the test code to set and
// propagate the test classpath appropriately. For normal invocation, look for the jars
// directory under SPARK_HOME.
String jarsDir = findJarsDir(getSparkHome(), getScalaVersion(), !isTesting);
boolean isTestingSql = "1".equals(getenv("SPARK_SQL_TESTING"));
String jarsDir = findJarsDir(getSparkHome(), getScalaVersion(), !isTesting && !isTestingSql);
if (jarsDir != null) {
addToClassPath(cp, join(File.separator, jarsDir, "*"));
}
// Datanucleus jars must be included on the classpath. Datanucleus jars do not work if only
// included in the uber jar as plugin.xml metadata is lost. Both sbt and maven will populate
// "lib_managed/jars/" with the datanucleus jars when Spark is built with Hive
File libdir;
if (new File(sparkHome, "RELEASE").isFile()) {
libdir = new File(sparkHome, "lib");
} else {
libdir = new File(sparkHome, "lib_managed/jars");
}
if (libdir.isDirectory()) {
for (File jar : libdir.listFiles()) {
if (jar.getName().startsWith("datanucleus-")) {
addToClassPath(cp, jar.getAbsolutePath());
}
}
} else {
checkState(isTesting, "Library directory '%s' does not exist.", libdir.getAbsolutePath());
}
addToClassPath(cp, getenv("HADOOP_CONF_DIR"));
addToClassPath(cp, getenv("YARN_CONF_DIR"));
addToClassPath(cp, getenv("SPARK_DIST_CLASSPATH"));


@ -358,12 +358,12 @@ class CommandBuilderUtils {
// TODO: change to the correct directory once the assembly build is changed.
File libdir;
if (new File(sparkHome, "RELEASE").isFile()) {
libdir = new File(sparkHome, "lib");
libdir = new File(sparkHome, "jars");
checkState(!failIfNotFound || libdir.isDirectory(),
"Library directory '%s' does not exist.",
libdir.getAbsolutePath());
} else {
libdir = new File(sparkHome, String.format("assembly/target/scala-%s", scalaVersion));
libdir = new File(sparkHome, String.format("assembly/target/scala-%s/jars", scalaVersion));
if (!libdir.isDirectory()) {
checkState(!failIfNotFound,
"Library directory '%s' does not exist; make sure Spark is built.",


@ -336,6 +336,7 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
}
private List<String> findExamplesJars() {
boolean isTesting = "1".equals(getenv("SPARK_TESTING"));
List<String> examplesJars = new ArrayList<>();
String sparkHome = getSparkHome();
@ -346,11 +347,15 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
jarsDir = new File(sparkHome,
String.format("examples/target/scala-%s/jars", getScalaVersion()));
}
checkState(jarsDir.isDirectory(), "Examples jars directory '%s' does not exist.",
boolean foundDir = jarsDir.isDirectory();
checkState(isTesting || foundDir, "Examples jars directory '%s' does not exist.",
jarsDir.getAbsolutePath());
for (File f: jarsDir.listFiles()) {
examplesJars.add(f.getAbsolutePath());
if (foundDir) {
for (File f: jarsDir.listFiles()) {
examplesJars.add(f.getAbsolutePath());
}
}
return examplesJars;
}

pom.xml

@ -185,6 +185,10 @@
<!-- Modules that copy jars to the build directory should do so under this location. -->
<jars.target.dir>${project.build.directory}/scala-${scala.binary.version}/jars</jars.target.dir>
<!-- Allow modules to enable / disable certain build plugins easily. -->
<build.testJarPhase>prepare-package</build.testJarPhase>
<build.copyDependenciesPhase>none</build.copyDependenciesPhase>
<!--
Dependency scopes that can be overridden by enabling certain profiles. These profiles are
declared in the projects that build assemblies.
@ -237,15 +241,6 @@
</pluginRepository>
</pluginRepositories>
<dependencies>
<!--
This is a dummy dependency that is used along with the shading plug-in
to create effective poms on publishing (see SPARK-3812).
-->
<dependency>
<groupId>org.spark-project.spark</groupId>
<artifactId>unused</artifactId>
<version>1.0.0</version>
</dependency>
<!--
This is needed by the scalatest plugin, and so is declared here to be available in
all child modules, just as scalatest is run in all children
@ -833,6 +828,14 @@
</exclusion>
</exclusions>
</dependency>
<!-- avro-mapred for some reason depends on avro-ipc's test jar, so undo that. -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-ipc</artifactId>
<classifier>tests</classifier>
<version>${avro.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
@ -1521,6 +1524,10 @@
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
@ -1916,6 +1923,7 @@
-->
<SPARK_DIST_CLASSPATH>${test_classpath}</SPARK_DIST_CLASSPATH>
<SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES>
<SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION>
<SPARK_TESTING>1</SPARK_TESTING>
<JAVA_HOME>${test.java.home}</JAVA_HOME>
</environmentVariables>
@ -1964,6 +1972,7 @@
-->
<SPARK_DIST_CLASSPATH>${test_classpath}</SPARK_DIST_CLASSPATH>
<SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES>
<SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION>
<SPARK_TESTING>1</SPARK_TESTING>
<JAVA_HOME>${test.java.home}</JAVA_HOME>
</environmentVariables>
@ -2146,6 +2155,7 @@
<version>2.10</version>
<executions>
<execution>
<id>generate-test-classpath</id>
<phase>test-compile</phase>
<goals>
<goal>build-classpath</goal>
@ -2155,6 +2165,17 @@
<outputProperty>test_classpath</outputProperty>
</configuration>
</execution>
<execution>
<id>copy-module-dependencies</id>
<phase>${build.copyDependenciesPhase}</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<includeScope>runtime</includeScope>
<outputDirectory>${jars.target.dir}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
@ -2169,9 +2190,6 @@
<shadedArtifactAttached>false</shadedArtifactAttached>
<artifactSet>
<includes>
<!-- At a minimum we must include this to force effective pom generation -->
<include>org.spark-project.spark:unused</include>
<include>org.eclipse.jetty:jetty-io</include>
<include>org.eclipse.jetty:jetty-http</include>
<include>org.eclipse.jetty:jetty-continuation</include>
@ -2302,7 +2320,7 @@
<executions>
<execution>
<id>prepare-test-jar</id>
<phase>prepare-package</phase>
<phase>${build.testJarPhase}</phase>
<goals>
<goal>test-jar</goal>
</goals>


@ -57,11 +57,12 @@ object BuildCommons {
Seq("yarn", "java8-tests", "ganglia-lgpl", "streaming-kinesis-asl",
"docker-integration-tests").map(ProjectRef(buildLocation, _))
val assemblyProjects@Seq(assembly, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKinesisAslAssembly) =
Seq("assembly", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-kinesis-asl-assembly")
val assemblyProjects@Seq(networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKinesisAslAssembly) =
Seq("network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-kinesis-asl-assembly")
.map(ProjectRef(buildLocation, _))
val copyJarsProjects@Seq(examples) = Seq("examples").map(ProjectRef(buildLocation, _))
val copyJarsProjects@Seq(assembly, examples) = Seq("assembly", "examples")
.map(ProjectRef(buildLocation, _))
val tools = ProjectRef(buildLocation, "tools")
// Root project.
@ -263,8 +264,14 @@ object SparkBuild extends PomBuild {
/* Unsafe settings */
enable(Unsafe.settings)(unsafe)
/* Set up tasks to copy dependencies during packaging. */
copyJarsProjects.foreach(enable(CopyDependencies.settings))
/*
* Set up tasks to copy dependencies during packaging. This step can be disabled in the command
* line, so that dev/mima can run without trying to copy these files again and potentially
* causing issues.
*/
if (!"false".equals(System.getProperty("copyDependencies"))) {
copyJarsProjects.foreach(enable(CopyDependencies.settings))
}
/* Enable Assembly for all assembly projects */
assemblyProjects.foreach(enable(Assembly.settings))
@ -477,8 +484,6 @@ object Assembly {
val hadoopVersion = taskKey[String]("The version of hadoop that spark is compiled against.")
val deployDatanucleusJars = taskKey[Unit]("Deploy datanucleus jars to the spark/lib_managed/jars directory")
lazy val settings = assemblySettings ++ Seq(
test in assembly := {},
hadoopVersion := {
@ -497,27 +502,13 @@ object Assembly {
s"${mName}-test-${v}.jar"
},
mergeStrategy in assembly := {
case PathList("org", "datanucleus", xs @ _*) => MergeStrategy.discard
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard
case "log4j.properties" => MergeStrategy.discard
case m if m.toLowerCase.startsWith("meta-inf/services/") => MergeStrategy.filterDistinctLines
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
},
deployDatanucleusJars := {
val jars: Seq[File] = (fullClasspath in assembly).value.map(_.data)
.filter(_.getPath.contains("org.datanucleus"))
var libManagedJars = new File(BuildCommons.sparkHome, "lib_managed/jars")
libManagedJars.mkdirs()
jars.foreach { jar =>
val dest = new File(libManagedJars, jar.getName)
if (!dest.exists()) {
Files.copy(jar.toPath, dest.toPath)
}
}
},
assembly <<= assembly.dependsOn(deployDatanucleusJars)
}
)
}
@ -698,6 +689,13 @@ object Java8TestSettings {
object TestSettings {
import BuildCommons._
private val scalaBinaryVersion =
if (System.getProperty("scala-2.10") == "true") {
"2.10"
} else {
"2.11"
}
lazy val settings = Seq (
// Fork new JVMs for tests and set Java options for those
fork := true,
@ -707,6 +705,7 @@ object TestSettings {
"SPARK_DIST_CLASSPATH" ->
(fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"),
"SPARK_PREPEND_CLASSES" -> "1",
"SPARK_SCALA_VERSION" -> scalaBinaryVersion,
"SPARK_TESTING" -> "1",
"JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))),
javaOptions in Test += s"-Djava.io.tmpdir=$testTempDir",
@ -744,7 +743,7 @@ object TestSettings {
// Make sure the test temp directory exists.
resourceGenerators in Test <+= resourceManaged in Test map { outDir: File =>
if (!new File(testTempDir).isDirectory()) {
require(new File(testTempDir).mkdirs())
require(new File(testTempDir).mkdirs(), s"Error creating temp directory $testTempDir.")
}
Seq[File]()
},
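
The copyDependencies switch added above is read as a plain JVM system property, so a build that should skip the dependency-copy step can be invoked the same way dev/mima does (a sketch; the goal shown is just an example):

# Skip copying dependency jars into assembly/target/.../jars and examples/target/.../jars
build/sbt -DcopyDependencies=false package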


@ -1482,7 +1482,7 @@ def search_kafka_assembly_jar():
raise Exception(
("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) +
"You need to build Spark with "
"'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or "
"'build/sbt assembly/package streaming-kafka-assembly/assembly' or "
"'build/mvn package' before running this test.")
elif len(jars) > 1:
raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please "
@ -1548,7 +1548,7 @@ if __name__ == "__main__":
elif are_kinesis_tests_enabled is False:
sys.stderr.write("Skipping all Kinesis Python tests as the optional Kinesis project was "
"not compiled into a JAR. To run these tests, "
"you need to build Spark with 'build/sbt -Pkinesis-asl assembly/assembly "
"you need to build Spark with 'build/sbt -Pkinesis-asl assembly/package "
"streaming-kinesis-asl-assembly/assembly' or "
"'build/mvn -Pkinesis-asl package' before running this test.")
else:
@ -1556,7 +1556,7 @@ if __name__ == "__main__":
("Failed to find Spark Streaming Kinesis assembly jar in %s. "
% kinesis_asl_assembly_dir) +
"You need to build Spark with 'build/sbt -Pkinesis-asl "
"assembly/assembly streaming-kinesis-asl-assembly/assembly'"
"assembly/package streaming-kinesis-asl-assembly/assembly'"
"or 'build/mvn -Pkinesis-asl package' before running this test.")
sys.stderr.write("Running tests: %s \n" % (str(testcases)))


@ -53,11 +53,25 @@ LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log")
FAILURE_REPORTING_LOCK = Lock()
LOGGER = logging.getLogger()
# Find out where the assembly jars are located.
for scala in ["2.11", "2.10"]:
build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala)
if os.path.isdir(build_dir):
SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*")
break
else:
raise Exception("Cannot find assembly build directory, please build Spark first.")
def run_individual_python_test(test_name, pyspark_python):
env = dict(os.environ)
env.update({'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python),
'PYSPARK_DRIVER_PYTHON': which(pyspark_python)})
env.update({
'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
'SPARK_TESTING': '1',
'SPARK_PREPEND_CLASSES': '1',
'PYSPARK_PYTHON': which(pyspark_python),
'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
})
LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
start_time = time.time()
try:


@ -763,11 +763,15 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl
extraEnvironment = Map(
// Disables SPARK_TESTING to exclude log4j.properties in test directories.
"SPARK_TESTING" -> "0",
// But set SPARK_SQL_TESTING to make spark-class happy.
"SPARK_SQL_TESTING" -> "1",
// Points SPARK_PID_DIR to SPARK_HOME, otherwise only 1 Thrift server instance can be
// started at a time, which is not Jenkins friendly.
"SPARK_PID_DIR" -> pidDir.getCanonicalPath),
redirectStderr = true)
logInfo(s"COMMAND: $command")
logInfo(s"OUTPUT: $lines")
lines.split("\n").collectFirst {
case line if line.contains(LOG_FILE_MARK) => new File(line.drop(LOG_FILE_MARK.length))
}.getOrElse {


@ -225,30 +225,6 @@
<argLine>-da -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m</argLine>
</configuration>
</plugin>
<!-- Deploy datanucleus jars to the spark/lib_managed/jars directory -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<!-- basedir is spark/sql/hive/ -->
<outputDirectory>${basedir}/../../lib_managed/jars</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>false</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
<includeGroupIds>org.datanucleus</includeGroupIds>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>


@ -447,9 +447,6 @@ private[spark] class Client(
*
* Note that the archive cannot be a "local" URI. If none of the above settings are found,
* then upload all files found in $SPARK_HOME/jars.
*
* TODO: currently the code looks in $SPARK_HOME/lib while the work to replace assemblies
* with a directory full of jars is ongoing.
*/
val sparkArchive = sparkConf.get(SPARK_ARCHIVE)
if (sparkArchive.isDefined) {


@ -273,7 +273,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll
test("distribute local spark jars") {
val temp = Utils.createTempDir()
val jarsDir = new File(temp, "lib")
val jarsDir = new File(temp, "jars")
assert(jarsDir.mkdir())
val jar = TestUtils.createJarWithFiles(Map(), jarsDir)
new FileOutputStream(new File(temp, "RELEASE")).close()