[SPARK-6406] Launch Spark using assembly jar instead of a separate launcher jar

Author: Nishkam Ravi <nravi@cloudera.com>
Author: nishkamravi2 <nishkamravi@gmail.com>
Author: nravi <nravi@c1704.halxg.cloudera.com>

Closes #5085 from nishkamravi2/master_nravi and squashes the following commits:

bad4349 [nishkamravi2] Update Main.java
36a6f87 [Nishkam Ravi] Minor changes and bug fixes
b7f4ae7 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
4a45d6a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
458af39 [Nishkam Ravi] Locate the jar using getLocation, obviates the need to pass assembly path as an argument
d9658d6 [Nishkam Ravi] Changes for SPARK-6406
ccdc334 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
3faa7a4 [Nishkam Ravi] Launcher library changes (SPARK-6406)
345206a [Nishkam Ravi] spark-class merge Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi
ac58975 [Nishkam Ravi] spark-class changes
06bfeb0 [nishkamravi2] Update spark-class
35af990 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
32c3ab3 [nishkamravi2] Update AbstractCommandBuilder.java
4bd4489 [nishkamravi2] Update AbstractCommandBuilder.java
746f35b [Nishkam Ravi] "hadoop" string in the assembly name should not be mandatory (everywhere else in spark we mandate spark-assembly*hadoop*.jar)
bfe96e0 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
ee902fa [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
d453197 [nishkamravi2] Update NewHadoopRDD.scala
6f41a1d [nishkamravi2] Update NewHadoopRDD.scala
0ce2c32 [nishkamravi2] Update HadoopRDD.scala
f7e33c2 [Nishkam Ravi] Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi
ba1eb8b [Nishkam Ravi] Try-catch block around the two occurrences of removeShutDownHook. Deletion of semi-redundant occurrences of expensive operation inShutDown.
71d0e17 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
494d8c0 [nishkamravi2] Update DiskBlockManager.scala
3c5ddba [nishkamravi2] Update DiskBlockManager.scala
f0d12de [Nishkam Ravi] Workaround for IllegalStateException caused by recent changes to BlockManager.stop
79ea8b4 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
b446edc [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
5c9a4cb [nishkamravi2] Update TaskSetManagerSuite.scala
535295a [nishkamravi2] Update TaskSetManager.scala
3e1b616 [Nishkam Ravi] Modify test for maxResultSize
9f6583e [Nishkam Ravi] Changes to maxResultSize code (improve error message and add condition to check if maxResultSize > 0)
5f8f9ed [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
636a9ff [nishkamravi2] Update YarnAllocator.scala
8f76c8b [Nishkam Ravi] Doc change for yarn memory overhead
35daa64 [Nishkam Ravi] Slight change in the doc for yarn memory overhead
5ac2ec1 [Nishkam Ravi] Remove out
dac1047 [Nishkam Ravi] Additional documentation for yarn memory overhead issue
42c2c3d [Nishkam Ravi] Additional changes for yarn memory overhead issue
362da5e [Nishkam Ravi] Additional changes for yarn memory overhead
c726bd9 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
f00fa31 [Nishkam Ravi] Improving logging for AM memoryOverhead
1cf2d1e [nishkamravi2] Update YarnAllocator.scala
ebcde10 [Nishkam Ravi] Modify default YARN memory_overhead-- from an additive constant to a multiplier (redone to resolve merge conflicts)
2e69f11 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
efd688a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark
2b630f9 [nravi] Accept memory input as "30g", "512M" instead of an int value, to be consistent with rest of Spark
3bf8fad [nravi] Merge branch 'master' of https://github.com/apache/spark
5423a03 [nravi] Merge branch 'master' of https://github.com/apache/spark
eb663ca [nravi] Merge branch 'master' of https://github.com/apache/spark
df2aeb1 [nravi] Improved fix for ConcurrentModificationIssue (Spark-1097, Hadoop-10456)
6b840f0 [nravi] Undo the fix for SPARK-1758 (the problem is fixed)
5108700 [nravi] Fix in Spark for the Concurrent thread modification issue (SPARK-1097, HADOOP-10456)
681b36f [nravi] Fix for SPARK-1758: failing test org.apache.spark.JavaAPISuite.wholeTextFiles
Authored by: Nishkam Ravi, 2015-03-29 12:40:37 +01:00
Committed by: Sean Owen
parent 55153f5c14
commit e3eb393961

4 changed files with 72 additions and 128 deletions

--- a/bin/spark-class
+++ b/bin/spark-class

@@ -40,35 +40,46 @@ else
   fi
 fi
 
-# Look for the launcher. In non-release mode, add the compiled classes directly to the classpath
-# instead of looking for a jar file.
-SPARK_LAUNCHER_CP=
-if [ -f $SPARK_HOME/RELEASE ]; then
-  LAUNCHER_DIR="$SPARK_HOME/lib"
-  num_jars="$(ls -1 "$LAUNCHER_DIR" | grep "^spark-launcher.*\.jar$" | wc -l)"
-  if [ "$num_jars" -eq "0" -a -z "$SPARK_LAUNCHER_CP" ]; then
-    echo "Failed to find Spark launcher in $LAUNCHER_DIR." 1>&2
-    echo "You need to build Spark before running this program." 1>&2
-    exit 1
-  fi
-  LAUNCHER_JARS="$(ls -1 "$LAUNCHER_DIR" | grep "^spark-launcher.*\.jar$" || true)"
-  if [ "$num_jars" -gt "1" ]; then
-    echo "Found multiple Spark launcher jars in $LAUNCHER_DIR:" 1>&2
-    echo "$LAUNCHER_JARS" 1>&2
-    echo "Please remove all but one jar." 1>&2
-    exit 1
-  fi
-  SPARK_LAUNCHER_CP="${LAUNCHER_DIR}/${LAUNCHER_JARS}"
-else
-  LAUNCHER_DIR="$SPARK_HOME/launcher/target/scala-$SPARK_SCALA_VERSION"
-  if [ ! -d "$LAUNCHER_DIR/classes" ]; then
-    echo "Failed to find Spark launcher classes in $LAUNCHER_DIR." 1>&2
-    echo "You need to build Spark before running this program." 1>&2
-    exit 1
-  fi
-  SPARK_LAUNCHER_CP="$LAUNCHER_DIR/classes"
-fi
+# Find assembly jar
+SPARK_ASSEMBLY_JAR=
+if [ -f "$SPARK_HOME/RELEASE" ]; then
+  ASSEMBLY_DIR="$SPARK_HOME/lib"
+else
+  ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION"
+fi
+
+num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)"
+if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then
+  echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2
+  echo "You need to build Spark before running this program." 1>&2
+  exit 1
+fi
+ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)"
+if [ "$num_jars" -gt "1" ]; then
+  echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2
+  echo "$ASSEMBLY_JARS" 1>&2
+  echo "Please remove all but one jar." 1>&2
+  exit 1
+fi
+SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
+
+# Verify that versions of java used to build the jars and run Spark are compatible
+if [ -n "$JAVA_HOME" ]; then
+  JAR_CMD="$JAVA_HOME/bin/jar"
+else
+  JAR_CMD="jar"
+fi
+
+if [ $(command -v "$JAR_CMD") ] ; then
+  jar_error_check=$("$JAR_CMD" -tf "$SPARK_ASSEMBLY_JAR" nonexistent/class/path 2>&1)
+  if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
+    echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
+    echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
+    echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
+    echo "or build Spark with Java 6." 1>&2
+    exit 1
+  fi
+fi
 
 # The launcher library will print arguments separated by a NULL character, to allow arguments with
@@ -77,7 +88,7 @@ fi
 CMD=()
 while IFS= read -d '' -r ARG; do
   CMD+=("$ARG")
-done < <("$RUNNER" -cp "$SPARK_LAUNCHER_CP" org.apache.spark.launcher.Main "$@")
+done < <("$RUNNER" -cp "$SPARK_ASSEMBLY_JAR" org.apache.spark.launcher.Main "$@")
 
 if [ "${CMD[0]}" = "usage" ]; then
   "${CMD[@]}"

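The `jar -tf "$SPARK_ASSEMBLY_JAR" nonexistent/class/path` probe above deliberately forces the jar tool to parse the archive's central directory; the Java-side check this PR removes from AbstractCommandBuilder did the same with java.util.jar.JarFile. A minimal standalone sketch of that probe (class name hypothetical, written Java 6-style to match the era):

import java.io.IOException;
import java.util.jar.JarFile;

// SPARK-1703 probe: a jar built with Java 7 (zip64 extensions) fails to open
// under Java 6 with an "invalid CEN header" IOException while the zip central
// directory is parsed. Usage: java JarCompatCheck <path-to-jar>
public final class JarCompatCheck {

  public static boolean isReadable(String jarPath) {
    JarFile jar = null;
    try {
      jar = new JarFile(jarPath);  // forces the central directory to be parsed
      return true;
    } catch (IOException ioe) {
      if (String.valueOf(ioe.getMessage()).contains("invalid CEN header")) {
        System.err.println("Jar likely built with Java 7 but read with Java 6 (SPARK-1703).");
      }
      return false;
    } finally {
      if (jar != null) {
        try { jar.close(); } catch (IOException ignored) { }
      }
    }
  }

  public static void main(String[] args) {
    System.exit(isReadable(args[0]) ? 0 : 1);
  }
}

Like the shell version, it reports failure through the exit code so callers can abort early.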
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd

@@ -29,31 +29,20 @@ if "x%1"=="x" (
   exit /b 1
 )
 
-set LAUNCHER_CP=0
-if exist %SPARK_HOME%\RELEASE goto find_release_launcher
-
-rem Look for the Spark launcher in both Scala build directories. The launcher doesn't use Scala so
-rem it doesn't really matter which one is picked up. Add the compiled classes directly to the
-rem classpath instead of looking for a jar file, since it's very common for people using sbt to use
-rem the "assembly" target instead of "package".
-set LAUNCHER_CLASSES=%SPARK_HOME%\launcher\target\scala-2.10\classes
-if exist %LAUNCHER_CLASSES% (
-  set LAUNCHER_CP=%LAUNCHER_CLASSES%
-)
-set LAUNCHER_CLASSES=%SPARK_HOME%\launcher\target\scala-2.11\classes
-if exist %LAUNCHER_CLASSES% (
-  set LAUNCHER_CP=%LAUNCHER_CLASSES%
-)
-goto check_launcher
-
-:find_release_launcher
-for %%d in (%SPARK_HOME%\lib\spark-launcher*.jar) do (
-  set LAUNCHER_CP=%%d
-)
-
-:check_launcher
-if "%LAUNCHER_CP%"=="0" (
-  echo Failed to find Spark launcher JAR.
+rem Find assembly jar
+set SPARK_ASSEMBLY_JAR=0
+
+if exist "%SPARK_HOME%\RELEASE" (
+  set ASSEMBLY_DIR=%SPARK_HOME%\lib
+) else (
+  set ASSEMBLY_DIR=%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%
+)
+
+for %%d in (%ASSEMBLY_DIR%\spark-assembly*hadoop*.jar) do (
+  set SPARK_ASSEMBLY_JAR=%%d
+)
+
+if "%SPARK_ASSEMBLY_JAR%"=="0" (
+  echo Failed to find Spark assembly JAR.
   echo You need to build Spark before running this program.
   exit /b 1
 )

@@ -64,7 +53,7 @@ if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
 
 rem The launcher library prints the command to be executed in a single line suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher into a variable.
-for /f "tokens=*" %%i in ('cmd /C ""%RUNNER%" -cp %LAUNCHER_CP% org.apache.spark.launcher.Main %*"') do (
+for /f "tokens=*" %%i in ('cmd /C ""%RUNNER%" -cp %SPARK_ASSEMBLY_JAR% org.apache.spark.launcher.Main %*"') do (
   set SPARK_CMD=%%i
 )
 %SPARK_CMD%

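Both scripts consume the output of org.apache.spark.launcher.Main, but with different framing: bash reads NUL-delimited tokens (`while IFS= read -d '' -r ARG`) so arguments may contain spaces or newlines, while the batch script expects one plain line it can hand to the interpreter. A minimal sketch of an emitter honoring both contracts (class name hypothetical; the real launcher also quotes each token for cmd.exe, elided here):

import java.util.Arrays;
import java.util.List;

// Emits a command either as one executable line (Windows batch) or as
// NUL-delimited tokens (bash), mirroring the two read loops in the scripts.
public final class EmitCommand {

  private static boolean isWindows() {
    return System.getProperty("os.name").toLowerCase().startsWith("windows");
  }

  public static void main(String[] args) {
    List<String> cmd = Arrays.asList(args);
    if (isWindows()) {
      StringBuilder line = new StringBuilder();
      for (String c : cmd) {
        if (line.length() > 0) line.append(' ');
        line.append(c);
      }
      System.out.println(line.toString());  // one line; cmd /C executes it
    } else {
      for (String c : cmd) {
        System.out.print(c);
        System.out.print('\0');  // read back with: while IFS= read -d '' -r ARG
      }
    }
  }
}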
--- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java

@@ -86,10 +86,14 @@ abstract class AbstractCommandBuilder {
    */
   List<String> buildJavaCommand(String extraClassPath) throws IOException {
     List<String> cmd = new ArrayList<String>();
-    if (javaHome == null) {
-      cmd.add(join(File.separator, System.getProperty("java.home"), "bin", "java"));
-    } else {
+    String envJavaHome;
+
+    if (javaHome != null) {
       cmd.add(join(File.separator, javaHome, "bin", "java"));
+    } else if ((envJavaHome = System.getenv("JAVA_HOME")) != null) {
+      cmd.add(join(File.separator, envJavaHome, "bin", "java"));
+    } else {
+      cmd.add(join(File.separator, System.getProperty("java.home"), "bin", "java"));
     }
 
     // Load extra JAVA_OPTS from conf/java-opts, if it exists.
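The new buildJavaCommand prefers an explicitly configured javaHome, then the JAVA_HOME environment variable, and only then falls back to the java.home system property of the running JVM. A standalone sketch of that resolution order (helper name hypothetical):

import java.io.File;

// Mirrors the fallback chain introduced above: explicit setting first,
// then the user's environment, then the running JVM's own location.
final class JavaResolver {

  static String javaExecutable(String explicitJavaHome) {
    String home = explicitJavaHome;            // e.g. from launcher configuration
    if (home == null) {
      home = System.getenv("JAVA_HOME");       // user's environment
    }
    if (home == null) {
      home = System.getProperty("java.home");  // always set by the JVM itself
    }
    return home + File.separator + "bin" + File.separator + "java";
  }
}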
@@ -182,59 +186,25 @@ abstract class AbstractCommandBuilder {
       addToClassPath(cp, String.format("%s/core/target/jars/*", sparkHome));
     }
 
-    String assembly = findAssembly();
+    final String assembly = AbstractCommandBuilder.class.getProtectionDomain().getCodeSource().
+      getLocation().getPath();
     addToClassPath(cp, assembly);
 
-    // When Hive support is needed, Datanucleus jars must be included on the classpath. Datanucleus
-    // jars do not work if only included in the uber jar as plugin.xml metadata is lost. Both sbt
-    // and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is built
-    // with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
-    // assembly is built for Hive, before actually populating the CLASSPATH with the jars.
-    //
-    // This block also serves as a check for SPARK-1703, when the assembly jar is built with
-    // Java 7 and ends up with too many files, causing issues with other JDK versions.
-    boolean needsDataNucleus = false;
-    JarFile assemblyJar = null;
-    try {
-      assemblyJar = new JarFile(assembly);
-      needsDataNucleus = assemblyJar.getEntry("org/apache/hadoop/hive/ql/exec/") != null;
-    } catch (IOException ioe) {
-      if (ioe.getMessage().indexOf("invalid CEN header") >= 0) {
-        System.err.println(
-          "Loading Spark jar failed.\n" +
-          "This is likely because Spark was compiled with Java 7 and run\n" +
-          "with Java 6 (see SPARK-1703). Please use Java 7 to run Spark\n" +
-          "or build Spark with Java 6.");
-        System.exit(1);
-      } else {
-        throw ioe;
-      }
-    } finally {
-      if (assemblyJar != null) {
-        try {
-          assemblyJar.close();
-        } catch (IOException e) {
-          // Ignore.
-        }
-      }
-    }
-
-    if (needsDataNucleus) {
-      System.err.println("Spark assembly has been built with Hive, including Datanucleus jars " +
-        "in classpath.");
-      File libdir;
-      if (new File(sparkHome, "RELEASE").isFile()) {
-        libdir = new File(sparkHome, "lib");
-      } else {
-        libdir = new File(sparkHome, "lib_managed/jars");
-      }
-
-      checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
-        libdir.getAbsolutePath());
-      for (File jar : libdir.listFiles()) {
-        if (jar.getName().startsWith("datanucleus-")) {
-          addToClassPath(cp, jar.getAbsolutePath());
-        }
-      }
-    }
+    // Datanucleus jars must be included on the classpath. Datanucleus jars do not work if only
+    // included in the uber jar as plugin.xml metadata is lost. Both sbt and maven will populate
+    // "lib_managed/jars/" with the datanucleus jars when Spark is built with Hive
+    File libdir;
+    if (new File(sparkHome, "RELEASE").isFile()) {
+      libdir = new File(sparkHome, "lib");
+    } else {
+      libdir = new File(sparkHome, "lib_managed/jars");
+    }
+
+    checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
+      libdir.getAbsolutePath());
+    for (File jar : libdir.listFiles()) {
+      if (jar.getName().startsWith("datanucleus-")) {
+        addToClassPath(cp, jar.getAbsolutePath());
+      }
+    }
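The getProtectionDomain().getCodeSource().getLocation() call is the heart of this PR: because the launcher classes now run from inside the assembly jar, a class can ask the class loader where it was loaded from and get the assembly's own path, so the path no longer has to be discovered or passed in. A minimal sketch (class name hypothetical; this version converts through toURI() first, which decodes URL-escaped characters that the raw getPath() in the diff would leave encoded):

import java.net.URISyntaxException;

// Jar self-location: ask the class loader where a class came from. Run from
// inside spark-assembly*.jar this prints the assembly's path; from an IDE or
// build tree it prints the classes directory instead.
public final class WhereAmI {

  public static String containingJarOrDir(Class<?> cls) throws URISyntaxException {
    return cls.getProtectionDomain().getCodeSource().getLocation().toURI().getPath();
  }

  public static void main(String[] args) throws URISyntaxException {
    System.out.println(containingJarOrDir(WhereAmI.class));
  }
}

Note that getCodeSource() can return null for classes loaded by the bootstrap class loader, so a production version should guard against that.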
@@ -270,7 +240,6 @@ abstract class AbstractCommandBuilder {
     if (scala != null) {
       return scala;
     }
-    String sparkHome = getSparkHome();
     File scala210 = new File(sparkHome, "assembly/target/scala-2.10");
     File scala211 = new File(sparkHome, "assembly/target/scala-2.11");
@@ -330,30 +299,6 @@ abstract class AbstractCommandBuilder {
     return firstNonEmpty(childEnv.get(key), System.getenv(key));
   }
 
-  private String findAssembly() {
-    String sparkHome = getSparkHome();
-    File libdir;
-    if (new File(sparkHome, "RELEASE").isFile()) {
-      libdir = new File(sparkHome, "lib");
-      checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
-        libdir.getAbsolutePath());
-    } else {
-      libdir = new File(sparkHome, String.format("assembly/target/scala-%s", getScalaVersion()));
-    }
-
-    final Pattern re = Pattern.compile("spark-assembly.*hadoop.*\\.jar");
-    FileFilter filter = new FileFilter() {
-      @Override
-      public boolean accept(File file) {
-        return file.isFile() && re.matcher(file.getName()).matches();
-      }
-    };
-    File[] assemblies = libdir.listFiles(filter);
-    checkState(assemblies != null && assemblies.length > 0, "No assemblies found in '%s'.", libdir);
-    checkState(assemblies.length == 1, "Multiple assemblies found in '%s'.", libdir);
-    return assemblies[0].getAbsolutePath();
-  }
-
   private String getConfDir() {
     String confDir = getenv("SPARK_CONF_DIR");
     return confDir != null ? confDir : join(File.separator, getSparkHome(), "conf");
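For contrast, the deleted findAssembly() located the jar from the outside, scanning a library directory for exactly one name matching spark-assembly*hadoop*.jar. A condensed standalone version of that scan (directory argument generalized), useful when the calling code does not itself run from inside the assembly:

import java.io.File;
import java.io.FileFilter;
import java.util.regex.Pattern;

// Directory scan as the old findAssembly() did it: require exactly one
// matching jar, failing loudly on zero or multiple candidates.
final class FindAssembly {

  static String findAssembly(File libdir) {
    final Pattern re = Pattern.compile("spark-assembly.*hadoop.*\\.jar");
    File[] assemblies = libdir.listFiles(new FileFilter() {
      @Override
      public boolean accept(File file) {
        return file.isFile() && re.matcher(file.getName()).matches();
      }
    });
    if (assemblies == null || assemblies.length == 0) {
      throw new IllegalStateException("No assemblies found in '" + libdir + "'.");
    }
    if (assemblies.length > 1) {
      throw new IllegalStateException("Multiple assemblies found in '" + libdir + "'.");
    }
    return assemblies[0].getAbsolutePath();
  }
}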

--- a/make-distribution.sh
+++ b/make-distribution.sh

@@ -199,7 +199,6 @@ echo "Build flags: $@" >> "$DISTDIR/RELEASE"
 
 # Copy jars
 cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
 cp "$SPARK_HOME"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
-cp "$SPARK_HOME"/launcher/target/spark-launcher_$SCALA_VERSION-$VERSION.jar "$DISTDIR/lib/"
 # This will fail if the -Pyarn profile is not provided
 # In this case, silence the error and ignore the return code of this command
 cp "$SPARK_HOME"/network/yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :