[SPARK-6406] Launch Spark using assembly jar instead of a separate launcher jar
Author: Nishkam Ravi <nravi@cloudera.com>
Author: nishkamravi2 <nishkamravi@gmail.com>
Author: nravi <nravi@c1704.halxg.cloudera.com>

Closes #5085 from nishkamravi2/master_nravi and squashes the following commits:

bad4349 [nishkamravi2] Update Main.java
36a6f87 [Nishkam Ravi] Minor changes and bug fixes
b7f4ae7 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
4a45d6a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
458af39 [Nishkam Ravi] Locate the jar using getLocation, obviates the need to pass assembly path as an argument
d9658d6 [Nishkam Ravi] Changes for SPARK-6406
ccdc334 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
3faa7a4 [Nishkam Ravi] Launcher library changes (SPARK-6406)
345206a [Nishkam Ravi] spark-class merge Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi
ac58975 [Nishkam Ravi] spark-class changes
06bfeb0 [nishkamravi2] Update spark-class
35af990 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
32c3ab3 [nishkamravi2] Update AbstractCommandBuilder.java
4bd4489 [nishkamravi2] Update AbstractCommandBuilder.java
746f35b [Nishkam Ravi] "hadoop" string in the assembly name should not be mandatory (everywhere else in spark we mandate spark-assembly*hadoop*.jar)
bfe96e0 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
ee902fa [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
d453197 [nishkamravi2] Update NewHadoopRDD.scala
6f41a1d [nishkamravi2] Update NewHadoopRDD.scala
0ce2c32 [nishkamravi2] Update HadoopRDD.scala
f7e33c2 [Nishkam Ravi] Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi
ba1eb8b [Nishkam Ravi] Try-catch block around the two occurrences of removeShutDownHook. Deletion of semi-redundant occurrences of expensive operation inShutDown.
71d0e17 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
494d8c0 [nishkamravi2] Update DiskBlockManager.scala
3c5ddba [nishkamravi2] Update DiskBlockManager.scala
f0d12de [Nishkam Ravi] Workaround for IllegalStateException caused by recent changes to BlockManager.stop
79ea8b4 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
b446edc [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
5c9a4cb [nishkamravi2] Update TaskSetManagerSuite.scala
535295a [nishkamravi2] Update TaskSetManager.scala
3e1b616 [Nishkam Ravi] Modify test for maxResultSize
9f6583e [Nishkam Ravi] Changes to maxResultSize code (improve error message and add condition to check if maxResultSize > 0)
5f8f9ed [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
636a9ff [nishkamravi2] Update YarnAllocator.scala
8f76c8b [Nishkam Ravi] Doc change for yarn memory overhead
35daa64 [Nishkam Ravi] Slight change in the doc for yarn memory overhead
5ac2ec1 [Nishkam Ravi] Remove out
dac1047 [Nishkam Ravi] Additional documentation for yarn memory overhead issue
42c2c3d [Nishkam Ravi] Additional changes for yarn memory overhead issue
362da5e [Nishkam Ravi] Additional changes for yarn memory overhead
c726bd9 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
f00fa31 [Nishkam Ravi] Improving logging for AM memoryOverhead
1cf2d1e [nishkamravi2] Update YarnAllocator.scala
ebcde10 [Nishkam Ravi] Modify default YARN memory_overhead-- from an additive constant to a multiplier (redone to resolve merge conflicts)
2e69f11 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi
efd688a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark
2b630f9 [nravi] Accept memory input as "30g", "512M" instead of an int value, to be consistent with rest of Spark
3bf8fad [nravi] Merge branch 'master' of https://github.com/apache/spark
5423a03 [nravi] Merge branch 'master' of https://github.com/apache/spark
eb663ca [nravi] Merge branch 'master' of https://github.com/apache/spark
df2aeb1 [nravi] Improved fix for ConcurrentModificationIssue (Spark-1097, Hadoop-10456)
6b840f0 [nravi] Undo the fix for SPARK-1758 (the problem is fixed)
5108700 [nravi] Fix in Spark for the Concurrent thread modification issue (SPARK-1097, HADOOP-10456)
681b36f [nravi] Fix for SPARK-1758: failing test org.apache.spark.JavaAPISuite.wholeTextFiles
This commit is contained in:
parent 55153f5c14
commit e3eb393961
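The key launcher-library change (commit 458af39 above) is to stop searching lib/ for a separate spark-launcher jar and instead resolve the jar that org.apache.spark.launcher.Main was itself loaded from, since the launcher classes now ship inside the assembly jar. A minimal, standalone sketch of that lookup follows; the class name is hypothetical and this is not the actual AbstractCommandBuilder code:

    public class JarLocator {

        // Path of the jar (or classes directory) that a class was loaded from -- the same
        // getProtectionDomain().getCodeSource().getLocation() idea used in the diff below.
        // Note: getCodeSource() can return null for JVM bootstrap classes, and this sketch
        // goes through toURI(), whereas the patched code calls getPath() on the URL directly.
        static String locationOf(Class<?> clazz) throws java.net.URISyntaxException {
            return clazz.getProtectionDomain().getCodeSource().getLocation().toURI().getPath();
        }

        public static void main(String[] args) throws java.net.URISyntaxException {
            // Prints the jar path when loaded from a jar, or the classes directory otherwise.
            System.out.println(locationOf(JarLocator.class));
        }
    }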
@@ -40,35 +40,46 @@ else
fi
fi

# Look for the launcher. In non-release mode, add the compiled classes directly to the classpath
# instead of looking for a jar file.
SPARK_LAUNCHER_CP=
if [ -f $SPARK_HOME/RELEASE ]; then
LAUNCHER_DIR="$SPARK_HOME/lib"
num_jars="$(ls -1 "$LAUNCHER_DIR" | grep "^spark-launcher.*\.jar$" | wc -l)"
if [ "$num_jars" -eq "0" -a -z "$SPARK_LAUNCHER_CP" ]; then
echo "Failed to find Spark launcher in $LAUNCHER_DIR." 1>&2
echo "You need to build Spark before running this program." 1>&2
exit 1
fi

LAUNCHER_JARS="$(ls -1 "$LAUNCHER_DIR" | grep "^spark-launcher.*\.jar$" || true)"
if [ "$num_jars" -gt "1" ]; then
echo "Found multiple Spark launcher jars in $LAUNCHER_DIR:" 1>&2
echo "$LAUNCHER_JARS" 1>&2
echo "Please remove all but one jar." 1>&2
exit 1
fi

SPARK_LAUNCHER_CP="${LAUNCHER_DIR}/${LAUNCHER_JARS}"
# Find assembly jar
SPARK_ASSEMBLY_JAR=
if [ -f "$SPARK_HOME/RELEASE" ]; then
ASSEMBLY_DIR="$SPARK_HOME/lib"
else
LAUNCHER_DIR="$SPARK_HOME/launcher/target/scala-$SPARK_SCALA_VERSION"
if [ ! -d "$LAUNCHER_DIR/classes" ]; then
echo "Failed to find Spark launcher classes in $LAUNCHER_DIR." 1>&2
echo "You need to build Spark before running this program." 1>&2
ASSEMBLY_DIR="$SPARK_HOME/assembly/target/scala-$SPARK_SCALA_VERSION"
fi

num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" | wc -l)"
if [ "$num_jars" -eq "0" -a -z "$SPARK_ASSEMBLY_JAR" ]; then
echo "Failed to find Spark assembly in $ASSEMBLY_DIR." 1>&2
echo "You need to build Spark before running this program." 1>&2
exit 1
fi
ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^spark-assembly.*hadoop.*\.jar$" || true)"
if [ "$num_jars" -gt "1" ]; then
echo "Found multiple Spark assembly jars in $ASSEMBLY_DIR:" 1>&2
echo "$ASSEMBLY_JARS" 1>&2
echo "Please remove all but one jar." 1>&2
exit 1
fi

SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"

# Verify that versions of java used to build the jars and run Spark are compatible
if [ -n "$JAVA_HOME" ]; then
JAR_CMD="$JAVA_HOME/bin/jar"
else
JAR_CMD="jar"
fi

if [ $(command -v "$JAR_CMD") ] ; then
jar_error_check=$("$JAR_CMD" -tf "$SPARK_ASSEMBLY_JAR" nonexistent/class/path 2>&1)
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
echo "or build Spark with Java 6." 1>&2
exit 1
fi
SPARK_LAUNCHER_CP="$LAUNCHER_DIR/classes"
fi

# The launcher library will print arguments separated by a NULL character, to allow arguments with

@@ -77,7 +88,7 @@ fi
CMD=()
while IFS= read -d '' -r ARG; do
CMD+=("$ARG")
done < <("$RUNNER" -cp "$SPARK_LAUNCHER_CP" org.apache.spark.launcher.Main "$@")
done < <("$RUNNER" -cp "$SPARK_ASSEMBLY_JAR" org.apache.spark.launcher.Main "$@")

if [ "${CMD[0]}" = "usage" ]; then
"${CMD[@]}"
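For context on the read loop above: the launcher's Main emits the final command as NUL-separated tokens so that arguments containing spaces or newlines survive the trip into the bash array. A toy stand-in for the producing side, shown only to illustrate the protocol (this is not the real org.apache.spark.launcher.Main):

    public class NullSeparatedEcho {

        // Print each argument followed by a NUL byte so a shell caller can split on '\0'
        // even when an argument contains spaces or newlines.
        public static void main(String[] args) {
            for (String arg : args) {
                System.out.print(arg);
                System.out.print('\0');
            }
            System.out.flush();
        }
    }

Feeding its output through the same while IFS= read -d '' loop shown above reconstructs each argument intact.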
@@ -29,31 +29,20 @@ if "x%1"=="x" (
exit /b 1
)

set LAUNCHER_CP=0
if exist %SPARK_HOME%\RELEASE goto find_release_launcher
rem Find assembly jar
set SPARK_ASSEMBLY_JAR=0

rem Look for the Spark launcher in both Scala build directories. The launcher doesn't use Scala so
rem it doesn't really matter which one is picked up. Add the compiled classes directly to the
rem classpath instead of looking for a jar file, since it's very common for people using sbt to use
rem the "assembly" target instead of "package".
set LAUNCHER_CLASSES=%SPARK_HOME%\launcher\target\scala-2.10\classes
if exist %LAUNCHER_CLASSES% (
set LAUNCHER_CP=%LAUNCHER_CLASSES%
)
set LAUNCHER_CLASSES=%SPARK_HOME%\launcher\target\scala-2.11\classes
if exist %LAUNCHER_CLASSES% (
set LAUNCHER_CP=%LAUNCHER_CLASSES%
)
goto check_launcher

:find_release_launcher
for %%d in (%SPARK_HOME%\lib\spark-launcher*.jar) do (
set LAUNCHER_CP=%%d
if exist "%SPARK_HOME%\RELEASE" (
set ASSEMBLY_DIR=%SPARK_HOME%\lib
) else (
set ASSEMBLY_DIR=%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%
)

:check_launcher
if "%LAUNCHER_CP%"=="0" (
echo Failed to find Spark launcher JAR.
for %%d in (%ASSEMBLY_DIR%\spark-assembly*hadoop*.jar) do (
set SPARK_ASSEMBLY_JAR=%%d
)
if "%SPARK_ASSEMBLY_JAR%"=="0" (
echo Failed to find Spark assembly JAR.
echo You need to build Spark before running this program.
exit /b 1
)

@@ -64,7 +53,7 @@ if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java

rem The launcher library prints the command to be executed in a single line suitable for being
rem executed by the batch interpreter. So read all the output of the launcher into a variable.
for /f "tokens=*" %%i in ('cmd /C ""%RUNNER%" -cp %LAUNCHER_CP% org.apache.spark.launcher.Main %*"') do (
for /f "tokens=*" %%i in ('cmd /C ""%RUNNER%" -cp %SPARK_ASSEMBLY_JAR% org.apache.spark.launcher.Main %*"') do (
set SPARK_CMD=%%i
)
%SPARK_CMD%
@@ -86,10 +86,14 @@ abstract class AbstractCommandBuilder {
*/
List<String> buildJavaCommand(String extraClassPath) throws IOException {
List<String> cmd = new ArrayList<String>();
if (javaHome == null) {
cmd.add(join(File.separator, System.getProperty("java.home"), "bin", "java"));
} else {
String envJavaHome;

if (javaHome != null) {
cmd.add(join(File.separator, javaHome, "bin", "java"));
} else if ((envJavaHome = System.getenv("JAVA_HOME")) != null) {
cmd.add(join(File.separator, envJavaHome, "bin", "java"));
} else {
cmd.add(join(File.separator, System.getProperty("java.home"), "bin", "java"));
}

// Load extra JAVA_OPTS from conf/java-opts, if it exists.
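The buildJavaCommand change above reorders how the java executable is found: an explicitly configured javaHome wins, then the JAVA_HOME environment variable, and only then the JVM's own java.home property. A self-contained sketch of that resolution order; configuredJavaHome is a hypothetical stand-in for the builder's javaHome field, and plain string concatenation replaces the builder's own join helper:

    import java.io.File;

    public class JavaExecutableResolver {

        // Resolution order mirrored from the new buildJavaCommand:
        // 1. explicitly configured home, 2. JAVA_HOME env var, 3. java.home property.
        static String javaExecutable(String configuredJavaHome) {
            String home;
            String envJavaHome;
            if (configuredJavaHome != null) {
                home = configuredJavaHome;
            } else if ((envJavaHome = System.getenv("JAVA_HOME")) != null) {
                home = envJavaHome;
            } else {
                home = System.getProperty("java.home");
            }
            return home + File.separator + "bin" + File.separator + "java";
        }

        public static void main(String[] args) {
            // With no explicit home configured, this falls through to JAVA_HOME or java.home.
            System.out.println(javaExecutable(null));
        }
    }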
@@ -182,59 +186,25 @@ abstract class AbstractCommandBuilder {
addToClassPath(cp, String.format("%s/core/target/jars/*", sparkHome));
}

String assembly = findAssembly();
final String assembly = AbstractCommandBuilder.class.getProtectionDomain().getCodeSource().
getLocation().getPath();
addToClassPath(cp, assembly);

// When Hive support is needed, Datanucleus jars must be included on the classpath. Datanucleus
// jars do not work if only included in the uber jar as plugin.xml metadata is lost. Both sbt
// and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is built
// with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
// assembly is built for Hive, before actually populating the CLASSPATH with the jars.
//
// This block also serves as a check for SPARK-1703, when the assembly jar is built with
// Java 7 and ends up with too many files, causing issues with other JDK versions.
boolean needsDataNucleus = false;
JarFile assemblyJar = null;
try {
assemblyJar = new JarFile(assembly);
needsDataNucleus = assemblyJar.getEntry("org/apache/hadoop/hive/ql/exec/") != null;
} catch (IOException ioe) {
if (ioe.getMessage().indexOf("invalid CEN header") >= 0) {
System.err.println(
"Loading Spark jar failed.\n" +
"This is likely because Spark was compiled with Java 7 and run\n" +
"with Java 6 (see SPARK-1703). Please use Java 7 to run Spark\n" +
"or build Spark with Java 6.");
System.exit(1);
} else {
throw ioe;
}
} finally {
if (assemblyJar != null) {
try {
assemblyJar.close();
} catch (IOException e) {
// Ignore.
}
}
// Datanucleus jars must be included on the classpath. Datanucleus jars do not work if only
// included in the uber jar as plugin.xml metadata is lost. Both sbt and maven will populate
// "lib_managed/jars/" with the datanucleus jars when Spark is built with Hive
File libdir;
if (new File(sparkHome, "RELEASE").isFile()) {
libdir = new File(sparkHome, "lib");
} else {
libdir = new File(sparkHome, "lib_managed/jars");
}

if (needsDataNucleus) {
System.err.println("Spark assembly has been built with Hive, including Datanucleus jars " +
"in classpath.");
File libdir;
if (new File(sparkHome, "RELEASE").isFile()) {
libdir = new File(sparkHome, "lib");
} else {
libdir = new File(sparkHome, "lib_managed/jars");
}

checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
libdir.getAbsolutePath());
for (File jar : libdir.listFiles()) {
if (jar.getName().startsWith("datanucleus-")) {
addToClassPath(cp, jar.getAbsolutePath());
}
checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
libdir.getAbsolutePath());
for (File jar : libdir.listFiles()) {
if (jar.getName().startsWith("datanucleus-")) {
addToClassPath(cp, jar.getAbsolutePath());
}
}
@@ -270,7 +240,6 @@ abstract class AbstractCommandBuilder {
if (scala != null) {
return scala;
}

String sparkHome = getSparkHome();
File scala210 = new File(sparkHome, "assembly/target/scala-2.10");
File scala211 = new File(sparkHome, "assembly/target/scala-2.11");
@@ -330,30 +299,6 @@ abstract class AbstractCommandBuilder {
return firstNonEmpty(childEnv.get(key), System.getenv(key));
}

private String findAssembly() {
String sparkHome = getSparkHome();
File libdir;
if (new File(sparkHome, "RELEASE").isFile()) {
libdir = new File(sparkHome, "lib");
checkState(libdir.isDirectory(), "Library directory '%s' does not exist.",
libdir.getAbsolutePath());
} else {
libdir = new File(sparkHome, String.format("assembly/target/scala-%s", getScalaVersion()));
}

final Pattern re = Pattern.compile("spark-assembly.*hadoop.*\\.jar");
FileFilter filter = new FileFilter() {
@Override
public boolean accept(File file) {
return file.isFile() && re.matcher(file.getName()).matches();
}
};
File[] assemblies = libdir.listFiles(filter);
checkState(assemblies != null && assemblies.length > 0, "No assemblies found in '%s'.", libdir);
checkState(assemblies.length == 1, "Multiple assemblies found in '%s'.", libdir);
return assemblies[0].getAbsolutePath();
}

private String getConfDir() {
String confDir = getenv("SPARK_CONF_DIR");
return confDir != null ? confDir : join(File.separator, getSparkHome(), "conf");
@@ -199,7 +199,6 @@ echo "Build flags: $@" >> "$DISTDIR/RELEASE"
# Copy jars
cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
cp "$SPARK_HOME"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
cp "$SPARK_HOME"/launcher/target/spark-launcher_$SCALA_VERSION-$VERSION.jar "$DISTDIR/lib/"
# This will fail if the -Pyarn profile is not provided
# In this case, silence the error and ignore the return code of this command
cp "$SPARK_HOME"/network/yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :