Merge remote-tracking branch 'spark-upstream/master' into HEAD

Conflicts:
	README.md
	core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala
	core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
	core/src/main/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMap.scala
	pom.xml
	project/SparkBuild.scala
	repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala

This commit is contained in commit 91227566bc.

.gitignore (vendored, 4 changed lines)
@@ -1,7 +1,10 @@
*~
*.swp
*.ipr
*.iml
*.iws
.idea/
sbt/*.jar
.settings
.cache
/build/

@@ -41,3 +44,4 @@ derby.log
dist/
spark-*-bin.tar.gz
unit-tests.log
lib/
@@ -1,27 +0,0 @@
-Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-- Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-- Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-- The name of the author may not be used to endorse or promote products
-derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.

@@ -1 +0,0 @@
-b7924aabe9c5e63f0a4d8bbd17019534c7ec014e

Binary file not shown.
@@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-  <modelVersion>4.0.0</modelVersion>
-  <groupId>net.sf.py4j</groupId>
-  <artifactId>py4j</artifactId>
-  <version>0.7</version>
-  <description>POM was created from install:install-file</description>
-</project>

@@ -1,12 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<metadata>
-  <groupId>net.sf.py4j</groupId>
-  <artifactId>py4j</artifactId>
-  <versioning>
-    <release>0.7</release>
-    <versions>
-      <version>0.7</version>
-    </versions>
-    <lastUpdated>20130828020333</lastUpdated>
-  </versioning>
-</metadata>
@@ -26,7 +26,7 @@
   </parent>

   <groupId>org.apache.spark</groupId>
-  <artifactId>spark-assembly_2.9.3</artifactId>
+  <artifactId>spark-assembly_2.10</artifactId>
   <name>Spark Project Assembly</name>
   <url>http://spark.incubator.apache.org/</url>

@@ -41,33 +41,33 @@
   <dependencies>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>spark-core_2.9.3</artifactId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>spark-bagel_2.9.3</artifactId>
+      <artifactId>spark-bagel_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-mllib_2.9.3</artifactId>
+      <artifactId>spark-mllib_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-repl_2.9.3</artifactId>
+      <artifactId>spark-repl_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-streaming_2.9.3</artifactId>
+      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>net.sf.py4j</groupId>
      <artifactId>py4j</artifactId>
-      <version>0.7</version>
+      <version>0.8.1</version>
    </dependency>
  </dependencies>
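The hunk above replaces hard-coded _2.9.3 suffixes in artifact names with the ${scala.binary.version} Maven property, so dependency coordinates follow the Scala binary version (2.10 in this merge). The sbt build listed in the conflicts (project/SparkBuild.scala) expresses the same idea through cross-versioned dependencies; the Scala (sbt) snippet below is only an illustrative sketch with assumed values, not the contents of that file.

// Illustrative sbt settings (hypothetical values). The %% operator appends the
// Scala binary version to the artifact name, so "spark-core" resolves to
// spark-core_2.10 when scalaVersion is 2.10.x, mirroring what
// ${scala.binary.version} does in the Maven poms above.
scalaVersion := "2.10.3"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % "0.9.0-incubating-SNAPSHOT",
  "org.apache.spark" %% "spark-streaming" % "0.9.0-incubating-SNAPSHOT"
)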
@@ -79,7 +79,7 @@
        <artifactId>maven-shade-plugin</artifactId>
        <configuration>
          <shadedArtifactAttached>false</shadedArtifactAttached>
-          <outputFile>${project.build.directory}/scala-${scala.version}/${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar</outputFile>
+          <outputFile>${project.build.directory}/scala-${scala.binary.version}/${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar</outputFile>
          <artifactSet>
            <includes>
              <include>*:*</include>

@@ -108,12 +108,12 @@
              <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                <resource>META-INF/services/org.apache.hadoop.fs.FileSystem</resource>
              </transformer>
            </transformers>
            <transformers>
              <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
              <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                <resource>reference.conf</resource>
              </transformer>
              <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
                <resource>log4j.properties</resource>
              </transformer>
            </transformers>
          </configuration>
        </execution>
@@ -124,11 +124,21 @@

  <profiles>
    <profile>
-      <id>hadoop2-yarn</id>
+      <id>yarn-alpha</id>
      <dependencies>
        <dependency>
          <groupId>org.apache.spark</groupId>
-          <artifactId>spark-yarn_2.9.3</artifactId>
+          <artifactId>spark-yarn-alpha_${scala.binary.version}</artifactId>
          <version>${project.version}</version>
        </dependency>
      </dependencies>
    </profile>
+    <profile>
+      <id>yarn</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-yarn_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
@@ -39,23 +39,20 @@
    </fileSet>
    <fileSet>
      <directory>
-        ${project.parent.basedir}/bin/
+        ${project.parent.basedir}/sbin/
      </directory>
-      <outputDirectory>/bin</outputDirectory>
+      <outputDirectory>/sbin</outputDirectory>
      <includes>
        <include>**/*</include>
      </includes>
    </fileSet>
    <fileSet>
      <directory>
-        ${project.parent.basedir}
+        ${project.parent.basedir}/bin/
      </directory>
      <outputDirectory>/bin</outputDirectory>
      <includes>
-        <include>run-example*</include>
-        <include>spark-class*</include>
-        <include>spark-shell*</include>
-        <include>spark-executor*</include>
+        <include>**/*</include>
      </includes>
    </fileSet>
  </fileSets>
|
@ -26,7 +26,7 @@
|
|||
</parent>
|
||||
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-bagel_2.9.3</artifactId>
|
||||
<artifactId>spark-bagel_2.10</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>Spark Project Bagel</name>
|
||||
<url>http://spark.incubator.apache.org/</url>
|
||||
|
@ -34,7 +34,7 @@
|
|||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.9.3</artifactId>
|
||||
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -43,18 +43,18 @@
|
|||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest_2.9.3</artifactId>
|
||||
<artifactId>scalatest_${scala.binary.version}</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalacheck</groupId>
|
||||
<artifactId>scalacheck_2.9.3</artifactId>
|
||||
<artifactId>scalacheck_${scala.binary.version}</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<outputDirectory>target/scala-${scala.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.version}/test-classes</testOutputDirectory>
|
||||
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.scalatest</groupId>
|
||||
|
|
|
@ -20,7 +20,7 @@ rem
|
|||
rem This script computes Spark's classpath and prints it to stdout; it's used by both the "run"
|
||||
rem script and the ExecutorRunner in standalone cluster mode.
|
||||
|
||||
set SCALA_VERSION=2.9.3
|
||||
set SCALA_VERSION=2.10
|
||||
|
||||
rem Figure out where the Spark framework is installed
|
||||
set FWDIR=%~dp0..\
|
||||
|
@ -29,7 +29,7 @@ rem Load environment variables from conf\spark-env.cmd, if it exists
|
|||
if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"
|
||||
|
||||
rem Build up classpath
|
||||
set CLASSPATH=%SPARK_CLASSPATH%;%FWDIR%conf
|
||||
set CLASSPATH=%FWDIR%conf
|
||||
if exist "%FWDIR%RELEASE" (
|
||||
for %%d in ("%FWDIR%jars\spark-assembly*.jar") do (
|
||||
set ASSEMBLY_JAR=%%d
|
||||
|
|
|
@ -20,13 +20,13 @@
|
|||
# This script computes Spark's classpath and prints it to stdout; it's used by both the "run"
|
||||
# script and the ExecutorRunner in standalone cluster mode.
|
||||
|
||||
SCALA_VERSION=2.9.3
|
||||
SCALA_VERSION=2.10
|
||||
|
||||
# Figure out where Spark is installed
|
||||
FWDIR="$(cd `dirname $0`/..; pwd)"
|
||||
|
||||
# Load environment variables from conf/spark-env.sh, if it exists
|
||||
if [ -e $FWDIR/conf/spark-env.sh ] ; then
|
||||
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
|
||||
. $FWDIR/conf/spark-env.sh
|
||||
fi
|
||||
|
||||
|
|
|
@ -18,12 +18,12 @@
|
|||
#
|
||||
|
||||
# Figure out where the Scala framework is installed
|
||||
FWDIR="$(cd `dirname $0`; pwd)"
|
||||
FWDIR="$(cd `dirname $0`/..; pwd)"
|
||||
|
||||
# Export this as SPARK_HOME
|
||||
export SPARK_HOME="$FWDIR"
|
||||
|
||||
SCALA_VERSION=2.9.3
|
||||
SCALA_VERSION=2.10
|
||||
|
||||
# Exit if the user hasn't compiled Spark
|
||||
if [ ! -f "$FWDIR/RELEASE" ]; then
|
||||
|
@ -37,7 +37,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
|
|||
fi
|
||||
|
||||
# Load environment variables from conf/spark-env.sh, if it exists
|
||||
if [ -e $FWDIR/conf/spark-env.sh ] ; then
|
||||
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
|
||||
. $FWDIR/conf/spark-env.sh
|
||||
fi
|
||||
|
||||
|
@ -59,8 +59,12 @@ if [ -n "$IPYTHON_OPTS" ]; then
|
|||
fi
|
||||
|
||||
if [[ "$IPYTHON" = "1" ]] ; then
|
||||
IPYTHON_OPTS=${IPYTHON_OPTS:--i}
|
||||
exec ipython "$IPYTHON_OPTS" -c "%run $PYTHONSTARTUP"
|
||||
# IPython <1.0.0 doesn't honor PYTHONSTARTUP, while 1.0.0+ does.
|
||||
# Hence we clear PYTHONSTARTUP and use the -c "%run $IPYTHONSTARTUP" command which works on all versions
|
||||
# We also force interactive mode with "-i"
|
||||
IPYTHONSTARTUP=$PYTHONSTARTUP
|
||||
PYTHONSTARTUP=
|
||||
exec ipython "$IPYTHON_OPTS" -i -c "%run $IPYTHONSTARTUP"
|
||||
else
|
||||
exec "$PYSPARK_PYTHON" "$@"
|
||||
fi
|
|
@ -17,10 +17,10 @@ rem See the License for the specific language governing permissions and
|
|||
rem limitations under the License.
|
||||
rem
|
||||
|
||||
set SCALA_VERSION=2.9.3
|
||||
set SCALA_VERSION=2.10
|
||||
|
||||
rem Figure out where the Spark framework is installed
|
||||
set FWDIR=%~dp0
|
||||
set FWDIR=%~dp0..\
|
||||
|
||||
rem Export this as SPARK_HOME
|
||||
set SPARK_HOME=%FWDIR%
|
|
@ -17,16 +17,21 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
SCALA_VERSION=2.9.3
|
||||
cygwin=false
|
||||
case "`uname`" in
|
||||
CYGWIN*) cygwin=true;;
|
||||
esac
|
||||
|
||||
SCALA_VERSION=2.10
|
||||
|
||||
# Figure out where the Scala framework is installed
|
||||
FWDIR="$(cd `dirname $0`; pwd)"
|
||||
FWDIR="$(cd `dirname $0`/..; pwd)"
|
||||
|
||||
# Export this as SPARK_HOME
|
||||
export SPARK_HOME="$FWDIR"
|
||||
|
||||
# Load environment variables from conf/spark-env.sh, if it exists
|
||||
if [ -e $FWDIR/conf/spark-env.sh ] ; then
|
||||
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
|
||||
. $FWDIR/conf/spark-env.sh
|
||||
fi
|
||||
|
||||
|
@ -40,25 +45,25 @@ fi
|
|||
EXAMPLES_DIR="$FWDIR"/examples
|
||||
SPARK_EXAMPLES_JAR=""
|
||||
if [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar ]; then
|
||||
# Use the JAR from the SBT build
|
||||
export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar`
|
||||
fi
|
||||
if [ -e "$EXAMPLES_DIR"/target/spark-examples*[0-9Tg].jar ]; then
|
||||
# Use the JAR from the Maven build
|
||||
# TODO: this also needs to become an assembly!
|
||||
export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/spark-examples*[0-9Tg].jar`
|
||||
fi
|
||||
if [[ -z $SPARK_EXAMPLES_JAR ]]; then
|
||||
echo "Failed to find Spark examples assembly in $FWDIR/examples/target" >&2
|
||||
echo "You need to build Spark with sbt/sbt assembly before running this program" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
# Since the examples JAR ideally shouldn't include spark-core (that dependency should be
|
||||
# "provided"), also add our standard Spark classpath, built using compute-classpath.sh.
|
||||
CLASSPATH=`$FWDIR/bin/compute-classpath.sh`
|
||||
CLASSPATH="$SPARK_EXAMPLES_JAR:$CLASSPATH"
|
||||
|
||||
if $cygwin; then
|
||||
CLASSPATH=`cygpath -wp $CLASSPATH`
|
||||
export SPARK_EXAMPLES_JAR=`cygpath -w $SPARK_EXAMPLES_JAR`
|
||||
fi
|
||||
|
||||
# Find java binary
|
||||
if [ -n "${JAVA_HOME}" ]; then
|
||||
RUNNER="${JAVA_HOME}/bin/java"
|
|
@ -17,10 +17,10 @@ rem See the License for the specific language governing permissions and
|
|||
rem limitations under the License.
|
||||
rem
|
||||
|
||||
set SCALA_VERSION=2.9.3
|
||||
set SCALA_VERSION=2.10
|
||||
|
||||
rem Figure out where the Spark framework is installed
|
||||
set FWDIR=%~dp0
|
||||
set FWDIR=%~dp0..\
|
||||
|
||||
rem Export this as SPARK_HOME
|
||||
set SPARK_HOME=%FWDIR%
|
||||
|
@ -49,7 +49,7 @@ if "x%SPARK_EXAMPLES_JAR%"=="x" (
|
|||
|
||||
rem Compute Spark classpath using external script
|
||||
set DONT_PRINT_CLASSPATH=1
|
||||
call "%FWDIR%bin\compute-classpath.cmd"
|
||||
call "%FWDIR%sbin\compute-classpath.cmd"
|
||||
set DONT_PRINT_CLASSPATH=0
|
||||
set CLASSPATH=%SPARK_EXAMPLES_JAR%;%CLASSPATH%
|
||||
|
|
@ -17,16 +17,21 @@
|
|||
# limitations under the License.
|
||||
#
|
||||
|
||||
SCALA_VERSION=2.9.3
|
||||
cygwin=false
|
||||
case "`uname`" in
|
||||
CYGWIN*) cygwin=true;;
|
||||
esac
|
||||
|
||||
SCALA_VERSION=2.10
|
||||
|
||||
# Figure out where the Scala framework is installed
|
||||
FWDIR="$(cd `dirname $0`; pwd)"
|
||||
FWDIR="$(cd `dirname $0`/..; pwd)"
|
||||
|
||||
# Export this as SPARK_HOME
|
||||
export SPARK_HOME="$FWDIR"
|
||||
|
||||
# Load environment variables from conf/spark-env.sh, if it exists
|
||||
if [ -e $FWDIR/conf/spark-env.sh ] ; then
|
||||
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
|
||||
. $FWDIR/conf/spark-env.sh
|
||||
fi
|
||||
|
||||
|
@ -55,7 +60,7 @@ case "$1" in
|
|||
'org.apache.spark.deploy.worker.Worker')
|
||||
OUR_JAVA_OPTS="$OUR_JAVA_OPTS $SPARK_WORKER_OPTS"
|
||||
;;
|
||||
'org.apache.spark.executor.StandaloneExecutorBackend')
|
||||
'org.apache.spark.executor.CoarseGrainedExecutorBackend')
|
||||
OUR_JAVA_OPTS="$OUR_JAVA_OPTS $SPARK_EXECUTOR_OPTS"
|
||||
;;
|
||||
'org.apache.spark.executor.MesosExecutorBackend')
|
||||
|
@ -87,7 +92,7 @@ JAVA_OPTS="$OUR_JAVA_OPTS"
|
|||
JAVA_OPTS="$JAVA_OPTS -Djava.library.path=$SPARK_LIBRARY_PATH"
|
||||
JAVA_OPTS="$JAVA_OPTS -Xms$SPARK_MEM -Xmx$SPARK_MEM"
|
||||
# Load extra JAVA_OPTS from conf/java-opts, if it exists
|
||||
if [ -e $FWDIR/conf/java-opts ] ; then
|
||||
if [ -e "$FWDIR/conf/java-opts" ] ; then
|
||||
JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
|
||||
fi
|
||||
export JAVA_OPTS
|
||||
|
@ -124,7 +129,17 @@ fi
|
|||
|
||||
# Compute classpath using external script
|
||||
CLASSPATH=`$FWDIR/bin/compute-classpath.sh`
|
||||
CLASSPATH="$SPARK_TOOLS_JAR:$CLASSPATH"
|
||||
|
||||
if [ "$1" == "org.apache.spark.tools.JavaAPICompletenessChecker" ]; then
|
||||
CLASSPATH="$CLASSPATH:$SPARK_TOOLS_JAR"
|
||||
fi
|
||||
|
||||
if $cygwin; then
|
||||
CLASSPATH=`cygpath -wp $CLASSPATH`
|
||||
if [ "$1" == "org.apache.spark.tools.JavaAPICompletenessChecker" ]; then
|
||||
export SPARK_TOOLS_JAR=`cygpath -w $SPARK_TOOLS_JAR`
|
||||
fi
|
||||
fi
|
||||
export CLASSPATH
|
||||
|
||||
if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
|
||||
|
@ -135,3 +150,5 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
|
|||
fi
|
||||
|
||||
exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
|
||||
|
||||
|
|
@ -17,10 +17,10 @@ rem See the License for the specific language governing permissions and
|
|||
rem limitations under the License.
|
||||
rem
|
||||
|
||||
set SCALA_VERSION=2.9.3
|
||||
set SCALA_VERSION=2.10
|
||||
|
||||
rem Figure out where the Spark framework is installed
|
||||
set FWDIR=%~dp0
|
||||
set FWDIR=%~dp0..\
|
||||
|
||||
rem Export this as SPARK_HOME
|
||||
set SPARK_HOME=%FWDIR%
|
||||
|
@ -73,9 +73,9 @@ for %%d in ("%TOOLS_DIR%\target\scala-%SCALA_VERSION%\spark-tools*assembly*.jar"
|
|||
|
||||
rem Compute classpath using external script
|
||||
set DONT_PRINT_CLASSPATH=1
|
||||
call "%FWDIR%bin\compute-classpath.cmd"
|
||||
call "%FWDIR%sbin\compute-classpath.cmd"
|
||||
set DONT_PRINT_CLASSPATH=0
|
||||
set CLASSPATH=%SPARK_TOOLS_JAR%;%CLASSPATH%
|
||||
set CLASSPATH=%CLASSPATH%;%SPARK_TOOLS_JAR%
|
||||
|
||||
rem Figure out where java is.
|
||||
set RUNNER=java
|
|
@ -23,12 +23,16 @@
|
|||
# if those two env vars are set in spark-env.sh but MASTER is not.
|
||||
# Options:
|
||||
# -c <cores> Set the number of cores for REPL to use
|
||||
#
|
||||
|
||||
cygwin=false
|
||||
case "`uname`" in
|
||||
CYGWIN*) cygwin=true;;
|
||||
esac
|
||||
|
||||
# Enter posix mode for bash
|
||||
set -o posix
|
||||
|
||||
FWDIR="`dirname $0`"
|
||||
FWDIR="$(cd `dirname $0`/..; pwd)"
|
||||
|
||||
for o in "$@"; do
|
||||
if [ "$1" = "-c" -o "$1" = "--cores" ]; then
|
||||
|
@ -79,7 +83,18 @@ if [[ ! $? ]]; then
|
|||
saved_stty=""
|
||||
fi
|
||||
|
||||
$FWDIR/spark-class $OPTIONS org.apache.spark.repl.Main "$@"
|
||||
if $cygwin; then
|
||||
# Workaround for issue involving JLine and Cygwin
|
||||
# (see http://sourceforge.net/p/jline/bugs/40/).
|
||||
# If you're using the Mintty terminal emulator in Cygwin, may need to set the
|
||||
# "Backspace sends ^H" setting in "Keys" section of the Mintty options
|
||||
# (see https://github.com/sbt/sbt/issues/562).
|
||||
stty -icanon min 1 -echo > /dev/null 2>&1
|
||||
$FWDIR/bin/spark-class -Djline.terminal=unix $OPTIONS org.apache.spark.repl.Main "$@"
|
||||
stty icanon echo > /dev/null 2>&1
|
||||
else
|
||||
$FWDIR/bin/spark-class $OPTIONS org.apache.spark.repl.Main "$@"
|
||||
fi
|
||||
|
||||
# record the exit status lest it be overwritten:
|
||||
# then reenable echo and propagate the code.
|
|
@ -17,6 +17,7 @@ rem See the License for the specific language governing permissions and
|
|||
rem limitations under the License.
|
||||
rem
|
||||
|
||||
set FWDIR=%~dp0
|
||||
rem Find the path of sbin
|
||||
set SBIN=%~dp0..\sbin\
|
||||
|
||||
cmd /V /E /C %FWDIR%spark-class2.cmd org.apache.spark.repl.Main %*
|
||||
cmd /V /E /C %SBIN%spark-class2.cmd org.apache.spark.repl.Main %*
|
|
@ -18,4 +18,4 @@
|
|||
# - SPARK_WORKER_MEMORY, to set how much memory to use (e.g. 1000m, 2g)
|
||||
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT
|
||||
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
|
||||
|
||||
# - SPARK_WORKER_DIR, to set the working directory of worker processes
|
||||
|
|
core/pom.xml (433 changed lines)
@@ -17,226 +17,219 @@
|
|||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-parent</artifactId>
|
||||
<version>0.9.0-incubating-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-parent</artifactId>
|
||||
<version>0.9.0-incubating-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
<artifactId>spark-core_2.10</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>Spark Project Core</name>
|
||||
<url>http://spark.incubator.apache.org/</url>
|
||||
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.9.3</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>Spark Project Core</name>
|
||||
<url>http://spark.incubator.apache.org/</url>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.java.dev.jets3t</groupId>
|
||||
<artifactId>jets3t</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro-ipc</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.jetty</groupId>
|
||||
<artifactId>jetty-server</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.findbugs</groupId>
|
||||
<artifactId>jsr305</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ning</groupId>
|
||||
<artifactId>compress-lzf</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.xerial.snappy</groupId>
|
||||
<artifactId>snappy-java</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.ow2.asm</groupId>
|
||||
<artifactId>asm</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.protobuf</groupId>
|
||||
<artifactId>protobuf-java</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.twitter</groupId>
|
||||
<artifactId>chill_2.9.3</artifactId>
|
||||
<version>0.3.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.twitter</groupId>
|
||||
<artifactId>chill-java</artifactId>
|
||||
<version>0.3.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.typesafe.akka</groupId>
|
||||
<artifactId>akka-actor</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.typesafe.akka</groupId>
|
||||
<artifactId>akka-remote</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.typesafe.akka</groupId>
|
||||
<artifactId>akka-slf4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scalap</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.liftweb</groupId>
|
||||
<artifactId>lift-json_2.9.2</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>it.unimi.dsi</groupId>
|
||||
<artifactId>fastutil</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>colt</groupId>
|
||||
<artifactId>colt</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.scala-incubator.io</groupId>
|
||||
<artifactId>scala-io-file_2.9.2</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.mesos</groupId>
|
||||
<artifactId>mesos</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>io.netty</groupId>
|
||||
<artifactId>netty-all</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-jvm</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-json</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-ganglia</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-graphite</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.derby</groupId>
|
||||
<artifactId>derby</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest_2.9.3</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalacheck</groupId>
|
||||
<artifactId>scalacheck_2.9.3</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.easymock</groupId>
|
||||
<artifactId>easymock</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.novocode</groupId>
|
||||
<artifactId>junit-interface</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<outputDirectory>target/scala-${scala.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.version}/test-classes</testOutputDirectory>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-antrun-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>test</phase>
|
||||
<goals>
|
||||
<goal>run</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<exportAntProperties>true</exportAntProperties>
|
||||
<tasks>
|
||||
<property name="spark.classpath" refid="maven.test.classpath" />
|
||||
<property environment="env" />
|
||||
<fail message="Please set the SCALA_HOME (or SCALA_LIBRARY_PATH if scala is on the path) environment variables and retry.">
|
||||
<condition>
|
||||
<not>
|
||||
<or>
|
||||
<isset property="env.SCALA_HOME" />
|
||||
<isset property="env.SCALA_LIBRARY_PATH" />
|
||||
</or>
|
||||
</not>
|
||||
</condition>
|
||||
</fail>
|
||||
</tasks>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<environmentVariables>
|
||||
<SPARK_HOME>${basedir}/..</SPARK_HOME>
|
||||
<SPARK_TESTING>1</SPARK_TESTING>
|
||||
<SPARK_CLASSPATH>${spark.classpath}</SPARK_CLASSPATH>
|
||||
</environmentVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.java.dev.jets3t</groupId>
|
||||
<artifactId>jets3t</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro-ipc</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.jetty</groupId>
|
||||
<artifactId>jetty-server</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.findbugs</groupId>
|
||||
<artifactId>jsr305</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ning</groupId>
|
||||
<artifactId>compress-lzf</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.xerial.snappy</groupId>
|
||||
<artifactId>snappy-java</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.ow2.asm</groupId>
|
||||
<artifactId>asm</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.twitter</groupId>
|
||||
<artifactId>chill_${scala.binary.version}</artifactId>
|
||||
<version>0.3.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.twitter</groupId>
|
||||
<artifactId>chill-java</artifactId>
|
||||
<version>0.3.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${akka.group}</groupId>
|
||||
<artifactId>akka-remote_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${akka.group}</groupId>
|
||||
<artifactId>akka-slf4j_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.liftweb</groupId>
|
||||
<artifactId>lift-json_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>it.unimi.dsi</groupId>
|
||||
<artifactId>fastutil</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>colt</groupId>
|
||||
<artifactId>colt</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.mesos</groupId>
|
||||
<artifactId>mesos</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>io.netty</groupId>
|
||||
<artifactId>netty-all</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.clearspring.analytics</groupId>
|
||||
<artifactId>stream</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-jvm</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-json</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-ganglia</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.codahale.metrics</groupId>
|
||||
<artifactId>metrics-graphite</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.derby</groupId>
|
||||
<artifactId>derby</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest_${scala.binary.version}</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalacheck</groupId>
|
||||
<artifactId>scalacheck_${scala.binary.version}</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.easymock</groupId>
|
||||
<artifactId>easymock</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.novocode</groupId>
|
||||
<artifactId>junit-interface</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-antrun-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>test</phase>
|
||||
<goals>
|
||||
<goal>run</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<exportAntProperties>true</exportAntProperties>
|
||||
<tasks>
|
||||
<property name="spark.classpath" refid="maven.test.classpath" />
|
||||
<property environment="env" />
|
||||
<fail message="Please set the SCALA_HOME (or SCALA_LIBRARY_PATH if scala is on the path) environment variables and retry.">
|
||||
<condition>
|
||||
<not>
|
||||
<or>
|
||||
<isset property="env.SCALA_HOME" />
|
||||
<isset property="env.SCALA_LIBRARY_PATH" />
|
||||
</or>
|
||||
</not>
|
||||
</condition>
|
||||
</fail>
|
||||
</tasks>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<environmentVariables>
|
||||
<SPARK_HOME>${basedir}/..</SPARK_HOME>
|
||||
<SPARK_TESTING>1</SPARK_TESTING>
|
||||
<SPARK_CLASSPATH>${spark.classpath}</SPARK_CLASSPATH>
|
||||
</environmentVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -19,31 +19,36 @@ package org.apache.spark.network.netty;
|
|||
|
||||
import io.netty.bootstrap.Bootstrap;
|
||||
import io.netty.channel.Channel;
|
||||
import io.netty.channel.ChannelFuture;
|
||||
import io.netty.channel.ChannelFutureListener;
|
||||
import io.netty.channel.ChannelOption;
|
||||
import io.netty.channel.EventLoopGroup;
|
||||
import io.netty.channel.oio.OioEventLoopGroup;
|
||||
import io.netty.channel.socket.oio.OioSocketChannel;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
class FileClient {
|
||||
|
||||
private Logger LOG = LoggerFactory.getLogger(this.getClass().getName());
|
||||
private FileClientHandler handler = null;
|
||||
private static final Logger LOG = LoggerFactory.getLogger(FileClient.class.getName());
|
||||
|
||||
private final FileClientHandler handler;
|
||||
private Channel channel = null;
|
||||
private Bootstrap bootstrap = null;
|
||||
private int connectTimeout = 60*1000; // 1 min
|
||||
private EventLoopGroup group = null;
|
||||
private final int connectTimeout;
|
||||
private final int sendTimeout = 60; // 1 min
|
||||
|
||||
public FileClient(FileClientHandler handler, int connectTimeout) {
|
||||
FileClient(FileClientHandler handler, int connectTimeout) {
|
||||
this.handler = handler;
|
||||
this.connectTimeout = connectTimeout;
|
||||
}
|
||||
|
||||
public void init() {
|
||||
group = new OioEventLoopGroup();
|
||||
bootstrap = new Bootstrap();
|
||||
bootstrap.group(new OioEventLoopGroup())
|
||||
bootstrap.group(group)
|
||||
.channel(OioSocketChannel.class)
|
||||
.option(ChannelOption.SO_KEEPALIVE, true)
|
||||
.option(ChannelOption.TCP_NODELAY, true)
|
||||
|
@ -58,6 +63,7 @@ class FileClient {
|
|||
// ChannelFuture cf = channel.closeFuture();
|
||||
//cf.addListener(new ChannelCloseListener(this));
|
||||
} catch (InterruptedException e) {
|
||||
LOG.warn("FileClient interrupted while trying to connect", e);
|
||||
close();
|
||||
}
|
||||
}
|
||||
|
@ -73,16 +79,21 @@ class FileClient {
|
|||
public void sendRequest(String file) {
|
||||
//assert(file == null);
|
||||
//assert(channel == null);
|
||||
channel.write(file + "\r\n");
|
||||
try {
|
||||
// Should be able to send the message to network link channel.
|
||||
boolean bSent = channel.writeAndFlush(file + "\r\n").await(sendTimeout, TimeUnit.SECONDS);
|
||||
if (!bSent) {
|
||||
throw new RuntimeException("Failed to send");
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
LOG.error("Error", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if(channel != null) {
|
||||
channel.close();
|
||||
channel = null;
|
||||
}
|
||||
if ( bootstrap!=null) {
|
||||
bootstrap.shutdown();
|
||||
if (group != null) {
|
||||
group.shutdownGracefully();
|
||||
group = null;
|
||||
bootstrap = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,17 +17,15 @@
|
|||
|
||||
package org.apache.spark.network.netty;
|
||||
|
||||
import io.netty.buffer.BufType;
|
||||
import io.netty.channel.ChannelInitializer;
|
||||
import io.netty.channel.socket.SocketChannel;
|
||||
import io.netty.handler.codec.string.StringEncoder;
|
||||
|
||||
|
||||
class FileClientChannelInitializer extends ChannelInitializer<SocketChannel> {
|
||||
|
||||
private FileClientHandler fhandler;
|
||||
private final FileClientHandler fhandler;
|
||||
|
||||
public FileClientChannelInitializer(FileClientHandler handler) {
|
||||
FileClientChannelInitializer(FileClientHandler handler) {
|
||||
fhandler = handler;
|
||||
}
|
||||
|
||||
|
@ -35,7 +33,7 @@ class FileClientChannelInitializer extends ChannelInitializer<SocketChannel> {
|
|||
public void initChannel(SocketChannel channel) {
|
||||
// file no more than 2G
|
||||
channel.pipeline()
|
||||
.addLast("encoder", new StringEncoder(BufType.BYTE))
|
||||
.addLast("encoder", new StringEncoder())
|
||||
.addLast("handler", fhandler);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,11 +19,11 @@ package org.apache.spark.network.netty;
|
|||
|
||||
import io.netty.buffer.ByteBuf;
|
||||
import io.netty.channel.ChannelHandlerContext;
|
||||
import io.netty.channel.ChannelInboundByteHandlerAdapter;
|
||||
import io.netty.channel.SimpleChannelInboundHandler;
|
||||
|
||||
import org.apache.spark.storage.BlockId;
|
||||
|
||||
abstract class FileClientHandler extends ChannelInboundByteHandlerAdapter {
|
||||
abstract class FileClientHandler extends SimpleChannelInboundHandler<ByteBuf> {
|
||||
|
||||
private FileHeader currentHeader = null;
|
||||
|
||||
|
@ -37,13 +37,7 @@ abstract class FileClientHandler extends ChannelInboundByteHandlerAdapter {
|
|||
public abstract void handleError(BlockId blockId);
|
||||
|
||||
@Override
|
||||
public ByteBuf newInboundBuffer(ChannelHandlerContext ctx) {
|
||||
// Use direct buffer if possible.
|
||||
return ctx.alloc().ioBuffer();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inboundBufferUpdated(ChannelHandlerContext ctx, ByteBuf in) {
|
||||
public void channelRead0(ChannelHandlerContext ctx, ByteBuf in) {
|
||||
// get header
|
||||
if (currentHeader == null && in.readableBytes() >= FileHeader.HEADER_SIZE()) {
|
||||
currentHeader = FileHeader.create(in.readBytes(FileHeader.HEADER_SIZE()));
|
||||
|
|
|
@ -20,34 +20,35 @@ package org.apache.spark.network.netty;
|
|||
import java.net.InetSocketAddress;
|
||||
|
||||
import io.netty.bootstrap.ServerBootstrap;
|
||||
import io.netty.channel.Channel;
|
||||
import io.netty.channel.ChannelFuture;
|
||||
import io.netty.channel.ChannelOption;
|
||||
import io.netty.channel.EventLoopGroup;
|
||||
import io.netty.channel.oio.OioEventLoopGroup;
|
||||
import io.netty.channel.socket.oio.OioServerSocketChannel;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
||||
/**
|
||||
* Server that accept the path of a file an echo back its content.
|
||||
*/
|
||||
class FileServer {
|
||||
|
||||
private Logger LOG = LoggerFactory.getLogger(this.getClass().getName());
|
||||
private static final Logger LOG = LoggerFactory.getLogger(FileServer.class.getName());
|
||||
|
||||
private ServerBootstrap bootstrap = null;
|
||||
private EventLoopGroup bossGroup = null;
|
||||
private EventLoopGroup workerGroup = null;
|
||||
private ChannelFuture channelFuture = null;
|
||||
private int port = 0;
|
||||
private Thread blockingThread = null;
|
||||
|
||||
public FileServer(PathResolver pResolver, int port) {
|
||||
FileServer(PathResolver pResolver, int port) {
|
||||
InetSocketAddress addr = new InetSocketAddress(port);
|
||||
|
||||
// Configure the server.
|
||||
bootstrap = new ServerBootstrap();
|
||||
bootstrap.group(new OioEventLoopGroup(), new OioEventLoopGroup())
|
||||
bossGroup = new OioEventLoopGroup();
|
||||
workerGroup = new OioEventLoopGroup();
|
||||
|
||||
ServerBootstrap bootstrap = new ServerBootstrap();
|
||||
bootstrap.group(bossGroup, workerGroup)
|
||||
.channel(OioServerSocketChannel.class)
|
||||
.option(ChannelOption.SO_BACKLOG, 100)
|
||||
.option(ChannelOption.SO_RCVBUF, 1500)
|
||||
|
@ -68,7 +69,8 @@ class FileServer {
|
|||
* Start the file server asynchronously in a new thread.
|
||||
*/
|
||||
public void start() {
|
||||
blockingThread = new Thread() {
|
||||
Thread blockingThread = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
channelFuture.channel().closeFuture().sync();
|
||||
|
@ -90,13 +92,19 @@ class FileServer {
|
|||
public void stop() {
|
||||
// Close the bound channel.
|
||||
if (channelFuture != null) {
|
||||
channelFuture.channel().close();
|
||||
channelFuture.channel().close().awaitUninterruptibly();
|
||||
channelFuture = null;
|
||||
}
|
||||
// Shutdown bootstrap.
|
||||
if (bootstrap != null) {
|
||||
bootstrap.shutdown();
|
||||
bootstrap = null;
|
||||
|
||||
// Shutdown event groups
|
||||
if (bossGroup != null) {
|
||||
bossGroup.shutdownGracefully();
|
||||
bossGroup = null;
|
||||
}
|
||||
|
||||
if (workerGroup != null) {
|
||||
workerGroup.shutdownGracefully();
|
||||
workerGroup = null;
|
||||
}
|
||||
// TODO: Shutdown all accepted channels as well ?
|
||||
}
|
||||
|
|
|
@ -23,12 +23,11 @@ import io.netty.handler.codec.DelimiterBasedFrameDecoder;
|
|||
import io.netty.handler.codec.Delimiters;
|
||||
import io.netty.handler.codec.string.StringDecoder;
|
||||
|
||||
|
||||
class FileServerChannelInitializer extends ChannelInitializer<SocketChannel> {
|
||||
|
||||
PathResolver pResolver;
|
||||
private final PathResolver pResolver;
|
||||
|
||||
public FileServerChannelInitializer(PathResolver pResolver) {
|
||||
FileServerChannelInitializer(PathResolver pResolver) {
|
||||
this.pResolver = pResolver;
|
||||
}
|
||||
|
||||
|
@ -36,7 +35,7 @@ class FileServerChannelInitializer extends ChannelInitializer<SocketChannel> {
|
|||
public void initChannel(SocketChannel channel) {
|
||||
channel.pipeline()
|
||||
.addLast("framer", new DelimiterBasedFrameDecoder(8192, Delimiters.lineDelimiter()))
|
||||
.addLast("strDecoder", new StringDecoder())
|
||||
.addLast("stringDecoder", new StringDecoder())
|
||||
.addLast("handler", new FileServerHandler(pResolver));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,22 +21,26 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
|
||||
import io.netty.channel.ChannelHandlerContext;
|
||||
import io.netty.channel.ChannelInboundMessageHandlerAdapter;
|
||||
import io.netty.channel.SimpleChannelInboundHandler;
|
||||
import io.netty.channel.DefaultFileRegion;
|
||||
|
||||
import org.apache.spark.storage.BlockId;
|
||||
import org.apache.spark.storage.FileSegment;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
class FileServerHandler extends ChannelInboundMessageHandlerAdapter<String> {
|
||||
class FileServerHandler extends SimpleChannelInboundHandler<String> {
|
||||
|
||||
PathResolver pResolver;
|
||||
private static final Logger LOG = LoggerFactory.getLogger(FileServerHandler.class.getName());
|
||||
|
||||
public FileServerHandler(PathResolver pResolver){
|
||||
private final PathResolver pResolver;
|
||||
|
||||
FileServerHandler(PathResolver pResolver){
|
||||
this.pResolver = pResolver;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void messageReceived(ChannelHandlerContext ctx, String blockIdString) {
|
||||
public void channelRead0(ChannelHandlerContext ctx, String blockIdString) {
|
||||
BlockId blockId = BlockId.apply(blockIdString);
|
||||
FileSegment fileSegment = pResolver.getBlockLocation(blockId);
|
||||
// if getBlockLocation returns null, close the channel
|
||||
|
@ -57,13 +61,13 @@ class FileServerHandler extends ChannelInboundMessageHandlerAdapter<String> {
|
|||
ctx.flush();
|
||||
return;
|
||||
}
|
||||
int len = new Long(length).intValue();
|
||||
int len = (int) length;
|
||||
ctx.write((new FileHeader(len, blockId)).buffer());
|
||||
try {
|
||||
ctx.sendFile(new DefaultFileRegion(new FileInputStream(file)
|
||||
ctx.write(new DefaultFileRegion(new FileInputStream(file)
|
||||
.getChannel(), fileSegment.offset(), fileSegment.length()));
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
LOG.error("Exception: ", e);
|
||||
}
|
||||
} else {
|
||||
ctx.write(new FileHeader(0, blockId).buffer());
|
||||
|
@ -73,7 +77,7 @@ class FileServerHandler extends ChannelInboundMessageHandlerAdapter<String> {
|
|||
|
||||
@Override
|
||||
public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) {
|
||||
cause.printStackTrace();
|
||||
LOG.error("Exception: ", cause);
|
||||
ctx.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,26 +1,26 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.network.netty;
|
||||
|
||||
import org.apache.spark.storage.BlockId;
|
||||
import org.apache.spark.storage.FileSegment;
|
||||
|
||||
public interface PathResolver {
|
||||
/** Get the file segment in which the given block resides. */
|
||||
public FileSegment getBlockLocation(BlockId blockId);
|
||||
}
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.network.netty;
|
||||
|
||||
import org.apache.spark.storage.BlockId;
|
||||
import org.apache.spark.storage.FileSegment;
|
||||
|
||||
public interface PathResolver {
|
||||
/** Get the file segment in which the given block resides. */
|
||||
FileSegment getBlockLocation(BlockId blockId);
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,8 @@
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Ignore messages below warning level from Jetty, because it's a bit verbose
+log4j.logger.org.eclipse.jetty=WARN
@@ -41,7 +41,7 @@ class Accumulable[R, T] (
    @transient initialValue: R,
    param: AccumulableParam[R, T])
  extends Serializable {

  val id = Accumulators.newId
  @transient private var value_ = initialValue // Current value on master
  val zero = param.zero(initialValue) // Zero value to be passed to workers

@@ -113,7 +113,7 @@ class Accumulable[R, T] (
  def setValue(newValue: R) {
    this.value = newValue
  }

  // Called by Java when deserializing an object
  private def readObject(in: ObjectInputStream) {
    in.defaultReadObject()

@@ -177,7 +177,7 @@ class GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Ser
  def zero(initialValue: R): R = {
    // We need to clone initialValue, but it's hard to specify that R should also be Cloneable.
    // Instead we'll serialize it to a buffer and load it back.
-    val ser = new JavaSerializer().newInstance()
+    val ser = new JavaSerializer(new SparkConf(false)).newInstance()
    val copy = ser.deserialize[R](ser.serialize(initialValue))
    copy.clear() // In case it contained stuff
    copy

@@ -215,7 +215,7 @@ private object Accumulators {
  val originals = Map[Long, Accumulable[_, _]]()
  val localAccums = Map[Thread, Map[Long, Accumulable[_, _]]]()
  var lastId: Long = 0

  def newId: Long = synchronized {
    lastId += 1
    return lastId
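The GrowableAccumulableParam hunk above now builds its scratch serializer with new JavaSerializer(new SparkConf(false)), passing an explicit configuration (one that does not load defaults) instead of relying on system properties. For orientation only, the accumulator machinery that Accumulable backs is driven from user code roughly as in the sketch below; this is a generic usage example assuming an existing SparkContext named sc, not code from this commit.

// Minimal usage sketch (assumes a SparkContext `sc` is already constructed).
val sum = sc.accumulator(0)                        // driver holds the current value
sc.parallelize(1 to 100).foreach(x => sum += x)    // workers only add, starting from the zero value
println(sum.value)                                 // merged result is read back on the driver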
@@ -99,7 +99,7 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc:
  override def ready(atMost: Duration)(implicit permit: CanAwait): SimpleFutureAction.this.type = {
    if (!atMost.isFinite()) {
      awaitResult()
-    } else {
+    } else jobWaiter.synchronized {
      val finishTime = System.currentTimeMillis() + atMost.toMillis
      while (!isCompleted) {
        val time = System.currentTimeMillis()
@@ -46,6 +46,7 @@ private[spark] class HttpServer(resourceBase: File) extends Logging {
    if (server != null) {
      throw new ServerStateException("Server is already started")
    } else {
+      logInfo("Starting HTTP Server")
      server = new Server()
      val connector = new SocketConnector
      connector.setMaxIdleTime(60*1000)
@@ -17,8 +17,8 @@

 package org.apache.spark

-import org.slf4j.Logger
-import org.slf4j.LoggerFactory
+import org.apache.log4j.{LogManager, PropertyConfigurator}
+import org.slf4j.{Logger, LoggerFactory}

 /**
  * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows

@@ -33,6 +33,7 @@ trait Logging {
  // Method to get or create the logger for this object
  protected def log: Logger = {
    if (log_ == null) {
+      initializeIfNecessary()
      var className = this.getClass.getName
      // Ignore trailing $'s in the class names for Scala objects
      if (className.endsWith("$")) {

@@ -89,7 +90,39 @@ trait Logging {
    log.isTraceEnabled
  }

  // Method for ensuring that logging is initialized, to avoid having multiple
  // threads do it concurrently (as SLF4J initialization is not thread safe).
  protected def initLogging() { log }
  private def initializeIfNecessary() {
    if (!Logging.initialized) {
      Logging.initLock.synchronized {
        if (!Logging.initialized) {
          initializeLogging()
        }
      }
    }
  }

  private def initializeLogging() {
    // If Log4j doesn't seem initialized, load a default properties file
    val log4jInitialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      val defaultLogProps = "org/apache/spark/log4j-defaults.properties"
      val classLoader = this.getClass.getClassLoader
      Option(classLoader.getResource(defaultLogProps)) match {
        case Some(url) =>
          PropertyConfigurator.configure(url)
          log.info(s"Using Spark's default log4j profile: $defaultLogProps")
        case None =>
          System.err.println(s"Spark was unable to load $defaultLogProps")
      }
    }
    Logging.initialized = true

    // Force a call into slf4j to initialize it. Avoids this happening from mutliple threads
    // and triggering this: http://mailman.qos.ch/pipermail/slf4j-dev/2010-April/002956.html
    log
  }
}

object Logging {
  @volatile private var initialized = false
  val initLock = new Object()
}
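The initialization added to the Logging trait above is a double-checked locking pattern: a @volatile flag plus a lock object in the Logging companion, so log4j is configured at most once even when many classes hit the logger concurrently. The standalone Scala sketch below restates just that pattern outside of Spark's classes; the object and method names are made up for illustration.

// Double-checked locking sketch (hypothetical names, same shape as the diff above).
object LazyInit {
  @volatile private var initialized = false
  private val initLock = new Object()

  def ensureInitialized(doInit: () => Unit): Unit = {
    if (!initialized) {            // cheap unsynchronized fast path
      initLock.synchronized {
        if (!initialized) {        // re-check under the lock before running init
          doInit()
          initialized = true
        }
      }
    }
  }
}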
core/src/main/scala/org/apache/spark/MapOutputTracker.scala

@@ -21,17 +21,15 @@ import java.io._
import java.util.zip.{GZIPInputStream, GZIPOutputStream}

import scala.collection.mutable.HashSet
import scala.concurrent.Await
import scala.concurrent.duration._

import akka.actor._
import akka.dispatch._
import akka.pattern.ask
import akka.util.Duration

import org.apache.spark.scheduler.MapStatus
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.{MetadataCleanerType, Utils, MetadataCleaner, TimeStampedHashMap}
import org.apache.spark.util.{AkkaUtils, MetadataCleaner, MetadataCleanerType, TimeStampedHashMap, Utils}

private[spark] sealed trait MapOutputTrackerMessage
private[spark] case class GetMapOutputStatuses(shuffleId: Int, requester: String)

@@ -52,10 +50,10 @@ private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster
  }
}

private[spark] class MapOutputTracker extends Logging {
private[spark] class MapOutputTracker(conf: SparkConf) extends Logging {

  private val timeout = AkkaUtils.askTimeout(conf)
  private val timeout = Duration.create(System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")

  // Set to the MapOutputTrackerActor living on the driver
  var trackerActor: ActorRef = _

@@ -67,14 +65,14 @@ private[spark] class MapOutputTracker extends Logging {
  protected val epochLock = new java.lang.Object

  private val metadataCleaner =
    new MetadataCleaner(MetadataCleanerType.MAP_OUTPUT_TRACKER, this.cleanup)
    new MetadataCleaner(MetadataCleanerType.MAP_OUTPUT_TRACKER, this.cleanup, conf)

  // Send a message to the trackerActor and get its result within a default timeout, or
  // throw a SparkException if this fails.
  private def askTracker(message: Any): Any = {
    try {
      val future = trackerActor.ask(message)(timeout)
      return Await.result(future, timeout)
      Await.result(future, timeout)
    } catch {
      case e: Exception =>
        throw new SparkException("Error communicating with MapOutputTracker", e)

@@ -117,11 +115,11 @@ private[spark] class MapOutputTracker extends Logging {
        fetching += shuffleId
      }
    }

    if (fetchedStatuses == null) {
      // We won the race to fetch the output locs; do so
      logInfo("Doing the fetch; tracker actor = " + trackerActor)
      val hostPort = Utils.localHostPort()
      val hostPort = Utils.localHostPort(conf)
      // This try-finally prevents hangs due to timeouts:
      try {
        val fetchedBytes =

@@ -144,7 +142,7 @@ private[spark] class MapOutputTracker extends Logging {
      else {
        throw new FetchFailedException(null, shuffleId, -1, reduceId,
          new Exception("Missing all output locations for shuffle " + shuffleId))
      }
    } else {
      statuses.synchronized {
        return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, statuses)

@@ -184,7 +182,8 @@ private[spark] class MapOutputTracker extends Logging {
  }
}

private[spark] class MapOutputTrackerMaster extends MapOutputTracker {
private[spark] class MapOutputTrackerMaster(conf: SparkConf)
  extends MapOutputTracker(conf) {

  // Cache a serialized version of the output statuses for each shuffle to send them out faster
  private var cacheEpoch = epoch

@@ -244,12 +243,12 @@ private[spark] class MapOutputTrackerMaster extends MapOutputTracker {
      case Some(bytes) =>
        return bytes
      case None =>
        statuses = mapStatuses(shuffleId)
        statuses = mapStatuses.getOrElse(shuffleId, Array[MapStatus]())
        epochGotten = epoch
      }
    }
    // If we got here, we failed to find the serialized locations in the cache, so we pulled
    // out a snapshot of the locations as "locs"; let's serialize and return that
    // out a snapshot of the locations as "statuses"; let's serialize and return that
    val bytes = MapOutputTracker.serializeMapStatuses(statuses)
    logInfo("Size of output statuses for shuffle %d is %d bytes".format(shuffleId, bytes.length))
    // Add them into the table only if the epoch hasn't changed while we were working

@@ -274,6 +273,10 @@ private[spark] class MapOutputTrackerMaster extends MapOutputTracker {
  override def updateEpoch(newEpoch: Long) {
    // This might be called on the MapOutputTrackerMaster if we're running in local mode.
  }

  def has(shuffleId: Int): Boolean = {
    cachedSerializedStatuses.get(shuffleId).isDefined || mapStatuses.contains(shuffleId)
  }
}

private[spark] object MapOutputTracker {

@@ -308,7 +311,7 @@ private[spark] object MapOutputTracker {
      statuses: Array[MapStatus]): Array[(BlockManagerId, Long)] = {
    assert (statuses != null)
    statuses.map {
      status =>
      status =>
        if (status == null) {
          throw new FetchFailedException(null, shuffleId, -1, reduceId,
            new Exception("Missing an output location for shuffle " + shuffleId))
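askTracker above follows a blocking ask-with-timeout pattern: send a message, wait up to the configured timeout for the reply, and wrap any failure in a SparkException. A minimal, self-contained sketch of that same pattern using plain scala.concurrent (a Future stands in for the actor reply so the example runs standalone; the 10-second value mirrors the spark.akka.askTimeout default shown above):

import scala.concurrent.{Await, Future}
import scala.concurrent.duration._
import scala.concurrent.ExecutionContext.Implicits.global

object AskTimeoutSketch {
  def main(args: Array[String]) {
    val timeout = 10.seconds
    // Stand-in for trackerActor.ask(message)(timeout)
    val reply: Future[String] = Future { "map output statuses" }
    // Block the caller until the reply arrives or the timeout elapses.
    val result = Await.result(reply, timeout)
    println(result)
  }
}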
core/src/main/scala/org/apache/spark/Partitioner.scala

@@ -17,8 +17,10 @@
package org.apache.spark

import org.apache.spark.util.Utils
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

/**
 * An object that defines how the elements in a key-value pair RDD are partitioned by key.

@@ -50,7 +52,7 @@ object Partitioner {
    for (r <- bySize if r.partitioner != None) {
      return r.partitioner.get
    }
    if (System.getProperty("spark.default.parallelism") != null) {
    if (rdd.context.conf.contains("spark.default.parallelism")) {
      return new HashPartitioner(rdd.context.defaultParallelism)
    } else {
      return new HashPartitioner(bySize.head.partitions.size)

@@ -72,7 +74,7 @@ class HashPartitioner(partitions: Int) extends Partitioner {
    case null => 0
    case _ => Utils.nonNegativeMod(key.hashCode, numPartitions)
  }

  override def equals(other: Any): Boolean = other match {
    case h: HashPartitioner =>
      h.numPartitions == numPartitions

@@ -85,10 +87,10 @@ class HashPartitioner(partitions: Int) extends Partitioner {
 * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly equal ranges.
 * Determines the ranges by sampling the RDD passed in.
 */
class RangePartitioner[K <% Ordered[K]: ClassManifest, V](
class RangePartitioner[K <% Ordered[K]: ClassTag, V](
    partitions: Int,
    @transient rdd: RDD[_ <: Product2[K,V]],
    private val ascending: Boolean = true)
    private val ascending: Boolean = true)
  extends Partitioner {

  // An array of upper bounds for the first (partitions - 1) partitions
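HashPartitioner's rule is unchanged by this hunk: a key's partition is its hash code reduced modulo numPartitions, with negative hashes folded back into range. A standalone sketch of that rule (nonNegativeMod reimplemented here purely for illustration; the real helper lives in org.apache.spark.util.Utils):

object HashPartitionSketch {
  // Scala's % can return a negative result for negative hash codes,
  // so fold those back into the range [0, mod).
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val raw = x % mod
    if (raw < 0) raw + mod else raw
  }

  def main(args: Array[String]) {
    val numPartitions = 4
    Seq("a", "spark", "shuffle", "xyz").foreach { key =>
      println(key + " -> partition " + nonNegativeMod(key.hashCode, numPartitions))
    }
  }
}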
core/src/main/scala/org/apache/spark/SparkConf.scala (new file, 198 lines)

@@ -0,0 +1,198 @@
package org.apache.spark

import scala.collection.JavaConverters._
import scala.collection.mutable.HashMap

import com.typesafe.config.ConfigFactory

/**
 * Configuration for a Spark application. Used to set various Spark parameters as key-value pairs.
 *
 * Most of the time, you would create a SparkConf object with `new SparkConf()`, which will load
 * values from both the `spark.*` Java system properties and any `spark.conf` on your application's
 * classpath (if it has one). In this case, system properties take priority over `spark.conf`, and
 * any parameters you set directly on the `SparkConf` object take priority over both of those.
 *
 * For unit tests, you can also call `new SparkConf(false)` to skip loading external settings and
 * get the same configuration no matter what is on the classpath.
 *
 * All setter methods in this class support chaining. For example, you can write
 * `new SparkConf().setMaster("local").setAppName("My app")`.
 *
 * Note that once a SparkConf object is passed to Spark, it is cloned and can no longer be modified
 * by the user. Spark does not support modifying the configuration at runtime.
 *
 * @param loadDefaults whether to load values from the system properties and classpath
 */
class SparkConf(loadDefaults: Boolean) extends Serializable with Cloneable with Logging {

  /** Create a SparkConf that loads defaults from system properties and the classpath */
  def this() = this(true)

  private val settings = new HashMap[String, String]()

  if (loadDefaults) {
    ConfigFactory.invalidateCaches()
    val typesafeConfig = ConfigFactory.systemProperties()
      .withFallback(ConfigFactory.parseResources("spark.conf"))
    for (e <- typesafeConfig.entrySet().asScala if e.getKey.startsWith("spark.")) {
      settings(e.getKey) = e.getValue.unwrapped.toString
    }
  }

  /** Set a configuration variable. */
  def set(key: String, value: String): SparkConf = {
    if (key == null) {
      throw new NullPointerException("null key")
    }
    if (value == null) {
      throw new NullPointerException("null value")
    }
    settings(key) = value
    this
  }

  /**
   * The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to
   * run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.
   */
  def setMaster(master: String): SparkConf = {
    set("spark.master", master)
  }

  /** Set a name for your application. Shown in the Spark web UI. */
  def setAppName(name: String): SparkConf = {
    set("spark.app.name", name)
  }

  /** Set JAR files to distribute to the cluster. */
  def setJars(jars: Seq[String]): SparkConf = {
    for (jar <- jars if (jar == null)) logWarning("null jar passed to SparkContext constructor")
    set("spark.jars", jars.filter(_ != null).mkString(","))
  }

  /** Set JAR files to distribute to the cluster. (Java-friendly version.) */
  def setJars(jars: Array[String]): SparkConf = {
    setJars(jars.toSeq)
  }

  /**
   * Set an environment variable to be used when launching executors for this application.
   * These variables are stored as properties of the form spark.executorEnv.VAR_NAME
   * (for example spark.executorEnv.PATH) but this method makes them easier to set.
   */
  def setExecutorEnv(variable: String, value: String): SparkConf = {
    set("spark.executorEnv." + variable, value)
  }

  /**
   * Set multiple environment variables to be used when launching executors.
   * These variables are stored as properties of the form spark.executorEnv.VAR_NAME
   * (for example spark.executorEnv.PATH) but this method makes them easier to set.
   */
  def setExecutorEnv(variables: Seq[(String, String)]): SparkConf = {
    for ((k, v) <- variables) {
      setExecutorEnv(k, v)
    }
    this
  }

  /**
   * Set multiple environment variables to be used when launching executors.
   * (Java-friendly version.)
   */
  def setExecutorEnv(variables: Array[(String, String)]): SparkConf = {
    setExecutorEnv(variables.toSeq)
  }

  /**
   * Set the location where Spark is installed on worker nodes.
   */
  def setSparkHome(home: String): SparkConf = {
    set("spark.home", home)
  }

  /** Set multiple parameters together */
  def setAll(settings: Traversable[(String, String)]) = {
    this.settings ++= settings
    this
  }

  /** Set a parameter if it isn't already configured */
  def setIfMissing(key: String, value: String): SparkConf = {
    if (!settings.contains(key)) {
      settings(key) = value
    }
    this
  }

  /** Remove a parameter from the configuration */
  def remove(key: String): SparkConf = {
    settings.remove(key)
    this
  }

  /** Get a parameter; throws a NoSuchElementException if it's not set */
  def get(key: String): String = {
    settings.getOrElse(key, throw new NoSuchElementException(key))
  }

  /** Get a parameter, falling back to a default if not set */
  def get(key: String, defaultValue: String): String = {
    settings.getOrElse(key, defaultValue)
  }

  /** Get a parameter as an Option */
  def getOption(key: String): Option[String] = {
    settings.get(key)
  }

  /** Get all parameters as a list of pairs */
  def getAll: Array[(String, String)] = settings.clone().toArray

  /** Get a parameter as an integer, falling back to a default if not set */
  def getInt(key: String, defaultValue: Int): Int = {
    getOption(key).map(_.toInt).getOrElse(defaultValue)
  }

  /** Get a parameter as a long, falling back to a default if not set */
  def getLong(key: String, defaultValue: Long): Long = {
    getOption(key).map(_.toLong).getOrElse(defaultValue)
  }

  /** Get a parameter as a double, falling back to a default if not set */
  def getDouble(key: String, defaultValue: Double): Double = {
    getOption(key).map(_.toDouble).getOrElse(defaultValue)
  }

  /** Get a parameter as a boolean, falling back to a default if not set */
  def getBoolean(key: String, defaultValue: Boolean): Boolean = {
    getOption(key).map(_.toBoolean).getOrElse(defaultValue)
  }

  /** Get all executor environment variables set on this SparkConf */
  def getExecutorEnv: Seq[(String, String)] = {
    val prefix = "spark.executorEnv."
    getAll.filter{case (k, v) => k.startsWith(prefix)}
          .map{case (k, v) => (k.substring(prefix.length), v)}
  }

  /** Get all akka conf variables set on this SparkConf */
  def getAkkaConf: Seq[(String, String)] = getAll.filter {case (k, v) => k.startsWith("akka.")}

  /** Does the configuration contain a given parameter? */
  def contains(key: String): Boolean = settings.contains(key)

  /** Copy this object */
  override def clone: SparkConf = {
    new SparkConf(false).setAll(settings)
  }

  /**
   * Return a string listing all keys and values, one per line. This is useful to print the
   * configuration out for debugging.
   */
  def toDebugString: String = {
    settings.toArray.sorted.map{case (k, v) => k + "=" + v}.mkString("\n")
  }
}
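To make the chained-setter and typed-getter API above concrete, a minimal usage sketch (not part of this commit; the configuration values are illustrative, and the boolean constructor argument skips the system-property/spark.conf loading described in the class comment):

import org.apache.spark.SparkConf

object ConfSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf(false)
      .setMaster("local[4]")
      .setAppName("Conf sketch")
      .set("spark.executor.memory", "1g")

    println(conf.get("spark.master"))                  // local[4]
    println(conf.getInt("spark.akka.askTimeout", 10))  // not set, so the default: 10
    println(conf.contains("spark.jars"))               // false
  }
}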
core/src/main/scala/org/apache/spark/SparkContext.scala

@@ -19,35 +19,23 @@ package org.apache.spark

import java.io._
import java.net.URI
import java.util.Properties
import java.util.{UUID, Properties}
import java.util.concurrent.atomic.AtomicInteger

import scala.collection.Map
import scala.collection.{Map, Set}
import scala.collection.generic.Growable
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.reflect.{ClassTag, classTag}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.io.BooleanWritable
import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.DoubleWritable
import org.apache.hadoop.io.FloatWritable
import org.apache.hadoop.io.IntWritable
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapred.FileInputFormat
import org.apache.hadoop.mapred.InputFormat
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileInputFormat
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
import org.apache.hadoop.mapreduce.{Job => NewHadoopJob}
import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable,
  FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat,
  TextInputFormat}
import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}

import org.apache.mesos.MesosNativeLibrary

import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil}
@@ -55,59 +43,106 @@ import org.apache.spark.partial.{ApproximateEvaluator, PartialResult}
import org.apache.spark.rdd._
import org.apache.spark.scheduler._
import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend,
  SparkDeploySchedulerBackend, ClusterScheduler, SimrSchedulerBackend}
  SparkDeploySchedulerBackend, SimrSchedulerBackend}
import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend}
import org.apache.spark.scheduler.local.LocalScheduler
import org.apache.spark.scheduler.StageInfo
import org.apache.spark.scheduler.local.LocalBackend
import org.apache.spark.storage.{BlockManagerSource, RDDInfo, StorageStatus, StorageUtils}
import org.apache.spark.ui.SparkUI
import org.apache.spark.util.{ClosureCleaner, MetadataCleaner, MetadataCleanerType,
  TimeStampedHashMap, Utils}
import org.apache.spark.util.{Utils, TimeStampedHashMap, MetadataCleaner, MetadataCleanerType,
  ClosureCleaner}

/**
 * Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
 * cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
 *
 * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
 * @param appName A name for your application, to display on the cluster web UI.
 * @param sparkHome Location where Spark is installed on cluster nodes.
 * @param jars Collection of JARs to send to the cluster. These can be paths on the local file
 *   system or HDFS, HTTP, HTTPS, or FTP URLs.
 * @param environment Environment variables to set on worker nodes.
 * @param config a Spark Config object describing the application configuration. Any settings in
 *   this config override the default configs as well as system properties.
 * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. Can
 *   be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]]
 *   from a list of input files or InputFormats for the application.
 */
class SparkContext(
    val master: String,
    val appName: String,
    val sparkHome: String = null,
    val jars: Seq[String] = Nil,
    val environment: Map[String, String] = Map(),
    // This is used only by yarn for now, but should be relevant to other cluster types (mesos, etc)
    // too. This is typically generated from InputFormatInfo.computePreferredLocations .. host, set
    // of data-local splits on host
    val preferredNodeLocationData: scala.collection.Map[String, scala.collection.Set[SplitInfo]] =
      scala.collection.immutable.Map())
    config: SparkConf,
    // This is used only by YARN for now, but should be relevant to other cluster types (Mesos, etc)
    // too. This is typically generated from InputFormatInfo.computePreferredLocations. It contains
    // a map from hostname to a list of input format splits on the host.
    val preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map())
  extends Logging {

  // Ensure logging is initialized before we spawn any threads
  initLogging()
  /**
   * Alternative constructor that allows setting common Spark properties directly
   *
   * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
   * @param appName A name for your application, to display on the cluster web UI
   * @param conf a [[org.apache.spark.SparkConf]] object specifying other Spark parameters
   */
  def this(master: String, appName: String, conf: SparkConf) =
    this(SparkContext.updatedConf(conf, master, appName))

  /**
   * Alternative constructor that allows setting common Spark properties directly
   *
   * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
   * @param appName A name for your application, to display on the cluster web UI.
   * @param sparkHome Location where Spark is installed on cluster nodes.
   * @param jars Collection of JARs to send to the cluster. These can be paths on the local file
   *   system or HDFS, HTTP, HTTPS, or FTP URLs.
   * @param environment Environment variables to set on worker nodes.
   */
  def this(
      master: String,
      appName: String,
      sparkHome: String = null,
      jars: Seq[String] = Nil,
      environment: Map[String, String] = Map(),
      preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) =
  {
    this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment),
      preferredNodeLocationData)
  }

  private[spark] val conf = config.clone()

  /**
   * Return a copy of this SparkContext's configuration. The configuration ''cannot'' be
   * changed at runtime.
   */
  def getConf: SparkConf = conf.clone()

  if (!conf.contains("spark.master")) {
    throw new SparkException("A master URL must be set in your configuration")
  }
  if (!conf.contains("spark.app.name")) {
    throw new SparkException("An application name must be set in your configuration")
  }

  if (conf.get("spark.logConf", "false").toBoolean) {
    logInfo("Spark configuration:\n" + conf.toDebugString)
  }

  // Set Spark driver host and port system properties
  if (System.getProperty("spark.driver.host") == null) {
    System.setProperty("spark.driver.host", Utils.localHostName())
  }
  if (System.getProperty("spark.driver.port") == null) {
    System.setProperty("spark.driver.port", "0")
  conf.setIfMissing("spark.driver.host", Utils.localHostName())
  conf.setIfMissing("spark.driver.port", "0")

  val jars: Seq[String] = if (conf.contains("spark.jars")) {
    conf.get("spark.jars").split(",").filter(_.size != 0)
  } else {
    null
  }

  val master = conf.get("spark.master")
  val appName = conf.get("spark.app.name")

  val isLocal = (master == "local" || master.startsWith("local["))

  // Create the Spark execution environment (cache, map output tracker, etc)
  private[spark] val env = SparkEnv.createFromSystemProperties(
  private[spark] val env = SparkEnv.create(
    conf,
    "<driver>",
    System.getProperty("spark.driver.host"),
    System.getProperty("spark.driver.port").toInt,
    true,
    isLocal)
    conf.get("spark.driver.host"),
    conf.get("spark.driver.port").toInt,
    isDriver = true,
    isLocal = isLocal)
  SparkEnv.set(env)

  // Used to store a URL for each static file/jar together with the file's local timestamp
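As a minimal sketch of the conf-driven constructor introduced in this hunk (not part of this commit; the local master, app name, and workload are illustrative), the master URL and application name now travel through a SparkConf, which the validation above requires before the context will start:

import org.apache.spark.{SparkConf, SparkContext}

object ContextSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[2]").setAppName("Context sketch")
    val sc = new SparkContext(conf)
    try {
      val sum = sc.parallelize(1 to 100).reduce(_ + _)
      println("sum = " + sum)   // 5050
    } finally {
      sc.stop()
    }
  }
}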
@ -116,7 +151,8 @@ class SparkContext(
|
|||
|
||||
// Keeps track of all persisted RDDs
|
||||
private[spark] val persistentRdds = new TimeStampedHashMap[Int, RDD[_]]
|
||||
private[spark] val metadataCleaner = new MetadataCleaner(MetadataCleanerType.SPARK_CONTEXT, this.cleanup)
|
||||
private[spark] val metadataCleaner =
|
||||
new MetadataCleaner(MetadataCleanerType.SPARK_CONTEXT, this.cleanup, conf)
|
||||
|
||||
// Initialize the Spark UI
|
||||
private[spark] val ui = new SparkUI(this)
|
||||
|
@ -126,23 +162,30 @@ class SparkContext(
|
|||
|
||||
// Add each JAR given through the constructor
|
||||
if (jars != null) {
|
||||
jars.foreach { addJar(_) }
|
||||
jars.foreach(addJar)
|
||||
}
|
||||
|
||||
private[spark] val executorMemory = conf.getOption("spark.executor.memory")
|
||||
.orElse(Option(System.getenv("SPARK_MEM")))
|
||||
.map(Utils.memoryStringToMb)
|
||||
.getOrElse(512)
|
||||
|
||||
// Environment variables to pass to our executors
|
||||
private[spark] val executorEnvs = HashMap[String, String]()
|
||||
// Note: SPARK_MEM is included for Mesos, but overwritten for standalone mode in ExecutorRunner
|
||||
for (key <- Seq("SPARK_CLASSPATH", "SPARK_LIBRARY_PATH", "SPARK_JAVA_OPTS", "SPARK_TESTING")) {
|
||||
val value = System.getenv(key)
|
||||
if (value != null) {
|
||||
executorEnvs(key) = value
|
||||
}
|
||||
for (key <- Seq("SPARK_CLASSPATH", "SPARK_LIBRARY_PATH", "SPARK_JAVA_OPTS");
|
||||
value <- Option(System.getenv(key))) {
|
||||
executorEnvs(key) = value
|
||||
}
|
||||
// Convert java options to env vars as a work around
|
||||
// since we can't set env vars directly in sbt.
|
||||
for { (envKey, propKey) <- Seq(("SPARK_HOME", "spark.home"), ("SPARK_TESTING", "spark.testing"))
|
||||
value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
|
||||
executorEnvs(envKey) = value
|
||||
}
|
||||
// Since memory can be set with a system property too, use that
|
||||
executorEnvs("SPARK_MEM") = SparkContext.executorMemoryRequested + "m"
|
||||
if (environment != null) {
|
||||
executorEnvs ++= environment
|
||||
}
|
||||
executorEnvs("SPARK_MEM") = executorMemory + "m"
|
||||
executorEnvs ++= conf.getExecutorEnv
|
||||
|
||||
// Set SPARK_USER for user who is running SparkContext.
|
||||
val sparkUser = Option {
|
||||
|
@ -153,122 +196,35 @@ class SparkContext(
|
|||
executorEnvs("SPARK_USER") = sparkUser
|
||||
|
||||
// Create and start the scheduler
|
||||
private[spark] var taskScheduler: TaskScheduler = {
|
||||
// Regular expression used for local[N] master format
|
||||
val LOCAL_N_REGEX = """local\[([0-9]+)\]""".r
|
||||
// Regular expression for local[N, maxRetries], used in tests with failing tasks
|
||||
val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+)\s*,\s*([0-9]+)\]""".r
|
||||
// Regular expression for simulating a Spark cluster of [N, cores, memory] locally
|
||||
val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
|
||||
// Regular expression for connecting to Spark deploy clusters
|
||||
val SPARK_REGEX = """spark://(.*)""".r
|
||||
// Regular expression for connection to Mesos cluster
|
||||
val MESOS_REGEX = """mesos://(.*)""".r
|
||||
// Regular expression for connection to Simr cluster
|
||||
val SIMR_REGEX = """simr://(.*)""".r
|
||||
|
||||
master match {
|
||||
case "local" =>
|
||||
new LocalScheduler(1, 0, this)
|
||||
|
||||
case LOCAL_N_REGEX(threads) =>
|
||||
new LocalScheduler(threads.toInt, 0, this)
|
||||
|
||||
case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
|
||||
new LocalScheduler(threads.toInt, maxFailures.toInt, this)
|
||||
|
||||
case SPARK_REGEX(sparkUrl) =>
|
||||
val scheduler = new ClusterScheduler(this)
|
||||
val masterUrls = sparkUrl.split(",").map("spark://" + _)
|
||||
val backend = new SparkDeploySchedulerBackend(scheduler, this, masterUrls, appName)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case SIMR_REGEX(simrUrl) =>
|
||||
val scheduler = new ClusterScheduler(this)
|
||||
val backend = new SimrSchedulerBackend(scheduler, this, simrUrl)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
|
||||
// Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
|
||||
val memoryPerSlaveInt = memoryPerSlave.toInt
|
||||
if (SparkContext.executorMemoryRequested > memoryPerSlaveInt) {
|
||||
throw new SparkException(
|
||||
"Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
|
||||
memoryPerSlaveInt, SparkContext.executorMemoryRequested))
|
||||
}
|
||||
|
||||
val scheduler = new ClusterScheduler(this)
|
||||
val localCluster = new LocalSparkCluster(
|
||||
numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt)
|
||||
val masterUrls = localCluster.start()
|
||||
val backend = new SparkDeploySchedulerBackend(scheduler, this, masterUrls, appName)
|
||||
scheduler.initialize(backend)
|
||||
backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => {
|
||||
localCluster.stop()
|
||||
}
|
||||
scheduler
|
||||
|
||||
case "yarn-standalone" =>
|
||||
val scheduler = try {
|
||||
val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
|
||||
val cons = clazz.getConstructor(classOf[SparkContext])
|
||||
cons.newInstance(this).asInstanceOf[ClusterScheduler]
|
||||
} catch {
|
||||
// TODO: Enumerate the exact reasons why it can fail
|
||||
// But irrespective of it, it means we cannot proceed !
|
||||
case th: Throwable => {
|
||||
throw new SparkException("YARN mode not available ?", th)
|
||||
}
|
||||
}
|
||||
val backend = new CoarseGrainedSchedulerBackend(scheduler, this.env.actorSystem)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case MESOS_REGEX(mesosUrl) =>
|
||||
MesosNativeLibrary.load()
|
||||
val scheduler = new ClusterScheduler(this)
|
||||
val coarseGrained = System.getProperty("spark.mesos.coarse", "false").toBoolean
|
||||
val backend = if (coarseGrained) {
|
||||
new CoarseMesosSchedulerBackend(scheduler, this, mesosUrl, appName)
|
||||
} else {
|
||||
new MesosSchedulerBackend(scheduler, this, mesosUrl, appName)
|
||||
}
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case _ =>
|
||||
throw new SparkException("Could not parse Master URL: '" + master + "'")
|
||||
}
|
||||
}
|
||||
private[spark] var taskScheduler = SparkContext.createTaskScheduler(this, master, appName)
|
||||
taskScheduler.start()
|
||||
|
||||
@volatile private[spark] var dagScheduler = new DAGScheduler(taskScheduler)
|
||||
dagScheduler.start()
|
||||
|
||||
ui.start()
|
||||
|
||||
/** A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. */
|
||||
val hadoopConfiguration = {
|
||||
val env = SparkEnv.get
|
||||
val conf = SparkHadoopUtil.get.newConfiguration()
|
||||
val hadoopConf = SparkHadoopUtil.get.newConfiguration()
|
||||
// Explicitly check for S3 environment variables
|
||||
if (System.getenv("AWS_ACCESS_KEY_ID") != null &&
|
||||
System.getenv("AWS_SECRET_ACCESS_KEY") != null) {
|
||||
conf.set("fs.s3.awsAccessKeyId", System.getenv("AWS_ACCESS_KEY_ID"))
|
||||
conf.set("fs.s3n.awsAccessKeyId", System.getenv("AWS_ACCESS_KEY_ID"))
|
||||
conf.set("fs.s3.awsSecretAccessKey", System.getenv("AWS_SECRET_ACCESS_KEY"))
|
||||
conf.set("fs.s3n.awsSecretAccessKey", System.getenv("AWS_SECRET_ACCESS_KEY"))
|
||||
hadoopConf.set("fs.s3.awsAccessKeyId", System.getenv("AWS_ACCESS_KEY_ID"))
|
||||
hadoopConf.set("fs.s3n.awsAccessKeyId", System.getenv("AWS_ACCESS_KEY_ID"))
|
||||
hadoopConf.set("fs.s3.awsSecretAccessKey", System.getenv("AWS_SECRET_ACCESS_KEY"))
|
||||
hadoopConf.set("fs.s3n.awsSecretAccessKey", System.getenv("AWS_SECRET_ACCESS_KEY"))
|
||||
}
|
||||
// Copy any "spark.hadoop.foo=bar" system properties into conf as "foo=bar"
|
||||
Utils.getSystemProperties.foreach { case (key, value) =>
|
||||
conf.getAll.foreach { case (key, value) =>
|
||||
if (key.startsWith("spark.hadoop.")) {
|
||||
conf.set(key.substring("spark.hadoop.".length), value)
|
||||
hadoopConf.set(key.substring("spark.hadoop.".length), value)
|
||||
}
|
||||
}
|
||||
val bufferSize = System.getProperty("spark.buffer.size", "65536")
|
||||
conf.set("io.file.buffer.size", bufferSize)
|
||||
conf
|
||||
val bufferSize = conf.get("spark.buffer.size", "65536")
|
||||
hadoopConf.set("io.file.buffer.size", bufferSize)
|
||||
hadoopConf
|
||||
}
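The hadoopConfiguration block above copies any "spark.hadoop.foo=bar" setting into the Hadoop configuration as "foo=bar". The same prefix-stripping rule, restated as a tiny standalone sketch (key names and values are illustrative, not from this commit):

object HadoopPrefixSketch {
  def main(args: Array[String]) {
    val sparkSettings = Map(
      "spark.hadoop.fs.defaultFS" -> "hdfs://namenode:8020",
      "spark.master"              -> "local[2]")

    // Keep only spark.hadoop.* keys and strip the prefix before handing them to Hadoop.
    val hadoopProps = sparkSettings.collect {
      case (key, value) if key.startsWith("spark.hadoop.") =>
        key.substring("spark.hadoop.".length) -> value
    }
    hadoopProps.foreach { case (k, v) => println(k + " = " + v) }   // fs.defaultFS = hdfs://namenode:8020
  }
}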
|
||||
|
||||
private[spark] var checkpointDir: Option[String] = None
|
||||
|
@ -278,7 +234,7 @@ class SparkContext(
|
|||
override protected def childValue(parent: Properties): Properties = new Properties(parent)
|
||||
}
|
||||
|
||||
private[spark] def getLocalProperties(): Properties = localProperties.get()
|
||||
private[spark] def getLocalProperties: Properties = localProperties.get()
|
||||
|
||||
private[spark] def setLocalProperties(props: Properties) {
|
||||
localProperties.set(props)
|
||||
|
@ -354,19 +310,19 @@ class SparkContext(
|
|||
// Methods for creating RDDs
|
||||
|
||||
/** Distribute a local Scala collection to form an RDD. */
|
||||
def parallelize[T: ClassManifest](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
|
||||
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
|
||||
new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
|
||||
}
|
||||
|
||||
/** Distribute a local Scala collection to form an RDD. */
|
||||
def makeRDD[T: ClassManifest](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
|
||||
def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
|
||||
parallelize(seq, numSlices)
|
||||
}
|
||||
|
||||
/** Distribute a local Scala collection to form an RDD, with one or more
|
||||
* location preferences (hostnames of Spark nodes) for each object.
|
||||
* Create a new partition for each collection item. */
|
||||
def makeRDD[T: ClassManifest](seq: Seq[(T, Seq[String])]): RDD[T] = {
|
||||
def makeRDD[T: ClassTag](seq: Seq[(T, Seq[String])]): RDD[T] = {
|
||||
val indexToPrefs = seq.zipWithIndex.map(t => (t._2, t._1._2)).toMap
|
||||
new ParallelCollectionRDD[T](this, seq.map(_._1), seq.size, indexToPrefs)
|
||||
}
|
||||
|
@ -419,7 +375,7 @@ class SparkContext(
|
|||
}
|
||||
|
||||
/**
|
||||
* Smarter version of hadoopFile() that uses class manifests to figure out the classes of keys,
|
||||
* Smarter version of hadoopFile() that uses class tags to figure out the classes of keys,
|
||||
* values and the InputFormat so that users don't need to pass them directly. Instead, callers
|
||||
* can just write, for example,
|
||||
* {{{
|
||||
|
@ -427,17 +383,17 @@ class SparkContext(
|
|||
* }}}
|
||||
*/
|
||||
def hadoopFile[K, V, F <: InputFormat[K, V]](path: String, minSplits: Int)
|
||||
(implicit km: ClassManifest[K], vm: ClassManifest[V], fm: ClassManifest[F])
|
||||
(implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F])
|
||||
: RDD[(K, V)] = {
|
||||
hadoopFile(path,
|
||||
fm.erasure.asInstanceOf[Class[F]],
|
||||
km.erasure.asInstanceOf[Class[K]],
|
||||
vm.erasure.asInstanceOf[Class[V]],
|
||||
fm.runtimeClass.asInstanceOf[Class[F]],
|
||||
km.runtimeClass.asInstanceOf[Class[K]],
|
||||
vm.runtimeClass.asInstanceOf[Class[V]],
|
||||
minSplits)
|
||||
}
|
||||
|
||||
/**
|
||||
* Smarter version of hadoopFile() that uses class manifests to figure out the classes of keys,
|
||||
* Smarter version of hadoopFile() that uses class tags to figure out the classes of keys,
|
||||
* values and the InputFormat so that users don't need to pass them directly. Instead, callers
|
||||
* can just write, for example,
|
||||
* {{{
|
||||
|
@ -445,17 +401,17 @@ class SparkContext(
|
|||
* }}}
|
||||
*/
|
||||
def hadoopFile[K, V, F <: InputFormat[K, V]](path: String)
|
||||
(implicit km: ClassManifest[K], vm: ClassManifest[V], fm: ClassManifest[F]): RDD[(K, V)] =
|
||||
(implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] =
|
||||
hadoopFile[K, V, F](path, defaultMinSplits)
|
||||
|
||||
/** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */
|
||||
def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](path: String)
|
||||
(implicit km: ClassManifest[K], vm: ClassManifest[V], fm: ClassManifest[F]): RDD[(K, V)] = {
|
||||
(implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = {
|
||||
newAPIHadoopFile(
|
||||
path,
|
||||
fm.erasure.asInstanceOf[Class[F]],
|
||||
km.erasure.asInstanceOf[Class[K]],
|
||||
vm.erasure.asInstanceOf[Class[V]])
|
||||
fm.runtimeClass.asInstanceOf[Class[F]],
|
||||
km.runtimeClass.asInstanceOf[Class[K]],
|
||||
vm.runtimeClass.asInstanceOf[Class[V]])
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -513,11 +469,11 @@ class SparkContext(
|
|||
* IntWritable). The most natural thing would've been to have implicit objects for the
|
||||
* converters, but then we couldn't have an object for every subclass of Writable (you can't
|
||||
* have a parameterized singleton object). We use functions instead to create a new converter
|
||||
* for the appropriate type. In addition, we pass the converter a ClassManifest of its type to
|
||||
* for the appropriate type. In addition, we pass the converter a ClassTag of its type to
|
||||
* allow it to figure out the Writable class to use in the subclass case.
|
||||
*/
|
||||
def sequenceFile[K, V](path: String, minSplits: Int = defaultMinSplits)
|
||||
(implicit km: ClassManifest[K], vm: ClassManifest[V],
|
||||
(implicit km: ClassTag[K], vm: ClassTag[V],
|
||||
kcf: () => WritableConverter[K], vcf: () => WritableConverter[V])
|
||||
: RDD[(K, V)] = {
|
||||
val kc = kcf()
|
||||
|
@ -536,7 +492,7 @@ class SparkContext(
|
|||
* slow if you use the default serializer (Java serialization), though the nice thing about it is
|
||||
* that there's very little effort required to save arbitrary objects.
|
||||
*/
|
||||
def objectFile[T: ClassManifest](
|
||||
def objectFile[T: ClassTag](
|
||||
path: String,
|
||||
minSplits: Int = defaultMinSplits
|
||||
): RDD[T] = {
|
||||
|
@ -545,17 +501,17 @@ class SparkContext(
|
|||
}
|
||||
|
||||
|
||||
protected[spark] def checkpointFile[T: ClassManifest](
|
||||
protected[spark] def checkpointFile[T: ClassTag](
|
||||
path: String
|
||||
): RDD[T] = {
|
||||
new CheckpointRDD[T](this, path)
|
||||
}
|
||||
|
||||
/** Build the union of a list of RDDs. */
|
||||
def union[T: ClassManifest](rdds: Seq[RDD[T]]): RDD[T] = new UnionRDD(this, rdds)
|
||||
def union[T: ClassTag](rdds: Seq[RDD[T]]): RDD[T] = new UnionRDD(this, rdds)
|
||||
|
||||
/** Build the union of a list of RDDs passed as variable-length arguments. */
|
||||
def union[T: ClassManifest](first: RDD[T], rest: RDD[T]*): RDD[T] =
|
||||
def union[T: ClassTag](first: RDD[T], rest: RDD[T]*): RDD[T] =
|
||||
new UnionRDD(this, Seq(first) ++ rest)
|
||||
|
||||
// Methods for creating shared variables
|
||||
|
@ -608,10 +564,8 @@ class SparkContext(
|
|||
}
|
||||
addedFiles(key) = System.currentTimeMillis
|
||||
|
||||
// Fetch the file locally in case a job is executed locally.
|
||||
// Jobs that run through LocalScheduler will already fetch the required dependencies,
|
||||
// but jobs run in DAGScheduler.runLocally() will not so we must fetch the files here.
|
||||
Utils.fetchFile(path, new File(SparkFiles.getRootDirectory))
|
||||
// Fetch the file locally in case a job is executed using DAGScheduler.runLocally().
|
||||
Utils.fetchFile(path, new File(SparkFiles.getRootDirectory), conf)
|
||||
|
||||
logInfo("Added file " + path + " at " + key + " with timestamp " + addedFiles(key))
|
||||
}
|
||||
|
@ -781,15 +735,27 @@ class SparkContext(
|
|||
* (in that order of preference). If neither of these is set, return None.
|
||||
*/
|
||||
private[spark] def getSparkHome(): Option[String] = {
|
||||
if (sparkHome != null) {
|
||||
Some(sparkHome)
|
||||
} else if (System.getProperty("spark.home") != null) {
|
||||
Some(System.getProperty("spark.home"))
|
||||
} else if (System.getenv("SPARK_HOME") != null) {
|
||||
Some(System.getenv("SPARK_HOME"))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
conf.getOption("spark.home").orElse(Option(System.getenv("SPARK_HOME")))
|
||||
}
|
||||
|
||||
/**
|
||||
* Support function for API backtraces.
|
||||
*/
|
||||
def setCallSite(site: String) {
|
||||
setLocalProperty("externalCallSite", site)
|
||||
}
|
||||
|
||||
/**
|
||||
* Support function for API backtraces.
|
||||
*/
|
||||
def clearCallSite() {
|
||||
setLocalProperty("externalCallSite", null)
|
||||
}
|
||||
|
||||
private[spark] def getCallSite(): String = {
|
||||
val callSite = getLocalProperty("externalCallSite")
|
||||
if (callSite == null) return Utils.formatSparkCallSite
|
||||
callSite
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -798,13 +764,13 @@ class SparkContext(
|
|||
* flag specifies whether the scheduler can run the computation on the driver rather than
|
||||
* shipping it out to the cluster, for short actions like first().
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
def runJob[T, U: ClassTag](
|
||||
rdd: RDD[T],
|
||||
func: (TaskContext, Iterator[T]) => U,
|
||||
partitions: Seq[Int],
|
||||
allowLocal: Boolean,
|
||||
resultHandler: (Int, U) => Unit) {
|
||||
val callSite = Utils.formatSparkCallSite
|
||||
val callSite = getCallSite
|
||||
val cleanedFunc = clean(func)
|
||||
logInfo("Starting job: " + callSite)
|
||||
val start = System.nanoTime
|
||||
|
@ -819,7 +785,7 @@ class SparkContext(
|
|||
* allowLocal flag specifies whether the scheduler can run the computation on the driver rather
|
||||
* than shipping it out to the cluster, for short actions like first().
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
def runJob[T, U: ClassTag](
|
||||
rdd: RDD[T],
|
||||
func: (TaskContext, Iterator[T]) => U,
|
||||
partitions: Seq[Int],
|
||||
|
@ -834,7 +800,7 @@ class SparkContext(
|
|||
* Run a job on a given set of partitions of an RDD, but take a function of type
|
||||
* `Iterator[T] => U` instead of `(TaskContext, Iterator[T]) => U`.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
def runJob[T, U: ClassTag](
|
||||
rdd: RDD[T],
|
||||
func: Iterator[T] => U,
|
||||
partitions: Seq[Int],
|
||||
|
@ -846,21 +812,21 @@ class SparkContext(
|
|||
/**
|
||||
* Run a job on all partitions in an RDD and return the results in an array.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): Array[U] = {
|
||||
def runJob[T, U: ClassTag](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): Array[U] = {
|
||||
runJob(rdd, func, 0 until rdd.partitions.size, false)
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a job on all partitions in an RDD and return the results in an array.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
|
||||
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
|
||||
runJob(rdd, func, 0 until rdd.partitions.size, false)
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a job on all partitions in an RDD and pass the results to a handler function.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
def runJob[T, U: ClassTag](
|
||||
rdd: RDD[T],
|
||||
processPartition: (TaskContext, Iterator[T]) => U,
|
||||
resultHandler: (Int, U) => Unit)
|
||||
|
@ -871,7 +837,7 @@ class SparkContext(
|
|||
/**
|
||||
* Run a job on all partitions in an RDD and pass the results to a handler function.
|
||||
*/
|
||||
def runJob[T, U: ClassManifest](
|
||||
def runJob[T, U: ClassTag](
|
||||
rdd: RDD[T],
|
||||
processPartition: Iterator[T] => U,
|
||||
resultHandler: (Int, U) => Unit)
|
||||
|
@ -888,7 +854,7 @@ class SparkContext(
|
|||
func: (TaskContext, Iterator[T]) => U,
|
||||
evaluator: ApproximateEvaluator[U, R],
|
||||
timeout: Long): PartialResult[R] = {
|
||||
val callSite = Utils.formatSparkCallSite
|
||||
val callSite = getCallSite
|
||||
logInfo("Starting job: " + callSite)
|
||||
val start = System.nanoTime
|
||||
val result = dagScheduler.runApproximateJob(rdd, func, evaluator, callSite, timeout,
|
||||
|
@ -908,7 +874,7 @@ class SparkContext(
|
|||
resultFunc: => R): SimpleFutureAction[R] =
|
||||
{
|
||||
val cleanF = clean(processPartition)
|
||||
val callSite = Utils.formatSparkCallSite
|
||||
val callSite = getCallSite
|
||||
val waiter = dagScheduler.submitJob(
|
||||
rdd,
|
||||
(context: TaskContext, iter: Iterator[T]) => cleanF(iter),
|
||||
|
@ -944,22 +910,15 @@ class SparkContext(
|
|||
|
||||
/**
|
||||
* Set the directory under which RDDs are going to be checkpointed. The directory must
|
||||
* be a HDFS path if running on a cluster. If the directory does not exist, it will
|
||||
* be created. If the directory exists and useExisting is set to true, then the
|
||||
* exisiting directory will be used. Otherwise an exception will be thrown to
|
||||
* prevent accidental overriding of checkpoint files in the existing directory.
|
||||
* be a HDFS path if running on a cluster.
|
||||
*/
|
||||
def setCheckpointDir(dir: String, useExisting: Boolean = false) {
|
||||
val path = new Path(dir)
|
||||
val fs = path.getFileSystem(SparkHadoopUtil.get.newConfiguration())
|
||||
if (!useExisting) {
|
||||
if (fs.exists(path)) {
|
||||
throw new Exception("Checkpoint directory '" + path + "' already exists.")
|
||||
} else {
|
||||
fs.mkdirs(path)
|
||||
}
|
||||
def setCheckpointDir(directory: String) {
|
||||
checkpointDir = Option(directory).map { dir =>
|
||||
val path = new Path(dir, UUID.randomUUID().toString)
|
||||
val fs = path.getFileSystem(hadoopConfiguration)
|
||||
fs.mkdirs(path)
|
||||
fs.getFileStatus(path).getPath().toString
|
||||
}
|
||||
checkpointDir = Some(dir)
|
||||
}
|
||||
|
||||
/** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */
|
||||
|
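Because the new setCheckpointDir creates a UUID-named subdirectory under the given path, repeated runs no longer collide and the useExisting flag goes away. A hedged usage sketch (not part of this commit; the local master and the /tmp path are illustrative, and a real cluster would use an HDFS path as the doc comment says):

import org.apache.spark.{SparkConf, SparkContext}

object CheckpointSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("Checkpoint sketch"))
    try {
      sc.setCheckpointDir("/tmp/spark-checkpoints")
      val rdd = sc.parallelize(1 to 10).map(_ * 2)
      rdd.checkpoint()          // marks the RDD for checkpointing
      println(rdd.count())      // running a job materializes the RDD and writes the checkpoint
    } finally {
      sc.stop()
    }
  }
}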
@ -1017,16 +976,16 @@ object SparkContext {
|
|||
|
||||
// TODO: Add AccumulatorParams for other types, e.g. lists and strings
|
||||
|
||||
implicit def rddToPairRDDFunctions[K: ClassManifest, V: ClassManifest](rdd: RDD[(K, V)]) =
|
||||
implicit def rddToPairRDDFunctions[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) =
|
||||
new PairRDDFunctions(rdd)
|
||||
|
||||
implicit def rddToAsyncRDDActions[T: ClassManifest](rdd: RDD[T]) = new AsyncRDDActions(rdd)
|
||||
implicit def rddToAsyncRDDActions[T: ClassTag](rdd: RDD[T]) = new AsyncRDDActions(rdd)
|
||||
|
||||
implicit def rddToSequenceFileRDDFunctions[K <% Writable: ClassManifest, V <% Writable: ClassManifest](
|
||||
implicit def rddToSequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable: ClassTag](
|
||||
rdd: RDD[(K, V)]) =
|
||||
new SequenceFileRDDFunctions(rdd)
|
||||
|
||||
implicit def rddToOrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest](
|
||||
implicit def rddToOrderedRDDFunctions[K <% Ordered[K]: ClassTag, V: ClassTag](
|
||||
rdd: RDD[(K, V)]) =
|
||||
new OrderedRDDFunctions[K, V, (K, V)](rdd)
|
||||
|
||||
|
@ -1051,16 +1010,16 @@ object SparkContext {
|
|||
|
||||
implicit def stringToText(s: String) = new Text(s)
|
||||
|
||||
private implicit def arrayToArrayWritable[T <% Writable: ClassManifest](arr: Traversable[T]): ArrayWritable = {
|
||||
private implicit def arrayToArrayWritable[T <% Writable: ClassTag](arr: Traversable[T]): ArrayWritable = {
|
||||
def anyToWritable[U <% Writable](u: U): Writable = u
|
||||
|
||||
new ArrayWritable(classManifest[T].erasure.asInstanceOf[Class[Writable]],
|
||||
new ArrayWritable(classTag[T].runtimeClass.asInstanceOf[Class[Writable]],
|
||||
arr.map(x => anyToWritable(x)).toArray)
|
||||
}
|
||||
|
||||
// Helper objects for converting common types to Writable
|
||||
private def simpleWritableConverter[T, W <: Writable: ClassManifest](convert: W => T) = {
|
||||
val wClass = classManifest[W].erasure.asInstanceOf[Class[W]]
|
||||
private def simpleWritableConverter[T, W <: Writable: ClassTag](convert: W => T) = {
|
||||
val wClass = classTag[W].runtimeClass.asInstanceOf[Class[W]]
|
||||
new WritableConverter[T](_ => wClass, x => convert(x.asInstanceOf[W]))
|
||||
}
|
||||
|
||||
|
@ -1079,11 +1038,11 @@ object SparkContext {
|
|||
implicit def stringWritableConverter() = simpleWritableConverter[String, Text](_.toString)
|
||||
|
||||
implicit def writableWritableConverter[T <: Writable]() =
|
||||
new WritableConverter[T](_.erasure.asInstanceOf[Class[T]], _.asInstanceOf[T])
|
||||
new WritableConverter[T](_.runtimeClass.asInstanceOf[Class[T]], _.asInstanceOf[T])
|
||||
|
||||
/**
|
||||
* Find the JAR from which a given class was loaded, to make it easy for users to pass
|
||||
* their JARs to SparkContext
|
||||
* their JARs to SparkContext.
|
||||
*/
|
||||
def jarOfClass(cls: Class[_]): Seq[String] = {
|
||||
val uri = cls.getResource("/" + cls.getName.replace('.', '/') + ".class")
|
||||
|
@ -1100,28 +1059,181 @@ object SparkContext {
|
|||
}
|
||||
}
|
||||
|
||||
/** Find the JAR that contains the class of a particular object */
|
||||
/**
|
||||
* Find the JAR that contains the class of a particular object, to make it easy for users
|
||||
* to pass their JARs to SparkContext. In most cases you can call jarOfObject(this) in
|
||||
* your driver program.
|
||||
*/
|
||||
def jarOfObject(obj: AnyRef): Seq[String] = jarOfClass(obj.getClass)
|
||||
|
||||
/** Get the amount of memory per executor requested through system properties or SPARK_MEM */
|
||||
private[spark] val executorMemoryRequested = {
|
||||
// TODO: Might need to add some extra memory for the non-heap parts of the JVM
|
||||
Option(System.getProperty("spark.executor.memory"))
|
||||
.orElse(Option(System.getenv("SPARK_MEM")))
|
||||
.map(Utils.memoryStringToMb)
|
||||
.getOrElse(512)
|
||||
/**
|
||||
* Creates a modified version of a SparkConf with the parameters that can be passed separately
|
||||
* to SparkContext, to make it easier to write SparkContext's constructors. This ignores
|
||||
* parameters that are passed as the default value of null, instead of throwing an exception
|
||||
* like SparkConf would.
|
||||
*/
|
||||
private def updatedConf(
|
||||
conf: SparkConf,
|
||||
master: String,
|
||||
appName: String,
|
||||
sparkHome: String = null,
|
||||
jars: Seq[String] = Nil,
|
||||
environment: Map[String, String] = Map()): SparkConf =
|
||||
{
|
||||
val res = conf.clone()
|
||||
res.setMaster(master)
|
||||
res.setAppName(appName)
|
||||
if (sparkHome != null) {
|
||||
res.setSparkHome(sparkHome)
|
||||
}
|
||||
if (!jars.isEmpty) {
|
||||
res.setJars(jars)
|
||||
}
|
||||
res.setExecutorEnv(environment.toSeq)
|
||||
res
|
||||
}
|
||||
|
||||
/** Creates a task scheduler based on a given master URL. Extracted for testing. */
|
||||
private def createTaskScheduler(sc: SparkContext, master: String, appName: String)
|
||||
: TaskScheduler =
|
||||
{
|
||||
// Regular expression used for local[N] master format
|
||||
val LOCAL_N_REGEX = """local\[([0-9]+)\]""".r
|
||||
// Regular expression for local[N, maxRetries], used in tests with failing tasks
|
||||
val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+)\s*,\s*([0-9]+)\]""".r
|
||||
// Regular expression for simulating a Spark cluster of [N, cores, memory] locally
|
||||
val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
|
||||
// Regular expression for connecting to Spark deploy clusters
|
||||
val SPARK_REGEX = """spark://(.*)""".r
|
||||
// Regular expression for connection to Mesos cluster by mesos:// or zk:// url
|
||||
val MESOS_REGEX = """(mesos|zk)://.*""".r
|
||||
// Regular expression for connection to Simr cluster
|
||||
val SIMR_REGEX = """simr://(.*)""".r
|
||||
|
||||
// When running locally, don't try to re-execute tasks on failure.
|
||||
val MAX_LOCAL_TASK_FAILURES = 1
|
||||
|
||||
master match {
|
||||
case "local" =>
|
||||
val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
|
||||
val backend = new LocalBackend(scheduler, 1)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case LOCAL_N_REGEX(threads) =>
|
||||
val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
|
||||
val backend = new LocalBackend(scheduler, threads.toInt)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
|
||||
val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
|
||||
val backend = new LocalBackend(scheduler, threads.toInt)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case SPARK_REGEX(sparkUrl) =>
|
||||
val scheduler = new TaskSchedulerImpl(sc)
|
||||
val masterUrls = sparkUrl.split(",").map("spark://" + _)
|
||||
val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls, appName)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
|
||||
// Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
|
||||
val memoryPerSlaveInt = memoryPerSlave.toInt
|
||||
if (sc.executorMemory > memoryPerSlaveInt) {
|
||||
throw new SparkException(
|
||||
"Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
|
||||
memoryPerSlaveInt, sc.executorMemory))
|
||||
}
|
||||
|
||||
val scheduler = new TaskSchedulerImpl(sc)
|
||||
val localCluster = new LocalSparkCluster(
|
||||
numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt)
|
||||
val masterUrls = localCluster.start()
|
||||
val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls, appName)
|
||||
scheduler.initialize(backend)
|
||||
backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => {
|
||||
localCluster.stop()
|
||||
}
|
||||
scheduler
|
||||
|
||||
case "yarn-standalone" =>
|
||||
val scheduler = try {
|
||||
val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
|
||||
val cons = clazz.getConstructor(classOf[SparkContext])
|
||||
cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
|
||||
} catch {
|
||||
// TODO: Enumerate the exact reasons why it can fail
|
||||
// But irrespective of it, it means we cannot proceed !
|
||||
case th: Throwable => {
|
||||
throw new SparkException("YARN mode not available ?", th)
|
||||
}
|
||||
}
|
||||
val backend = new CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case "yarn-client" =>
|
||||
val scheduler = try {
|
||||
val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClientClusterScheduler")
|
||||
val cons = clazz.getConstructor(classOf[SparkContext])
|
||||
cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
|
||||
|
||||
} catch {
|
||||
case th: Throwable => {
|
||||
throw new SparkException("YARN mode not available ?", th)
|
||||
}
|
||||
}
|
||||
|
||||
val backend = try {
|
||||
val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
|
||||
val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
|
||||
cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
|
||||
} catch {
|
||||
case th: Throwable => {
|
||||
throw new SparkException("YARN mode not available ?", th)
|
||||
}
|
||||
}
|
||||
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case mesosUrl @ MESOS_REGEX(_) =>
|
||||
MesosNativeLibrary.load()
|
||||
val scheduler = new TaskSchedulerImpl(sc)
|
||||
val coarseGrained = sc.conf.get("spark.mesos.coarse", "false").toBoolean
|
||||
val url = mesosUrl.stripPrefix("mesos://") // strip scheme from raw Mesos URLs
|
||||
val backend = if (coarseGrained) {
|
||||
new CoarseMesosSchedulerBackend(scheduler, sc, url, appName)
|
||||
} else {
|
||||
new MesosSchedulerBackend(scheduler, sc, url, appName)
|
||||
}
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case SIMR_REGEX(simrUrl) =>
|
||||
val scheduler = new TaskSchedulerImpl(sc)
|
||||
val backend = new SimrSchedulerBackend(scheduler, sc, simrUrl)
|
||||
scheduler.initialize(backend)
|
||||
scheduler
|
||||
|
||||
case _ =>
|
||||
throw new SparkException("Could not parse Master URL: '" + master + "'")
|
||||
}
|
||||
}
|
||||
}
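createTaskScheduler above dispatches purely on the shape of the master URL string. A reduced sketch of that pattern-matching step (only two of the regexes reproduced, and the descriptions are illustrative rather than the actual scheduler wiring):

object MasterUrlSketch {
  def main(args: Array[String]) {
    // Same shapes the regexes above recognise.
    val LOCAL_N_REGEX = """local\[([0-9]+)\]""".r
    val SPARK_REGEX = """spark://(.*)""".r

    def describe(master: String): String = master match {
      case "local"                => "single-threaded local mode"
      case LOCAL_N_REGEX(threads) => threads + " local threads"
      case SPARK_REGEX(hostPort)  => "standalone cluster at " + hostPort
      case _                      => "unrecognised master URL"
    }

    Seq("local", "local[4]", "spark://master:7077", "foo://bar").foreach { m =>
      println(m + " -> " + describe(m))
    }
  }
}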
|
||||
|
||||
/**
|
||||
* A class encapsulating how to convert some type T to Writable. It stores both the Writable class
|
||||
* corresponding to T (e.g. IntWritable for Int) and a function for doing the conversion.
|
||||
* The getter for the writable class takes a ClassManifest[T] in case this is a generic object
|
||||
* The getter for the writable class takes a ClassTag[T] in case this is a generic object
|
||||
* that doesn't know the type of T when it is created. This sounds strange but is necessary to
|
||||
* support converting subclasses of Writable to themselves (writableWritableConverter).
|
||||
*/
|
||||
private[spark] class WritableConverter[T](
|
||||
val writableClass: ClassManifest[T] => Class[_ <: Writable],
|
||||
val writableClass: ClassTag[T] => Class[_ <: Writable],
|
||||
val convert: Writable => T)
|
||||
extends Serializable
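The comment above notes that the writable-class getter now takes a ClassTag[T] where it used to take a ClassManifest[T]. A minimal sketch of what that buys generic code (not part of this commit; the helper name is hypothetical): a ClassTag lets a method recover the runtime class of its type parameter, which is all WritableConverter needs from it.

import scala.reflect.{ClassTag, classTag}

object ClassTagSketch {
  // Recover the erased runtime class of T, as the converter getter does.
  def runtimeClassOf[T: ClassTag]: Class[_] = classTag[T].runtimeClass

  def main(args: Array[String]) {
    println(runtimeClassOf[String])   // class java.lang.String
    println(runtimeClassOf[Int])      // int
  }
}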
|
||||
|
||||
|
|
|
@@ -17,11 +17,10 @@

package org.apache.spark

import collection.mutable
import serializer.Serializer
import scala.collection.mutable
import scala.concurrent.Await

import akka.actor.{Actor, ActorRef, Props, ActorSystemImpl, ActorSystem}
import akka.remote.RemoteActorRefProvider
import akka.actor._

import org.apache.spark.broadcast.BroadcastManager
import org.apache.spark.metrics.MetricsSystem

@@ -40,7 +39,7 @@ import com.google.common.collect.MapMaker
* objects needs to have the right SparkEnv set. You can get the current environment with
* SparkEnv.get (e.g. after creating a SparkContext) and set it with SparkEnv.set.
*/
class SparkEnv (
class SparkEnv private[spark] (
val executorId: String,
val actorSystem: ActorSystem,
val serializerManager: SerializerManager,

@@ -54,7 +53,8 @@ class SparkEnv (
val connectionManager: ConnectionManager,
val httpFileServer: HttpFileServer,
val sparkFilesDir: String,
val metricsSystem: MetricsSystem) {
val metricsSystem: MetricsSystem,
val conf: SparkConf) {

private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]()

@@ -62,7 +62,7 @@ class SparkEnv (
// (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats).
private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]()

def stop() {
private[spark] def stop() {
pythonWorkers.foreach { case(key, worker) => worker.stop() }
httpFileServer.stop()
mapOutputTracker.stop()

@@ -74,9 +74,11 @@ class SparkEnv (
actorSystem.shutdown()
// Unfortunately Akka's awaitTermination doesn't actually wait for the Netty server to shut
// down, but let's call it anyway in case it gets fixed in a later release
actorSystem.awaitTermination()
// UPDATE: In Akka 2.1.x, this hangs if there are remote actors, so we can't call it.
//actorSystem.awaitTermination()
}

private[spark]
def createPythonWorker(pythonExec: String, envVars: Map[String, String]): java.net.Socket = {
synchronized {
val key = (pythonExec, envVars)

@@ -105,33 +107,35 @@ object SparkEnv extends Logging {
/**
* Returns the ThreadLocal SparkEnv.
*/
def getThreadLocal : SparkEnv = {
def getThreadLocal: SparkEnv = {
env.get()
}

def createFromSystemProperties(
private[spark] def create(
conf: SparkConf,
executorId: String,
hostname: String,
port: Int,
isDriver: Boolean,
isLocal: Boolean): SparkEnv = {

val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", hostname, port)
val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", hostname, port,
conf = conf)

// Bit of a hack: If this is the driver and our port was 0 (meaning bind to any free port),
// figure out which port number Akka actually bound to and set spark.driver.port to it.
if (isDriver && port == 0) {
System.setProperty("spark.driver.port", boundPort.toString)
conf.set("spark.driver.port", boundPort.toString)
}

// set only if unset until now.
if (System.getProperty("spark.hostPort", null) == null) {
if (!conf.contains("spark.hostPort")) {
if (!isDriver){
// unexpected
Utils.logErrorWithStack("Unexpected NOT to have spark.hostPort set")
}
Utils.checkHost(hostname)
System.setProperty("spark.hostPort", hostname + ":" + boundPort)
conf.set("spark.hostPort", hostname + ":" + boundPort)
}

val classLoader = Thread.currentThread.getContextClassLoader

@@ -139,49 +143,51 @@ object SparkEnv extends Logging {
// Create an instance of the class named by the given Java system property, or by
// defaultClassName if the property is not set, and return it as a T
def instantiateClass[T](propertyName: String, defaultClassName: String): T = {
val name = System.getProperty(propertyName, defaultClassName)
val name = conf.get(propertyName, defaultClassName)
Class.forName(name, true, classLoader).newInstance().asInstanceOf[T]
}

val serializerManager = new SerializerManager

val serializer = serializerManager.setDefault(
System.getProperty("spark.serializer", "org.apache.spark.serializer.JavaSerializer"))
conf.get("spark.serializer", "org.apache.spark.serializer.JavaSerializer"), conf)

val closureSerializer = serializerManager.get(
System.getProperty("spark.closure.serializer", "org.apache.spark.serializer.JavaSerializer"))
conf.get("spark.closure.serializer", "org.apache.spark.serializer.JavaSerializer"),
conf)

def registerOrLookup(name: String, newActor: => Actor): ActorRef = {
if (isDriver) {
logInfo("Registering " + name)
actorSystem.actorOf(Props(newActor), name = name)
} else {
val driverHost: String = System.getProperty("spark.driver.host", "localhost")
val driverPort: Int = System.getProperty("spark.driver.port", "7077").toInt
val driverHost: String = conf.get("spark.driver.host", "localhost")
val driverPort: Int = conf.get("spark.driver.port", "7077").toInt
Utils.checkHost(driverHost, "Expected hostname")
val url = "akka://spark@%s:%s/user/%s".format(driverHost, driverPort, name)
logInfo("Connecting to " + name + ": " + url)
actorSystem.actorFor(url)
val url = s"akka.tcp://spark@$driverHost:$driverPort/user/$name"
val timeout = AkkaUtils.lookupTimeout(conf)
logInfo(s"Connecting to $name: $url")
Await.result(actorSystem.actorSelection(url).resolveOne(timeout), timeout)
}
}

val blockManagerMaster = new BlockManagerMaster(registerOrLookup(
"BlockManagerMaster",
new BlockManagerMasterActor(isLocal)))
val blockManager = new BlockManager(executorId, actorSystem, blockManagerMaster, serializer)
new BlockManagerMasterActor(isLocal, conf)), conf)
val blockManager = new BlockManager(executorId, actorSystem, blockManagerMaster, serializer, conf)

val connectionManager = blockManager.connectionManager

val broadcastManager = new BroadcastManager(isDriver)
val broadcastManager = new BroadcastManager(isDriver, conf)

val cacheManager = new CacheManager(blockManager)

// Have to assign trackerActor after initialization as MapOutputTrackerActor
// requires the MapOutputTracker itself
val mapOutputTracker = if (isDriver) {
new MapOutputTrackerMaster()
new MapOutputTrackerMaster(conf)
} else {
new MapOutputTracker()
new MapOutputTracker(conf)
}
mapOutputTracker.trackerActor = registerOrLookup(
"MapOutputTracker",

@@ -192,12 +198,12 @@ object SparkEnv extends Logging {

val httpFileServer = new HttpFileServer()
httpFileServer.initialize()
System.setProperty("spark.fileserver.uri", httpFileServer.serverUri)
conf.set("spark.fileserver.uri", httpFileServer.serverUri)

val metricsSystem = if (isDriver) {
MetricsSystem.createMetricsSystem("driver")
MetricsSystem.createMetricsSystem("driver", conf)
} else {
MetricsSystem.createMetricsSystem("executor")
MetricsSystem.createMetricsSystem("executor", conf)
}
metricsSystem.start()

@@ -211,7 +217,7 @@ object SparkEnv extends Logging {
}

// Warn about deprecated spark.cache.class property
if (System.getProperty("spark.cache.class") != null) {
if (conf.contains("spark.cache.class")) {
logWarning("The spark.cache.class property is no longer being used! Specify storage " +
"levels using the RDD.persist() method instead.")
}

@@ -230,6 +236,7 @@ object SparkEnv extends Logging {
connectionManager,
httpFileServer,
sparkFilesDir,
metricsSystem)
metricsSystem,
conf)
}
}

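Note: throughout this hunk, configuration reads and writes move from JVM-wide System properties to the per-application SparkConf passed into SparkEnv.create. A minimal hedged sketch of that pattern, using only the set/get/contains calls visible in the diff (key values are illustrative):

    import org.apache.spark.SparkConf

    val conf = new SparkConf()
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Reads fall back to an explicit default instead of consulting System.getProperty:
    val serializerName = conf.get("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    val hasHostPort = conf.contains("spark.hostPort")
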
@@ -127,10 +127,6 @@ class SparkHadoopWriter(@transient jobConf: JobConf)
cmtr.commitJob(getJobContext())
}

def cleanup() {
getOutputCommitter().cleanupJob(getJobContext())
}

// ********* Private Functions *********

private def getOutputFormat(): OutputFormat[AnyRef,AnyRef] = {

@@ -53,5 +53,3 @@ private[spark] case class ExceptionFailure(
private[spark] case object TaskResultLost extends TaskEndReason

private[spark] case object TaskKilled extends TaskEndReason

private[spark] case class OtherFailure(message: String) extends TaskEndReason

@@ -19,8 +19,7 @@ package org.apache.spark

import org.apache.mesos.Protos.{TaskState => MesosTaskState}

private[spark] object TaskState
extends Enumeration("LAUNCHING", "RUNNING", "FINISHED", "FAILED", "KILLED", "LOST") {
private[spark] object TaskState extends Enumeration {

val LAUNCHING, RUNNING, FINISHED, FAILED, KILLED, LOST = Value

@@ -17,18 +17,23 @@

package org.apache.spark.api.java

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext.doubleRDDToDoubleRDDFunctions
import org.apache.spark.api.java.function.{Function => JFunction}
import org.apache.spark.util.StatCounter
import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.storage.StorageLevel

import java.lang.Double
import org.apache.spark.Partitioner

import scala.collection.JavaConverters._

class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, JavaDoubleRDD] {

override val classManifest: ClassManifest[Double] = implicitly[ClassManifest[Double]]
override val classTag: ClassTag[Double] = implicitly[ClassTag[Double]]

override val rdd: RDD[Double] = srdd.map(x => Double.valueOf(x))

@@ -42,7 +47,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
def cache(): JavaDoubleRDD = fromRDD(srdd.cache())

/**
/**
* Set this RDD's storage level to persist its values across operations after the first time
* it is computed. Can only be called once on each RDD.
*/

@@ -106,7 +111,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav

/**
* Return an RDD with the elements from `this` that are not in `other`.
*
*
* Uses `this` partitioner/partition size, because even if `other` is huge, the resulting
* RDD will be <= us.
*/

@@ -182,6 +187,44 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav

/** (Experimental) Approximate operation to return the sum within a timeout. */
def sumApprox(timeout: Long): PartialResult[BoundedDouble] = srdd.sumApprox(timeout)

/**
* Compute a histogram of the data using bucketCount number of buckets evenly
* spaced between the minimum and maximum of the RDD. For example if the min
* value is 0 and the max is 100 and there are two buckets the resulting
* buckets will be [0,50) [50,100]. bucketCount must be at least 1
* If the RDD contains infinity, NaN throws an exception
* If the elements in RDD do not vary (max == min) always returns a single bucket.
*/
def histogram(bucketCount: Int): Pair[Array[scala.Double], Array[Long]] = {
val result = srdd.histogram(bucketCount)
(result._1, result._2)
}

/**
* Compute a histogram using the provided buckets. The buckets are all open
* to the left except for the last which is closed
* e.g. for the array
* [1,10,20,50] the buckets are [1,10) [10,20) [20,50]
* e.g 1<=x<10 , 10<=x<20, 20<=x<50
* And on the input of 1 and 50 we would have a histogram of 1,0,0
*
* Note: if your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched
* from an O(log n) inseration to O(1) per element. (where n = # buckets) if you set evenBuckets
* to true.
* buckets must be sorted and not contain any duplicates.
* buckets array must be at least two elements
* All NaN entries are treated the same. If you have a NaN bucket it must be
* the maximum value of the last position and all NaN entries will be counted
* in that bucket.
*/
def histogram(buckets: Array[scala.Double]): Array[Long] = {
srdd.histogram(buckets, false)
}

def histogram(buckets: Array[Double], evenBuckets: Boolean): Array[Long] = {
srdd.histogram(buckets.map(_.toDouble), evenBuckets)
}
}

object JavaDoubleRDD {

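Note: the histogram additions above mirror the Scala DoubleRDDFunctions API. A hedged usage sketch from the Scala side (the data values, bucket edges, and the sc SparkContext are illustrative only):

    val data = sc.parallelize(Seq(1.0, 2.5, 15.0, 49.9))
    // Two evenly spaced buckets between the RDD's min and max:
    val (edges, counts) = data.histogram(2)
    // Explicit buckets [1,10) [10,20) [20,50]; evenBuckets = false does a per-element binary search.
    val byBucket = data.histogram(Array(1.0, 10.0, 20.0, 50.0), evenBuckets = false)
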
@@ -22,6 +22,7 @@ import java.util.Comparator

import scala.Tuple2
import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import com.google.common.base.Optional
import org.apache.hadoop.io.compress.CompressionCodec

@@ -43,13 +44,13 @@ import org.apache.spark.rdd.OrderedRDDFunctions
import org.apache.spark.storage.StorageLevel


class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManifest[K],
implicit val vManifest: ClassManifest[V]) extends JavaRDDLike[(K, V), JavaPairRDD[K, V]] {
class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K],
implicit val vClassTag: ClassTag[V]) extends JavaRDDLike[(K, V), JavaPairRDD[K, V]] {

override def wrapRDD(rdd: RDD[(K, V)]): JavaPairRDD[K, V] = JavaPairRDD.fromRDD(rdd)

override val classManifest: ClassManifest[(K, V)] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K, V]]]
override val classTag: ClassTag[(K, V)] =
implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[Tuple2[K, V]]]

import JavaPairRDD._

@@ -58,7 +59,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
def cache(): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.cache())

/**
/**
* Set this RDD's storage level to persist its values across operations after the first time
* it is computed. Can only be called once on each RDD.
*/

@@ -138,14 +139,14 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
override def first(): (K, V) = rdd.first()

// Pair RDD functions


/**
* Generic function to combine the elements for each key using a custom set of aggregation
* functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
* "combined type" C * Note that V and C can be different -- for example, one might group an
* RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
* Generic function to combine the elements for each key using a custom set of aggregation
* functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a
* "combined type" C * Note that V and C can be different -- for example, one might group an
* RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three
* functions:
*
*
* - `createCombiner`, which turns a V into a C (e.g., creates a one-element list)
* - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list)
* - `mergeCombiners`, to combine two C's into a single one.

@@ -157,8 +158,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
mergeValue: JFunction2[C, V, C],
mergeCombiners: JFunction2[C, C, C],
partitioner: Partitioner): JavaPairRDD[K, C] = {
implicit val cm: ClassManifest[C] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[C]]
implicit val cm: ClassTag[C] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[C]]
fromRDD(rdd.combineByKey(
createCombiner,
mergeValue,

@@ -195,14 +195,14 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
/** Count the number of elements for each key, and return the result to the master as a Map. */
def countByKey(): java.util.Map[K, Long] = mapAsJavaMap(rdd.countByKey())

/**
/**
* (Experimental) Approximate version of countByKey that can return a partial result if it does
* not finish within a timeout.
*/
def countByKeyApprox(timeout: Long): PartialResult[java.util.Map[K, BoundedDouble]] =
rdd.countByKeyApprox(timeout).map(mapAsJavaMap)

/**
/**
* (Experimental) Approximate version of countByKey that can return a partial result if it does
* not finish within a timeout.
*/

@@ -258,7 +258,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif

/**
* Return an RDD with the elements from `this` that are not in `other`.
*
*
* Uses `this` partitioner/partition size, because even if `other` is huge, the resulting
* RDD will be <= us.
*/

@@ -315,15 +315,14 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
fromRDD(joinResult.mapValues{case (v, w) => (JavaUtils.optionToOptional(v), w)})
}

/**
/**
* Simplified version of combineByKey that hash-partitions the resulting RDD using the existing
* partitioner/parallelism level.
*/
def combineByKey[C](createCombiner: JFunction[V, C],
mergeValue: JFunction2[C, V, C],
mergeCombiners: JFunction2[C, C, C]): JavaPairRDD[K, C] = {
implicit val cm: ClassManifest[C] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[C]]
implicit val cm: ClassTag[C] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[C]]
fromRDD(combineByKey(createCombiner, mergeValue, mergeCombiners, defaultPartitioner(rdd)))
}


@@ -414,8 +413,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
* this also retains the original RDD's partitioning.
*/
def mapValues[U](f: JFunction[V, U]): JavaPairRDD[K, U] = {
implicit val cm: ClassManifest[U] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[U]]
implicit val cm: ClassTag[U] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[U]]
fromRDD(rdd.mapValues(f))
}


@@ -426,8 +424,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
def flatMapValues[U](f: JFunction[V, java.lang.Iterable[U]]): JavaPairRDD[K, U] = {
import scala.collection.JavaConverters._
def fn = (x: V) => f.apply(x).asScala
implicit val cm: ClassManifest[U] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[U]]
implicit val cm: ClassTag[U] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[U]]
fromRDD(rdd.flatMapValues(fn))
}


@@ -591,6 +588,20 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending))
}

/**
* Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
* `collect` or `save` on the resulting RDD will return or output an ordered list of records
* (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
* order of the keys).
*/
def sortByKey(comp: Comparator[K], ascending: Boolean, numPartitions: Int): JavaPairRDD[K, V] = {
class KeyOrdering(val a: K) extends Ordered[K] {
override def compare(b: K) = comp.compare(a, b)
}
implicit def toOrdered(x: K): Ordered[K] = new KeyOrdering(x)
fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions))
}

/**
* Return an RDD with the keys of each tuple.
*/

@@ -600,25 +611,61 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
* Return an RDD with the values of each tuple.
*/
def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2))

/**
* Return approximate number of distinct values for each key in this RDD.
* The accuracy of approximation can be controlled through the relative standard deviation
* (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
* more accurate counts but increase the memory footprint and vise versa. Uses the provided
* Partitioner to partition the output RDD.
*/
def countApproxDistinctByKey(relativeSD: Double, partitioner: Partitioner): JavaRDD[(K, Long)] = {
rdd.countApproxDistinctByKey(relativeSD, partitioner)
}

/**
* Return approximate number of distinct values for each key this RDD.
* The accuracy of approximation can be controlled through the relative standard deviation
* (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
* more accurate counts but increase the memory footprint and vise versa. The default value of
* relativeSD is 0.05. Hash-partitions the output RDD using the existing partitioner/parallelism
* level.
*/
def countApproxDistinctByKey(relativeSD: Double = 0.05): JavaRDD[(K, Long)] = {
rdd.countApproxDistinctByKey(relativeSD)
}


/**
* Return approximate number of distinct values for each key in this RDD.
* The accuracy of approximation can be controlled through the relative standard deviation
* (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
* more accurate counts but increase the memory footprint and vise versa. HashPartitions the
* output RDD into numPartitions.
*
*/
def countApproxDistinctByKey(relativeSD: Double, numPartitions: Int): JavaRDD[(K, Long)] = {
rdd.countApproxDistinctByKey(relativeSD, numPartitions)
}
}

object JavaPairRDD {
def groupByResultToJava[K, T](rdd: RDD[(K, Seq[T])])(implicit kcm: ClassManifest[K],
vcm: ClassManifest[T]): RDD[(K, JList[T])] =
def groupByResultToJava[K, T](rdd: RDD[(K, Seq[T])])(implicit kcm: ClassTag[K],
vcm: ClassTag[T]): RDD[(K, JList[T])] =
rddToPairRDDFunctions(rdd).mapValues(seqAsJavaList _)

def cogroupResultToJava[W, K, V](rdd: RDD[(K, (Seq[V], Seq[W]))])(implicit kcm: ClassManifest[K],
vcm: ClassManifest[V]): RDD[(K, (JList[V], JList[W]))] = rddToPairRDDFunctions(rdd).mapValues((x: (Seq[V],
Seq[W])) => (seqAsJavaList(x._1), seqAsJavaList(x._2)))
def cogroupResultToJava[W, K, V](rdd: RDD[(K, (Seq[V], Seq[W]))])(implicit kcm: ClassTag[K],
vcm: ClassTag[V]): RDD[(K, (JList[V], JList[W]))] = rddToPairRDDFunctions(rdd)
.mapValues((x: (Seq[V], Seq[W])) => (seqAsJavaList(x._1), seqAsJavaList(x._2)))

def cogroupResult2ToJava[W1, W2, K, V](rdd: RDD[(K, (Seq[V], Seq[W1],
Seq[W2]))])(implicit kcm: ClassManifest[K]) : RDD[(K, (JList[V], JList[W1],
Seq[W2]))])(implicit kcm: ClassTag[K]) : RDD[(K, (JList[V], JList[W1],
JList[W2]))] = rddToPairRDDFunctions(rdd).mapValues(
(x: (Seq[V], Seq[W1], Seq[W2])) => (seqAsJavaList(x._1),
seqAsJavaList(x._2),
seqAsJavaList(x._3)))

def fromRDD[K: ClassManifest, V: ClassManifest](rdd: RDD[(K, V)]): JavaPairRDD[K, V] =
def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] =
new JavaPairRDD[K, V](rdd)

implicit def toRDD[K, V](rdd: JavaPairRDD[K, V]): RDD[(K, V)] = rdd.rdd

@@ -626,10 +673,8 @@ object JavaPairRDD {

/** Convert a JavaRDD of key-value pairs to JavaPairRDD. */
def fromJavaRDD[K, V](rdd: JavaRDD[(K, V)]): JavaPairRDD[K, V] = {
implicit val cmk: ClassManifest[K] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]]
implicit val cmv: ClassManifest[V] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V]]
implicit val cmk: ClassTag[K] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[K]]
implicit val cmv: ClassTag[V] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
new JavaPairRDD[K, V](rdd.rdd)
}

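Note: the new Comparator-based sortByKey overload above adapts a java.util.Comparator into Scala's Ordered so OrderedRDDFunctions can consume it. A hedged standalone sketch of that adapter pattern (only the idea mirrors the diff; the function name is illustrative):

    import java.util.Comparator

    // Wrap each key so compare() delegates to the user-supplied Comparator.
    def toOrderedAdapter[K](comp: Comparator[K]): K => Ordered[K] =
      (a: K) => new Ordered[K] {
        override def compare(b: K): Int = comp.compare(a, b)
      }
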
@@ -17,12 +17,14 @@

package org.apache.spark.api.java

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.function.{Function => JFunction}
import org.apache.spark.storage.StorageLevel

class JavaRDD[T](val rdd: RDD[T])(implicit val classManifest: ClassManifest[T]) extends
class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) extends
JavaRDDLike[T, JavaRDD[T]] {

override def wrapRDD(rdd: RDD[T]): JavaRDD[T] = JavaRDD.fromRDD(rdd)

@@ -123,12 +125,13 @@ JavaRDDLike[T, JavaRDD[T]] {
*/
def subtract(other: JavaRDD[T], p: Partitioner): JavaRDD[T] =
wrapRDD(rdd.subtract(other, p))

override def toString = rdd.toString
}

object JavaRDD {

implicit def fromRDD[T: ClassManifest](rdd: RDD[T]): JavaRDD[T] = new JavaRDD[T](rdd)
implicit def fromRDD[T: ClassTag](rdd: RDD[T]): JavaRDD[T] = new JavaRDD[T](rdd)

implicit def toRDD[T](rdd: JavaRDD[T]): RDD[T] = rdd.rdd
}


@@ -20,6 +20,7 @@ package org.apache.spark.api.java
import java.util.{List => JList, Comparator}
import scala.Tuple2
import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import com.google.common.base.Optional
import org.apache.hadoop.io.compress.CompressionCodec

@@ -35,7 +36,7 @@ import org.apache.spark.storage.StorageLevel
trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
def wrapRDD(rdd: RDD[T]): This

implicit val classManifest: ClassManifest[T]
implicit val classTag: ClassTag[T]

def rdd: RDD[T]


@@ -71,7 +72,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* Return a new RDD by applying a function to each partition of this RDD, while tracking the index
* of the original partition.
*/
def mapPartitionsWithIndex[R: ClassManifest](
def mapPartitionsWithIndex[R: ClassTag](
f: JFunction2[Int, java.util.Iterator[T], java.util.Iterator[R]],
preservesPartitioning: Boolean = false): JavaRDD[R] =
new JavaRDD(rdd.mapPartitionsWithIndex(((a,b) => f(a,asJavaIterator(b))),

@@ -87,7 +88,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* Return a new RDD by applying a function to all elements of this RDD.
*/
def map[K2, V2](f: PairFunction[T, K2, V2]): JavaPairRDD[K2, V2] = {
def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K2, V2]]]
def cm = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[Tuple2[K2, V2]]]
new JavaPairRDD(rdd.map(f)(cm))(f.keyType(), f.valueType())
}


@@ -118,7 +119,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
def flatMap[K2, V2](f: PairFlatMapFunction[T, K2, V2]): JavaPairRDD[K2, V2] = {
import scala.collection.JavaConverters._
def fn = (x: T) => f.apply(x).asScala
def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K2, V2]]]
def cm = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[Tuple2[K2, V2]]]
JavaPairRDD.fromRDD(rdd.flatMap(fn)(cm))(f.keyType(), f.valueType())
}


@@ -158,18 +159,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* elements (a, b) where a is in `this` and b is in `other`.
*/
def cartesian[U](other: JavaRDDLike[U, _]): JavaPairRDD[T, U] =
JavaPairRDD.fromRDD(rdd.cartesian(other.rdd)(other.classManifest))(classManifest,
other.classManifest)
JavaPairRDD.fromRDD(rdd.cartesian(other.rdd)(other.classTag))(classTag, other.classTag)

/**
* Return an RDD of grouped elements. Each group consists of a key and a sequence of elements
* mapping to that key.
*/
def groupBy[K](f: JFunction[T, K]): JavaPairRDD[K, JList[T]] = {
implicit val kcm: ClassManifest[K] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]]
implicit val vcm: ClassManifest[JList[T]] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[JList[T]]]
implicit val kcm: ClassTag[K] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[K]]
implicit val vcm: ClassTag[JList[T]] =
implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[JList[T]]]
JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f)(f.returnType)))(kcm, vcm)
}


@@ -178,10 +177,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* mapping to that key.
*/
def groupBy[K](f: JFunction[T, K], numPartitions: Int): JavaPairRDD[K, JList[T]] = {
implicit val kcm: ClassManifest[K] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]]
implicit val vcm: ClassManifest[JList[T]] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[JList[T]]]
implicit val kcm: ClassTag[K] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[K]]
implicit val vcm: ClassTag[JList[T]] =
implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[JList[T]]]
JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numPartitions)(f.returnType)))(kcm, vcm)
}


@@ -209,7 +207,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* a map on the other).
*/
def zip[U](other: JavaRDDLike[U, _]): JavaPairRDD[T, U] = {
JavaPairRDD.fromRDD(rdd.zip(other.rdd)(other.classManifest))(classManifest, other.classManifest)
JavaPairRDD.fromRDD(rdd.zip(other.rdd)(other.classTag))(classTag, other.classTag)
}

/**

@@ -224,7 +222,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
def fn = (x: Iterator[T], y: Iterator[U]) => asScalaIterator(
f.apply(asJavaIterator(x), asJavaIterator(y)).iterator())
JavaRDD.fromRDD(
rdd.zipPartitions(other.rdd)(fn)(other.classManifest, f.elementType()))(f.elementType())
rdd.zipPartitions(other.rdd)(fn)(other.classTag, f.elementType()))(f.elementType())
}

// Actions (launch a job to return a value to the user program)

@@ -246,6 +244,17 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
new java.util.ArrayList(arr)
}

/**
* Return an array that contains all of the elements in a specific partition of this RDD.
*/
def collectPartitions(partitionIds: Array[Int]): Array[JList[T]] = {
// This is useful for implementing `take` from other language frontends
// like Python where the data is serialized.
import scala.collection.JavaConversions._
val res = context.runJob(rdd, (it: Iterator[T]) => it.toArray, partitionIds, true)
res.map(x => new java.util.ArrayList(x.toSeq)).toArray
}

/**
* Reduces the elements of this RDD using the specified commutative and associative binary operator.
*/

@@ -356,7 +365,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* Creates tuples of the elements in this RDD by applying `f`.
*/
def keyBy[K](f: JFunction[T, K]): JavaPairRDD[K, T] = {
implicit val kcm: ClassManifest[K] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]]
implicit val kcm: ClassTag[K] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[K]]
JavaPairRDD.fromRDD(rdd.keyBy(f))
}


@@ -435,4 +444,15 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[T]]
takeOrdered(num, comp)
}

/**
* Return approximate number of distinct elements in the RDD.
*
* The accuracy of approximation can be controlled through the relative standard deviation
* (relativeSD) parameter, which also controls the amount of memory used. Lower values result in
* more accurate counts but increase the memory footprint and vise versa. The default value of
* relativeSD is 0.05.
*/
def countApproxDistinct(relativeSD: Double = 0.05): Long = rdd.countApproxDistinct(relativeSD)

}

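Note: collectPartitions and countApproxDistinct are the new actions in this hunk. A hedged Scala-side sketch of what they do under the hood (the RDD contents and the sc SparkContext are illustrative; collectPartitions delegates to runJob on a subset of partitions):

    val rdd = sc.parallelize(1 to 100, 4)
    // Approximate distinct count; relativeSD trades accuracy against memory.
    val approx = rdd.countApproxDistinct(relativeSD = 0.05)
    // Fetch only the first two partitions, the same way collectPartitions serves take()
    // for the Java and Python frontends.
    val firstTwo = sc.runJob(rdd, (it: Iterator[Int]) => it.toArray, Seq(0, 1), allowLocal = true)
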
@@ -21,6 +21,7 @@ import java.util.{Map => JMap}

import scala.collection.JavaConversions
import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.InputFormat

@@ -28,17 +29,22 @@ import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
import com.google.common.base.Optional

import org.apache.spark.{Accumulable, AccumulableParam, Accumulator, AccumulatorParam, SparkContext}
import org.apache.spark._
import org.apache.spark.SparkContext.IntAccumulatorParam
import org.apache.spark.SparkContext.DoubleAccumulatorParam
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import scala.Tuple2

/**
* A Java-friendly version of [[org.apache.spark.SparkContext]] that returns [[org.apache.spark.api.java.JavaRDD]]s and
* works with Java collections instead of Scala ones.
*/
class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWorkaround {
/**
* @param conf a [[org.apache.spark.SparkConf]] object specifying Spark parameters
*/
def this(conf: SparkConf) = this(new SparkContext(conf))

/**
* @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).

@@ -46,6 +52,14 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
*/
def this(master: String, appName: String) = this(new SparkContext(master, appName))

/**
* @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
* @param appName A name for your application, to display on the cluster web UI
* @param conf a [[org.apache.spark.SparkConf]] object specifying other Spark parameters
*/
def this(master: String, appName: String, conf: SparkConf) =
this(conf.setMaster(master).setAppName(appName))

/**
* @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
* @param appName A name for your application, to display on the cluster web UI

@@ -82,8 +96,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork

/** Distribute a local Scala collection to form an RDD. */
def parallelize[T](list: java.util.List[T], numSlices: Int): JavaRDD[T] = {
implicit val cm: ClassManifest[T] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
sc.parallelize(JavaConversions.asScalaBuffer(list), numSlices)
}


@@ -94,10 +107,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
/** Distribute a local Scala collection to form an RDD. */
def parallelizePairs[K, V](list: java.util.List[Tuple2[K, V]], numSlices: Int)
: JavaPairRDD[K, V] = {
implicit val kcm: ClassManifest[K] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]]
implicit val vcm: ClassManifest[V] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V]]
implicit val kcm: ClassTag[K] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[K]]
implicit val vcm: ClassTag[V] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
JavaPairRDD.fromRDD(sc.parallelize(JavaConversions.asScalaBuffer(list), numSlices))
}


@@ -132,16 +143,16 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
valueClass: Class[V],
minSplits: Int
): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(keyClass)
implicit val vcm = ClassManifest.fromClass(valueClass)
implicit val kcm: ClassTag[K] = ClassTag(keyClass)
implicit val vcm: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.sequenceFile(path, keyClass, valueClass, minSplits))
}

/**Get an RDD for a Hadoop SequenceFile. */
def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]):
JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(keyClass)
implicit val vcm = ClassManifest.fromClass(valueClass)
implicit val kcm: ClassTag[K] = ClassTag(keyClass)
implicit val vcm: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.sequenceFile(path, keyClass, valueClass))
}


@@ -153,8 +164,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
* that there's very little effort required to save arbitrary objects.
*/
def objectFile[T](path: String, minSplits: Int): JavaRDD[T] = {
implicit val cm: ClassManifest[T] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
sc.objectFile(path, minSplits)(cm)
}


@@ -166,8 +176,7 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
* that there's very little effort required to save arbitrary objects.
*/
def objectFile[T](path: String): JavaRDD[T] = {
implicit val cm: ClassManifest[T] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
implicit val cm: ClassTag[T] = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
sc.objectFile(path)(cm)
}


@@ -183,8 +192,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
valueClass: Class[V],
minSplits: Int
): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(keyClass)
implicit val vcm = ClassManifest.fromClass(valueClass)
implicit val kcm: ClassTag[K] = ClassTag(keyClass)
implicit val vcm: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minSplits))
}


@@ -199,8 +208,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
keyClass: Class[K],
valueClass: Class[V]
): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(keyClass)
implicit val vcm = ClassManifest.fromClass(valueClass)
implicit val kcm: ClassTag[K] = ClassTag(keyClass)
implicit val vcm: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass))
}


@@ -212,8 +221,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
valueClass: Class[V],
minSplits: Int
): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(keyClass)
implicit val vcm = ClassManifest.fromClass(valueClass)
implicit val kcm: ClassTag[K] = ClassTag(keyClass)
implicit val vcm: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minSplits))
}


@@ -224,8 +233,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
keyClass: Class[K],
valueClass: Class[V]
): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(keyClass)
implicit val vcm = ClassManifest.fromClass(valueClass)
implicit val kcm: ClassTag[K] = ClassTag(keyClass)
implicit val vcm: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopFile(path,
inputFormatClass, keyClass, valueClass))
}

@@ -240,8 +249,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
kClass: Class[K],
vClass: Class[V],
conf: Configuration): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(kClass)
implicit val vcm = ClassManifest.fromClass(vClass)
implicit val kcm: ClassTag[K] = ClassTag(kClass)
implicit val vcm: ClassTag[V] = ClassTag(vClass)
new JavaPairRDD(sc.newAPIHadoopFile(path, fClass, kClass, vClass, conf))
}


@@ -254,15 +263,15 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
fClass: Class[F],
kClass: Class[K],
vClass: Class[V]): JavaPairRDD[K, V] = {
implicit val kcm = ClassManifest.fromClass(kClass)
implicit val vcm = ClassManifest.fromClass(vClass)
implicit val kcm: ClassTag[K] = ClassTag(kClass)
implicit val vcm: ClassTag[V] = ClassTag(vClass)
new JavaPairRDD(sc.newAPIHadoopRDD(conf, fClass, kClass, vClass))
}

/** Build the union of two or more RDDs. */
override def union[T](first: JavaRDD[T], rest: java.util.List[JavaRDD[T]]): JavaRDD[T] = {
val rdds: Seq[RDD[T]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.rdd)
implicit val cm: ClassManifest[T] = first.classManifest
implicit val cm: ClassTag[T] = first.classTag
sc.union(rdds)(cm)
}


@@ -270,9 +279,9 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
override def union[K, V](first: JavaPairRDD[K, V], rest: java.util.List[JavaPairRDD[K, V]])
: JavaPairRDD[K, V] = {
val rdds: Seq[RDD[(K, V)]] = (Seq(first) ++ asScalaBuffer(rest)).map(_.rdd)
implicit val cm: ClassManifest[(K, V)] = first.classManifest
implicit val kcm: ClassManifest[K] = first.kManifest
implicit val vcm: ClassManifest[V] = first.vManifest
implicit val cm: ClassTag[(K, V)] = first.classTag
implicit val kcm: ClassTag[K] = first.kClassTag
implicit val vcm: ClassTag[V] = first.vClassTag
new JavaPairRDD(sc.union(rdds)(cm))(kcm, vcm)
}


@@ -385,34 +394,47 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork

/**
* Set the directory under which RDDs are going to be checkpointed. The directory must
* be a HDFS path if running on a cluster. If the directory does not exist, it will
* be created. If the directory exists and useExisting is set to true, then the
* exisiting directory will be used. Otherwise an exception will be thrown to
* prevent accidental overriding of checkpoint files in the existing directory.
*/
def setCheckpointDir(dir: String, useExisting: Boolean) {
sc.setCheckpointDir(dir, useExisting)
}

/**
* Set the directory under which RDDs are going to be checkpointed. The directory must
* be a HDFS path if running on a cluster. If the directory does not exist, it will
* be created. If the directory exists, an exception will be thrown to prevent accidental
* overriding of checkpoint files.
* be a HDFS path if running on a cluster.
*/
def setCheckpointDir(dir: String) {
sc.setCheckpointDir(dir)
}

protected def checkpointFile[T](path: String): JavaRDD[T] = {
implicit val cm: ClassManifest[T] =
implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
implicit val cm: ClassTag[T] =
implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[T]]
new JavaRDD(sc.checkpointFile(path))
}

/**
* Return a copy of this JavaSparkContext's configuration. The configuration ''cannot'' be
* changed at runtime.
*/
def getConf: SparkConf = sc.getConf

/**
* Pass-through to SparkContext.setCallSite. For API support only.
*/
def setCallSite(site: String) {
sc.setCallSite(site)
}

/**
* Pass-through to SparkContext.setCallSite. For API support only.
*/
def clearCallSite() {
sc.clearCallSite()
}
}

object JavaSparkContext {
implicit def fromSparkContext(sc: SparkContext): JavaSparkContext = new JavaSparkContext(sc)

implicit def toSparkContext(jsc: JavaSparkContext): SparkContext = jsc.sc

/**
* Find the JAR from which a given class was loaded, to make it easy for users to pass
* their JARs to SparkContext.
*/
def jarOfClass(cls: Class[_]) = SparkContext.jarOfClass(cls).toArray
}

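Note: the constructors added above let callers drive everything through SparkConf instead of master/appName strings. A hedged usage sketch combining the new pieces from this hunk (master URL, app name, and checkpoint path are placeholders):

    import org.apache.spark.SparkConf
    import org.apache.spark.api.java.JavaSparkContext

    val conf = new SparkConf().setMaster("local[4]").setAppName("ExampleApp")
    val jsc = new JavaSparkContext(conf)       // conf-based constructor introduced in this diff
    jsc.setCheckpointDir("/tmp/checkpoints")   // simplified overload, no useExisting flag
    val cfg = jsc.getConf                      // read-only copy of the configuration
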
@@ -17,7 +17,6 @@

package org.apache.spark.api.java;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;


@@ -17,9 +17,11 @@

package org.apache.spark.api.java.function

import scala.reflect.ClassTag

/**
* A function that returns zero or more output records from each input record.
*/
abstract class FlatMapFunction[T, R] extends Function[T, java.lang.Iterable[R]] {
def elementType() : ClassManifest[R] = ClassManifest.Any.asInstanceOf[ClassManifest[R]]
def elementType(): ClassTag[R] = ClassTag.Any.asInstanceOf[ClassTag[R]]
}

@@ -17,9 +17,11 @@

package org.apache.spark.api.java.function

import scala.reflect.ClassTag

/**
* A function that takes two inputs and returns zero or more output records.
*/
abstract class FlatMapFunction2[A, B, C] extends Function2[A, B, java.lang.Iterable[C]] {
def elementType() : ClassManifest[C] = ClassManifest.Any.asInstanceOf[ClassManifest[C]]
def elementType() : ClassTag[C] = ClassTag.Any.asInstanceOf[ClassTag[C]]
}

@@ -17,8 +17,8 @@

package org.apache.spark.api.java.function;

import scala.reflect.ClassManifest;
import scala.reflect.ClassManifest$;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.io.Serializable;


@@ -29,8 +29,8 @@ import java.io.Serializable;
* when mapping RDDs of other types.
*/
public abstract class Function<T, R> extends WrappedFunction1<T, R> implements Serializable {
public ClassManifest<R> returnType() {
return (ClassManifest<R>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<R> returnType() {
return ClassTag$.MODULE$.apply(Object.class);
}
}

@@ -17,8 +17,8 @@

package org.apache.spark.api.java.function;

import scala.reflect.ClassManifest;
import scala.reflect.ClassManifest$;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.io.Serializable;


@@ -28,8 +28,8 @@ import java.io.Serializable;
public abstract class Function2<T1, T2, R> extends WrappedFunction2<T1, T2, R>
implements Serializable {

public ClassManifest<R> returnType() {
return (ClassManifest<R>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<R> returnType() {
return (ClassTag<R>) ClassTag$.MODULE$.apply(Object.class);
}
}

@@ -17,8 +17,8 @@

package org.apache.spark.api.java.function;

import scala.reflect.ClassManifest;
import scala.reflect.ClassManifest$;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import scala.runtime.AbstractFunction2;

import java.io.Serializable;

@@ -29,8 +29,8 @@ import java.io.Serializable;
public abstract class Function3<T1, T2, T3, R> extends WrappedFunction3<T1, T2, T3, R>
implements Serializable {

public ClassManifest<R> returnType() {
return (ClassManifest<R>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<R> returnType() {
return (ClassTag<R>) ClassTag$.MODULE$.apply(Object.class);
}
}

@@ -18,8 +18,8 @@
package org.apache.spark.api.java.function;

import scala.Tuple2;
import scala.reflect.ClassManifest;
import scala.reflect.ClassManifest$;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.io.Serializable;


@@ -33,11 +33,11 @@ public abstract class PairFlatMapFunction<T, K, V>
extends WrappedFunction1<T, Iterable<Tuple2<K, V>>>
implements Serializable {

public ClassManifest<K> keyType() {
return (ClassManifest<K>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<K> keyType() {
return (ClassTag<K>) ClassTag$.MODULE$.apply(Object.class);
}

public ClassManifest<V> valueType() {
return (ClassManifest<V>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<V> valueType() {
return (ClassTag<V>) ClassTag$.MODULE$.apply(Object.class);
}
}

@@ -18,8 +18,8 @@
package org.apache.spark.api.java.function;

import scala.Tuple2;
import scala.reflect.ClassManifest;
import scala.reflect.ClassManifest$;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.io.Serializable;


@@ -31,11 +31,11 @@ import java.io.Serializable;
public abstract class PairFunction<T, K, V> extends WrappedFunction1<T, Tuple2<K, V>>
implements Serializable {

public ClassManifest<K> keyType() {
return (ClassManifest<K>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<K> keyType() {
return (ClassTag<K>) ClassTag$.MODULE$.apply(Object.class);
}

public ClassManifest<V> valueType() {
return (ClassManifest<V>) ClassManifest$.MODULE$.fromClass(Object.class);
public ClassTag<V> valueType() {
return (ClassTag<V>) ClassTag$.MODULE$.apply(Object.class);
}
}

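Note: these Java function classes now report ClassTag instead of ClassManifest for their key, value, and return types, and the Java wrappers earlier in the diff feed those tags into the underlying Scala RDD operations. A hedged Scala sketch of the same erased-tag trick (the helper name is illustrative; the runtime class behind the tag is simply java.lang.Object):

    import scala.reflect.ClassTag

    // Same idea as the Java helpers: reuse the AnyRef ClassTag for an unknown type parameter.
    def erasedClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]

    val tag: ClassTag[String] = erasedClassTag[String]  // tag.runtimeClass == classOf[Object]
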
@ -22,18 +22,17 @@ import java.net._
|
|||
import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collections}
|
||||
|
||||
import scala.collection.JavaConversions._
|
||||
import scala.reflect.ClassTag
|
||||
|
||||
import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}
|
||||
import org.apache.spark.broadcast.Broadcast
|
||||
import org.apache.spark._
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.rdd.PipedRDD
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
|
||||
private[spark] class PythonRDD[T: ClassManifest](
|
||||
private[spark] class PythonRDD[T: ClassTag](
|
||||
parent: RDD[T],
|
||||
command: Seq[String],
|
||||
command: Array[Byte],
|
||||
envVars: JMap[String, String],
|
||||
pythonIncludes: JList[String],
|
||||
preservePartitoning: Boolean,
|
||||
|
@ -42,23 +41,12 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
accumulator: Accumulator[JList[Array[Byte]]])
|
||||
extends RDD[Array[Byte]](parent) {
|
||||
|
||||
val bufferSize = System.getProperty("spark.buffer.size", "65536").toInt
|
||||
|
||||
// Similar to Runtime.exec(), if we are given a single string, split it into words
|
||||
// using a standard StringTokenizer (i.e. by spaces)
|
||||
def this(parent: RDD[T], command: String, envVars: JMap[String, String],
|
||||
pythonIncludes: JList[String],
|
||||
preservePartitoning: Boolean, pythonExec: String,
|
||||
broadcastVars: JList[Broadcast[Array[Byte]]],
|
||||
accumulator: Accumulator[JList[Array[Byte]]]) =
|
||||
this(parent, PipedRDD.tokenize(command), envVars, pythonIncludes, preservePartitoning, pythonExec,
|
||||
broadcastVars, accumulator)
|
||||
val bufferSize = conf.get("spark.buffer.size", "65536").toInt
|
||||
|
||||
override def getPartitions = parent.partitions
|
||||
|
||||
override val partitioner = if (preservePartitoning) parent.partitioner else None
|
||||
|
||||
|
||||
override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
|
||||
val startTime = System.currentTimeMillis
|
||||
val env = SparkEnv.get
|
||||
|
@ -71,11 +59,10 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
SparkEnv.set(env)
|
||||
val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)
|
||||
val dataOut = new DataOutputStream(stream)
|
||||
val printOut = new PrintWriter(stream)
|
||||
// Partition index
|
||||
dataOut.writeInt(split.index)
|
||||
// sparkFilesDir
|
||||
PythonRDD.writeAsPickle(SparkFiles.getRootDirectory, dataOut)
|
||||
dataOut.writeUTF(SparkFiles.getRootDirectory)
|
||||
// Broadcast variables
|
||||
dataOut.writeInt(broadcastVars.length)
|
||||
for (broadcast <- broadcastVars) {
|
||||
|
@ -85,21 +72,16 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
}
|
||||
// Python includes (*.zip and *.egg files)
|
||||
dataOut.writeInt(pythonIncludes.length)
|
||||
for (f <- pythonIncludes) {
|
||||
PythonRDD.writeAsPickle(f, dataOut)
|
||||
}
|
||||
pythonIncludes.foreach(dataOut.writeUTF)
|
||||
dataOut.flush()
|
||||
// Serialized user code
|
||||
for (elem <- command) {
|
||||
printOut.println(elem)
|
||||
}
|
||||
printOut.flush()
|
||||
// Serialized command:
|
||||
dataOut.writeInt(command.length)
|
||||
dataOut.write(command)
|
||||
// Data values
|
||||
for (elem <- parent.iterator(split, context)) {
|
||||
PythonRDD.writeAsPickle(elem, dataOut)
|
||||
PythonRDD.writeToStream(elem, dataOut)
|
||||
}
|
||||
dataOut.flush()
|
||||
printOut.flush()
|
||||
worker.shutdownOutput()
|
||||
} catch {
|
||||
case e: IOException =>
|
||||
|
@ -132,7 +114,7 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
val obj = new Array[Byte](length)
|
||||
stream.readFully(obj)
|
||||
obj
|
||||
case -3 =>
|
||||
case SpecialLengths.TIMING_DATA =>
|
||||
// Timing data from worker
|
||||
val bootTime = stream.readLong()
|
||||
val initTime = stream.readLong()
|
||||
|
@ -143,30 +125,30 @@ private[spark] class PythonRDD[T: ClassManifest](
|
|||
val total = finishTime - startTime
|
||||
logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, init, finish))
|
||||
read
|
||||
case -2 =>
|
||||
case SpecialLengths.PYTHON_EXCEPTION_THROWN =>
|
||||
// Signals that an exception has been thrown in python
|
||||
val exLength = stream.readInt()
|
||||
val obj = new Array[Byte](exLength)
|
||||
stream.readFully(obj)
|
||||
throw new PythonException(new String(obj))
|
||||
case -1 =>
|
||||
case SpecialLengths.END_OF_DATA_SECTION =>
|
||||
// We've finished the data section of the output, but we can still
|
||||
// read some accumulator updates; let's do that, breaking when we
|
||||
// get a negative length record.
|
||||
var len2 = stream.readInt()
|
||||
while (len2 >= 0) {
|
||||
val update = new Array[Byte](len2)
|
||||
// read some accumulator updates:
|
||||
val numAccumulatorUpdates = stream.readInt()
|
||||
(1 to numAccumulatorUpdates).foreach { _ =>
|
||||
val updateLen = stream.readInt()
|
||||
val update = new Array[Byte](updateLen)
|
||||
stream.readFully(update)
|
||||
accumulator += Collections.singletonList(update)
|
||||
len2 = stream.readInt()
|
||||
|
||||
}
|
||||
new Array[Byte](0)
|
||||
Array.empty[Byte]
|
||||
}
|
||||
} catch {
|
||||
case eof: EOFException => {
|
||||
throw new SparkException("Python worker exited unexpectedly (crashed)", eof)
|
||||
}
|
||||
case e => throw e
|
||||
case e: Throwable => throw e
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -197,62 +179,15 @@ private class PairwiseRDD(prev: RDD[Array[Byte]]) extends
|
|||
val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this)
|
||||
}
|
||||
|
||||
private object SpecialLengths {
|
||||
val END_OF_DATA_SECTION = -1
|
||||
val PYTHON_EXCEPTION_THROWN = -2
|
||||
val TIMING_DATA = -3
|
||||
}
|
||||
|
private[spark] object PythonRDD {

/** Strips the pickle PROTO and STOP opcodes from the start and end of a pickle */
def stripPickle(arr: Array[Byte]) : Array[Byte] = {
arr.slice(2, arr.length - 1)
}

/**
 * Write strings, pickled Python objects, or pairs of pickled objects to a data output stream.
 * The data format is a 32-bit integer representing the pickled object's length (in bytes),
 * followed by the pickled data.
 *
 * Pickle module:
 *
 * http://docs.python.org/2/library/pickle.html
 *
 * The pickle protocol is documented in the source of the `pickle` and `pickletools` modules:
 *
 * http://hg.python.org/cpython/file/2.6/Lib/pickle.py
 * http://hg.python.org/cpython/file/2.6/Lib/pickletools.py
 *
 * @param elem the object to write
 * @param dOut a data output stream
 */
def writeAsPickle(elem: Any, dOut: DataOutputStream) {
if (elem.isInstanceOf[Array[Byte]]) {
val arr = elem.asInstanceOf[Array[Byte]]
dOut.writeInt(arr.length)
dOut.write(arr)
} else if (elem.isInstanceOf[scala.Tuple2[Array[Byte], Array[Byte]]]) {
val t = elem.asInstanceOf[scala.Tuple2[Array[Byte], Array[Byte]]]
val length = t._1.length + t._2.length - 3 - 3 + 4 // stripPickle() removes 3 bytes
dOut.writeInt(length)
dOut.writeByte(Pickle.PROTO)
dOut.writeByte(Pickle.TWO)
dOut.write(PythonRDD.stripPickle(t._1))
dOut.write(PythonRDD.stripPickle(t._2))
dOut.writeByte(Pickle.TUPLE2)
dOut.writeByte(Pickle.STOP)
} else if (elem.isInstanceOf[String]) {
// For uniformity, strings are wrapped into Pickles.
val s = elem.asInstanceOf[String].getBytes("UTF-8")
val length = 2 + 1 + 4 + s.length + 1
dOut.writeInt(length)
dOut.writeByte(Pickle.PROTO)
dOut.writeByte(Pickle.TWO)
dOut.write(Pickle.BINUNICODE)
dOut.writeInt(Integer.reverseBytes(s.length))
dOut.write(s)
dOut.writeByte(Pickle.STOP)
} else {
throw new SparkException("Unexpected RDD type")
}
}
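Aside on the length arithmetic in the old writeAsPickle above (removed by this commit): the frame for a string is the two-byte pickle header (PROTO, TWO), one BINUNICODE opcode, a four-byte little-endian length, the UTF-8 payload, and a one-byte STOP. A worked example, assuming the payload "spark":

    val s = "spark".getBytes("UTF-8")            // 5 payload bytes
    val framedLength = 2 + 1 + 4 + s.length + 1  // PROTO+TWO, BINUNICODE, length, payload, STOP = 13

For pairs, each stripped pickle loses its 3 framing bytes (hence the two -3 terms) and the wrapper adds 4 bytes of its own: PROTO, TWO, TUPLE2 and STOP.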
def readRDDFromPickleFile(sc: JavaSparkContext, filename: String, parallelism: Int) :
def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int):
JavaRDD[Array[Byte]] = {
val file = new DataInputStream(new FileInputStream(filename))
val objs = new collection.mutable.ArrayBuffer[Array[Byte]]

@@ -265,39 +200,41 @@ private[spark] object PythonRDD {
}
} catch {
case eof: EOFException => {}
case e => throw e
case e: Throwable => throw e
}
JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
}

def writeIteratorToPickleFile[T](items: java.util.Iterator[T], filename: String) {
import scala.collection.JavaConverters._
writeIteratorToPickleFile(items.asScala, filename)
def writeToStream(elem: Any, dataOut: DataOutputStream) {
elem match {
case bytes: Array[Byte] =>
dataOut.writeInt(bytes.length)
dataOut.write(bytes)
case pair: (Array[Byte], Array[Byte]) =>
dataOut.writeInt(pair._1.length)
dataOut.write(pair._1)
dataOut.writeInt(pair._2.length)
dataOut.write(pair._2)
case str: String =>
dataOut.writeUTF(str)
case other =>
throw new SparkException("Unexpected element type " + other.getClass)
}
}
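Aside (illustrative only, not the commit's code): what the new writeToStream framing looks like from the caller's side. The helper below re-implements the three cases against a local DataOutputStream so it is runnable on its own; all names are hypothetical.

    import java.io.{ByteArrayOutputStream, DataOutputStream}

    object WriteToStreamSketch {
      def main(args: Array[String]): Unit = {
        val out = new DataOutputStream(new ByteArrayOutputStream())

        def frame(elem: Any): Unit = elem match {
          case bytes: Array[Byte] =>
            out.writeInt(bytes.length); out.write(bytes)    // length-prefixed blob
          case (a: Array[Byte], b: Array[Byte]) =>
            out.writeInt(a.length); out.write(a)            // key blob
            out.writeInt(b.length); out.write(b)            // value blob
          case str: String =>
            out.writeUTF(str)                               // 2-byte length + modified UTF-8
          case other =>
            throw new IllegalArgumentException("Unexpected element type " + other.getClass)
        }

        frame(Array[Byte](1, 2, 3))
        frame((Array[Byte](1), Array[Byte](2)))
        frame("hello")
        out.flush()
      }
    }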
def writeIteratorToPickleFile[T](items: Iterator[T], filename: String) {
def writeToFile[T](items: java.util.Iterator[T], filename: String) {
import scala.collection.JavaConverters._
writeToFile(items.asScala, filename)
}

def writeToFile[T](items: Iterator[T], filename: String) {
val file = new DataOutputStream(new FileOutputStream(filename))
for (item <- items) {
writeAsPickle(item, file)
writeToStream(item, file)
}
file.close()
}

def takePartition[T](rdd: RDD[T], partition: Int): Iterator[T] = {
implicit val cm : ClassManifest[T] = rdd.elementClassManifest
rdd.context.runJob(rdd, ((x: Iterator[T]) => x.toArray), Seq(partition), true).head.iterator
}
}

private object Pickle {
val PROTO: Byte = 0x80.toByte
val TWO: Byte = 0x02.toByte
val BINUNICODE: Byte = 'X'
val STOP: Byte = '.'
val TUPLE2: Byte = 0x86.toByte
val EMPTY_LIST: Byte = ']'
val MARK: Byte = '('
val APPENDS: Byte = 'e'
}

private class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] {
@@ -313,7 +250,7 @@ private class PythonAccumulatorParam(@transient serverHost: String, serverPort:

Utils.checkHost(serverHost, "Expected hostname")

val bufferSize = System.getProperty("spark.buffer.size", "65536").toInt
val bufferSize = SparkEnv.get.conf.get("spark.buffer.size", "65536").toInt

override def zero(value: JList[Array[Byte]]): JList[Array[Byte]] = new JArrayList
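Aside: the recurring pattern in this commit is to stop reading tunables from JVM system properties and read them from a SparkConf instead. A minimal sketch of the two styles side by side (the key is real, the value purely illustrative):

    import org.apache.spark.SparkConf

    val conf = new SparkConf().set("spark.buffer.size", "131072")
    // Before: System.getProperty("spark.buffer.size", "65536").toInt
    val bufferSize = conf.get("spark.buffer.size", "65536").toInt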
@@ -64,7 +64,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
startDaemon()
new Socket(daemonHost, daemonPort)
}
case e => throw e
case e: Throwable => throw e
}
}
}

@@ -198,7 +198,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
}
}.start()
} catch {
case e => {
case e: Throwable => {
stopDaemon()
throw e
}
@@ -31,8 +31,8 @@ abstract class Broadcast[T](private[spark] val id: Long) extends Serializable {
override def toString = "Broadcast(" + id + ")"
}

private[spark]
class BroadcastManager(val _isDriver: Boolean) extends Logging with Serializable {
private[spark]
class BroadcastManager(val _isDriver: Boolean, conf: SparkConf) extends Logging with Serializable {

private var initialized = false
private var broadcastFactory: BroadcastFactory = null

@@ -43,14 +43,14 @@ class BroadcastManager(val _isDriver: Boolean) extends Logging with Serializable
private def initialize() {
synchronized {
if (!initialized) {
val broadcastFactoryClass = System.getProperty(
val broadcastFactoryClass = conf.get(
"spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")

broadcastFactory =
Class.forName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]

// Initialize appropriate BroadcastFactory and BroadcastObject
broadcastFactory.initialize(isDriver)
broadcastFactory.initialize(isDriver, conf)

initialized = true
}
@@ -17,6 +17,8 @@

package org.apache.spark.broadcast

import org.apache.spark.SparkConf

/**
 * An interface for all the broadcast implementations in Spark (to allow
 * multiple broadcast implementations). SparkContext uses a user-specified

@@ -24,7 +26,7 @@ package org.apache.spark.broadcast
 * entire Spark job.
 */
private[spark] trait BroadcastFactory {
def initialize(isDriver: Boolean): Unit
def initialize(isDriver: Boolean, conf: SparkConf): Unit
def newBroadcast[T](value: T, isLocal: Boolean, id: Long): Broadcast[T]
def stop(): Unit
}
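Aside (sketch, not part of the commit): how a factory is selected and brought up against the new conf-aware BroadcastFactory contract, mirroring BroadcastManager.initialize() above. The property value shown is just an example, and since the trait is private[spark] this only compiles from inside the org.apache.spark package, as the manager itself does.

    import org.apache.spark.SparkConf
    import org.apache.spark.broadcast.BroadcastFactory

    val conf = new SparkConf()
      .set("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

    val factoryClass = conf.get(
      "spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")
    val factory = Class.forName(factoryClass).newInstance.asInstanceOf[BroadcastFactory]
    factory.initialize(true, conf)   // isDriver = true; same call site as BroadcastManager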
@@ -19,18 +19,19 @@ package org.apache.spark.broadcast

import java.io.{File, FileOutputStream, ObjectInputStream, OutputStream}
import java.net.URL
import java.util.concurrent.TimeUnit

import it.unimi.dsi.fastutil.io.FastBufferedInputStream
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream

import org.apache.spark.{HttpServer, Logging, SparkEnv}
import org.apache.spark.{SparkConf, HttpServer, Logging, SparkEnv}
import org.apache.spark.io.CompressionCodec
import org.apache.spark.storage.{BroadcastBlockId, StorageLevel}
import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashSet, Utils}

private[spark] class HttpBroadcast[T](@transient var value_ : T, isLocal: Boolean, id: Long)
extends Broadcast[T](id) with Logging with Serializable {

def value = value_

def blockId = BroadcastBlockId(id)

@@ -39,7 +40,7 @@ private[spark] class HttpBroadcast[T](@transient var value_ : T, isLocal: Boolea
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false)
}

if (!isLocal) {
if (!isLocal) {
HttpBroadcast.write(id, value_)
}

@@ -63,7 +64,7 @@ private[spark] class HttpBroadcast[T](@transient var value_ : T, isLocal: Boolea
}

private[spark] class HttpBroadcastFactory extends BroadcastFactory {
def initialize(isDriver: Boolean) { HttpBroadcast.initialize(isDriver) }
def initialize(isDriver: Boolean, conf: SparkConf) { HttpBroadcast.initialize(isDriver, conf) }

def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long) =
new HttpBroadcast[T](value_, isLocal, id)

@@ -80,42 +81,51 @@ private object HttpBroadcast extends Logging {
private var serverUri: String = null
private var server: HttpServer = null

// TODO: This shouldn't be a global variable so that multiple SparkContexts can coexist
private val files = new TimeStampedHashSet[String]
private val cleaner = new MetadataCleaner(MetadataCleanerType.HTTP_BROADCAST, cleanup)
private var cleaner: MetadataCleaner = null

private lazy val compressionCodec = CompressionCodec.createCodec()
private val httpReadTimeout = TimeUnit.MILLISECONDS.convert(5, TimeUnit.MINUTES).toInt

def initialize(isDriver: Boolean) {
private var compressionCodec: CompressionCodec = null

def initialize(isDriver: Boolean, conf: SparkConf) {
synchronized {
if (!initialized) {
bufferSize = System.getProperty("spark.buffer.size", "65536").toInt
compress = System.getProperty("spark.broadcast.compress", "true").toBoolean
bufferSize = conf.get("spark.buffer.size", "65536").toInt
compress = conf.get("spark.broadcast.compress", "true").toBoolean
if (isDriver) {
createServer()
createServer(conf)
conf.set("spark.httpBroadcast.uri", serverUri)
}
serverUri = System.getProperty("spark.httpBroadcast.uri")
serverUri = conf.get("spark.httpBroadcast.uri")
cleaner = new MetadataCleaner(MetadataCleanerType.HTTP_BROADCAST, cleanup, conf)
compressionCodec = CompressionCodec.createCodec(conf)
initialized = true
}
}
}

def stop() {
synchronized {
if (server != null) {
server.stop()
server = null
}
if (cleaner != null) {
cleaner.cancel()
cleaner = null
}
compressionCodec = null
initialized = false
cleaner.cancel()
}
}

private def createServer() {
broadcastDir = Utils.createTempDir(Utils.getLocalDir)
private def createServer(conf: SparkConf) {
broadcastDir = Utils.createTempDir(Utils.getLocalDir(conf))
server = new HttpServer(broadcastDir)
server.start()
serverUri = server.uri
System.setProperty("spark.httpBroadcast.uri", serverUri)
logInfo("Broadcast server started at " + serverUri)
}

@@ -138,10 +148,13 @@ private object HttpBroadcast extends Logging {
def read[T](id: Long): T = {
val url = serverUri + "/" + BroadcastBlockId(id).name
val in = {
val httpConnection = new URL(url).openConnection()
httpConnection.setReadTimeout(httpReadTimeout)
val inputStream = httpConnection.getInputStream
if (compress) {
compressionCodec.compressedInputStream(new URL(url).openStream())
compressionCodec.compressedInputStream(inputStream)
} else {
new FastBufferedInputStream(new URL(url).openStream(), bufferSize)
new FastBufferedInputStream(inputStream, bufferSize)
}
}
val ser = SparkEnv.get.serializer.newInstance()
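Aside (sketch): the point of the new read path above is to put an explicit read timeout on the HTTP connection, so a wedged broadcast server fails the fetch instead of hanging it forever. A standalone version of that pattern; the helper name and the extra connect timeout are assumptions, not part of the commit.

    import java.io.InputStream
    import java.net.URL
    import java.util.concurrent.TimeUnit

    val httpReadTimeout = TimeUnit.MILLISECONDS.convert(5, TimeUnit.MINUTES).toInt

    def openBroadcastStream(url: String): InputStream = {
      val connection = new URL(url).openConnection()
      connection.setConnectTimeout(httpReadTimeout)  // assumption: also bound connect time
      connection.setReadTimeout(httpReadTimeout)     // what the diff above adds
      connection.getInputStream
    }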
@@ -83,13 +83,13 @@ extends Broadcast[T](id) with Logging with Serializable {
case None =>
val start = System.nanoTime
logInfo("Started reading broadcast variable " + id)

// Initialize @transient variables that will receive garbage values from the master.
resetWorkerVariables()

if (receiveBroadcast(id)) {
value_ = TorrentBroadcast.unBlockifyObject[T](arrayOfBlocks, totalBytes, totalBlocks)

// Store the merged copy in cache so that the next worker doesn't need to rebuild it.
// This creates a tradeoff between memory usage and latency.
// Storing copy doubles the memory footprint; not storing doubles deserialization cost.

@@ -122,14 +122,14 @@ extends Broadcast[T](id) with Logging with Serializable {
while (attemptId > 0 && totalBlocks == -1) {
TorrentBroadcast.synchronized {
SparkEnv.get.blockManager.getSingle(metaId) match {
case Some(x) =>
case Some(x) =>
val tInfo = x.asInstanceOf[TorrentInfo]
totalBlocks = tInfo.totalBlocks
totalBytes = tInfo.totalBytes
arrayOfBlocks = new Array[TorrentBlock](totalBlocks)
hasBlocks = 0

case None =>

case None =>
Thread.sleep(500)
}
}

@@ -145,13 +145,13 @@ extends Broadcast[T](id) with Logging with Serializable {
val pieceId = BroadcastHelperBlockId(broadcastId, "piece" + pid)
TorrentBroadcast.synchronized {
SparkEnv.get.blockManager.getSingle(pieceId) match {
case Some(x) =>
case Some(x) =>
arrayOfBlocks(pid) = x.asInstanceOf[TorrentBlock]
hasBlocks += 1
SparkEnv.get.blockManager.putSingle(
pieceId, arrayOfBlocks(pid), StorageLevel.MEMORY_AND_DISK, true)

case None =>

case None =>
throw new SparkException("Failed to get " + pieceId + " of " + broadcastId)
}
}

@@ -166,21 +166,22 @@ private object TorrentBroadcast
extends Logging {

private var initialized = false

def initialize(_isDriver: Boolean) {
private var conf: SparkConf = null
def initialize(_isDriver: Boolean, conf: SparkConf) {
TorrentBroadcast.conf = conf //TODO: we might have to fix it in tests
synchronized {
if (!initialized) {
initialized = true
}
}
}

def stop() {
initialized = false
}

val BLOCK_SIZE = System.getProperty("spark.broadcast.blockSize", "4096").toInt * 1024

lazy val BLOCK_SIZE = conf.get("spark.broadcast.blockSize", "4096").toInt * 1024

def blockifyObject[T](obj: T): TorrentInfo = {
val byteArray = Utils.serialize[T](obj)
val bais = new ByteArrayInputStream(byteArray)

@@ -209,7 +210,7 @@ extends Logging {
}

def unBlockifyObject[T](arrayOfBlocks: Array[TorrentBlock],
totalBytes: Int,
totalBytes: Int,
totalBlocks: Int): T = {
var retByteArray = new Array[Byte](totalBytes)
for (i <- 0 until totalBlocks) {

@@ -222,23 +223,23 @@ extends Logging {
}

private[spark] case class TorrentBlock(
blockID: Int,
byteArray: Array[Byte])
blockID: Int,
byteArray: Array[Byte])
extends Serializable

private[spark] case class TorrentInfo(
@transient arrayOfBlocks : Array[TorrentBlock],
totalBlocks: Int,
totalBytes: Int)
totalBlocks: Int,
totalBytes: Int)
extends Serializable {

@transient var hasBlocks = 0

@transient var hasBlocks = 0
}

private[spark] class TorrentBroadcastFactory
extends BroadcastFactory {

def initialize(isDriver: Boolean) { TorrentBroadcast.initialize(isDriver) }

def initialize(isDriver: Boolean, conf: SparkConf) { TorrentBroadcast.initialize(isDriver, conf) }

def newBroadcast[T](value_ : T, isLocal: Boolean, id: Long) =
new TorrentBroadcast[T](value_, isLocal, id)
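Aside (illustrative, not the TorrentBroadcast code): splitting a serialized payload into BLOCK_SIZE chunks and reassembling it, which is the job of blockifyObject / unBlockifyObject around the TorrentBlock and TorrentInfo case classes above. Names below are made up for the sketch.

    // 4096 KB default, matching spark.broadcast.blockSize above
    val blockSize = 4096 * 1024

    def blockify(payload: Array[Byte]): Array[Array[Byte]] =
      payload.grouped(blockSize).toArray

    def unBlockify(blocks: Array[Array[Byte]], totalBytes: Int): Array[Byte] = {
      val out = new Array[Byte](totalBytes)
      var offset = 0
      for (block <- blocks) {
        System.arraycopy(block, 0, out, offset, block.length)
        offset += block.length
      }
      out
    }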
@@ -19,7 +19,7 @@ package org.apache.spark.deploy

private[spark] class ApplicationDescription(
val name: String,
val maxCores: Int, /* Integer.MAX_VALUE denotes an unlimited number of cores */
val maxCores: Option[Int],
val memoryPerSlave: Int,
val command: Command,
val sparkHome: String,
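Aside: maxCores changes from an Int with Int.MaxValue as the "unlimited" sentinel to an Option[Int], and the master supplies spark.deploy.defaultCores when the application leaves it unset (see the ApplicationInfo and Master hunks further down). A small sketch of how the two compose, with illustrative numbers:

    val requestedMaxCores: Option[Int] = None   // app did not set spark.cores.max
    val defaultCores = Int.MaxValue             // master-side spark.deploy.defaultCores default

    val myMaxCores = requestedMaxCores.getOrElse(defaultCores)
    val coresGranted = 8
    val coresLeft = myMaxCores - coresGranted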
@@ -17,8 +17,7 @@

package org.apache.spark.deploy

private[spark] object ExecutorState
extends Enumeration("LAUNCHING", "LOADING", "RUNNING", "KILLED", "FAILED", "LOST") {
private[spark] object ExecutorState extends Enumeration {

val LAUNCHING, LOADING, RUNNING, KILLED, FAILED, LOST = Value
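Aside: as part of the Scala 2.10 migration, the Enumeration constructors that took the value names as strings are no longer available, which is why this and the other state objects in the commit now rely on the vals themselves to supply the names. A minimal illustration (the demo object name is made up):

    object ExecutorStateDemo extends Enumeration {
      val LAUNCHING, RUNNING, LOST = Value
    }

    // ExecutorStateDemo.RUNNING.toString == "RUNNING": the name comes from the val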
@ -190,7 +190,7 @@ private[spark] object FaultToleranceTest extends App with Logging {
|
|||
/** Creates a SparkContext, which constructs a Client to interact with our cluster. */
|
||||
def createClient() = {
|
||||
if (sc != null) { sc.stop() }
|
||||
// Counter-hack: Because of a hack in SparkEnv#createFromSystemProperties() that changes this
|
||||
// Counter-hack: Because of a hack in SparkEnv#create() that changes this
|
||||
// property, we need to reset it.
|
||||
System.setProperty("spark.driver.port", "0")
|
||||
sc = new SparkContext(getMasterUrls(masters), "fault-tolerance", containerSparkHome)
|
||||
|
@ -417,4 +417,4 @@ private[spark] object Docker extends Logging {
|
|||
"docker ps -l -q".!(ProcessLogger(line => id = line))
|
||||
new DockerId(id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ import akka.actor.ActorSystem
|
|||
import org.apache.spark.deploy.worker.Worker
|
||||
import org.apache.spark.deploy.master.Master
|
||||
import org.apache.spark.util.Utils
|
||||
import org.apache.spark.Logging
|
||||
import org.apache.spark.{SparkConf, Logging}
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
|
@ -34,16 +34,17 @@ import scala.collection.mutable.ArrayBuffer
|
|||
*/
|
||||
private[spark]
|
||||
class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: Int) extends Logging {
|
||||
|
||||
|
||||
private val localHostname = Utils.localHostName()
|
||||
private val masterActorSystems = ArrayBuffer[ActorSystem]()
|
||||
private val workerActorSystems = ArrayBuffer[ActorSystem]()
|
||||
|
||||
|
||||
def start(): Array[String] = {
|
||||
logInfo("Starting a local Spark cluster with " + numWorkers + " workers.")
|
||||
|
||||
/* Start the Master */
|
||||
val (masterSystem, masterPort, _) = Master.startSystemAndActor(localHostname, 0, 0)
|
||||
val conf = new SparkConf(false)
|
||||
val (masterSystem, masterPort, _) = Master.startSystemAndActor(localHostname, 0, 0, conf)
|
||||
masterActorSystems += masterSystem
|
||||
val masterUrl = "spark://" + localHostname + ":" + masterPort
|
||||
val masters = Array(masterUrl)
|
||||
|
@ -55,16 +56,19 @@ class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: I
|
|||
workerActorSystems += workerSystem
|
||||
}
|
||||
|
||||
return masters
|
||||
masters
|
||||
}
|
||||
|
||||
def stop() {
|
||||
logInfo("Shutting down local Spark cluster.")
|
||||
// Stop the workers before the master so they don't get upset that it disconnected
|
||||
// TODO: In Akka 2.1.x, ActorSystem.awaitTermination hangs when you have remote actors!
|
||||
// This is unfortunate, but for now we just comment it out.
|
||||
workerActorSystems.foreach(_.shutdown())
|
||||
workerActorSystems.foreach(_.awaitTermination())
|
||||
|
||||
//workerActorSystems.foreach(_.awaitTermination())
|
||||
masterActorSystems.foreach(_.shutdown())
|
||||
masterActorSystems.foreach(_.awaitTermination())
|
||||
//masterActorSystems.foreach(_.awaitTermination())
|
||||
masterActorSystems.clear()
|
||||
workerActorSystems.clear()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,10 +34,10 @@ class SparkHadoopUtil {
|
|||
UserGroupInformation.setConfiguration(conf)
|
||||
|
||||
def runAsUser(user: String)(func: () => Unit) {
|
||||
// if we are already running as the user intended there is no reason to do the doAs. It
|
||||
// if we are already running as the user intended there is no reason to do the doAs. It
|
||||
// will actually break secure HDFS access as it doesn't fill in the credentials. Also if
|
||||
// the user is UNKNOWN then we shouldn't be creating a remote unknown user
|
||||
// (this is actually the path spark on yarn takes) since SPARK_USER is initialized only
|
||||
// the user is UNKNOWN then we shouldn't be creating a remote unknown user
|
||||
// (this is actually the path spark on yarn takes) since SPARK_USER is initialized only
|
||||
// in SparkContext.
|
||||
val currentUser = Option(System.getProperty("user.name")).
|
||||
getOrElse(SparkContext.SPARK_UNKNOWN_USER)
|
||||
|
@ -67,11 +67,15 @@ class SparkHadoopUtil {
|
|||
}
|
||||
|
||||
object SparkHadoopUtil {
|
||||
|
||||
private val hadoop = {
|
||||
val yarnMode = java.lang.Boolean.valueOf(System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE")))
|
||||
val yarnMode = java.lang.Boolean.valueOf(
|
||||
System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE")))
|
||||
if (yarnMode) {
|
||||
try {
|
||||
Class.forName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil").newInstance.asInstanceOf[SparkHadoopUtil]
|
||||
Class.forName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil")
|
||||
.newInstance()
|
||||
.asInstanceOf[SparkHadoopUtil]
|
||||
} catch {
|
||||
case th: Throwable => throw new SparkException("Unable to load YARN support", th)
|
||||
}
|
||||
|
|
|
@ -19,21 +19,18 @@ package org.apache.spark.deploy.client
|
|||
|
||||
import java.util.concurrent.TimeoutException
|
||||
|
||||
import akka.actor._
|
||||
import akka.actor.Terminated
|
||||
import akka.pattern.ask
|
||||
import akka.util.Duration
|
||||
import akka.util.duration._
|
||||
import akka.remote.RemoteClientDisconnected
|
||||
import akka.remote.RemoteClientLifeCycleEvent
|
||||
import akka.remote.RemoteClientShutdown
|
||||
import akka.dispatch.Await
|
||||
import scala.concurrent.Await
|
||||
import scala.concurrent.duration._
|
||||
|
||||
import org.apache.spark.Logging
|
||||
import akka.actor._
|
||||
import akka.pattern.ask
|
||||
import akka.remote.{AssociationErrorEvent, DisassociatedEvent, RemotingLifecycleEvent}
|
||||
|
||||
import org.apache.spark.{Logging, SparkConf, SparkException}
|
||||
import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
|
||||
import org.apache.spark.deploy.DeployMessages._
|
||||
import org.apache.spark.deploy.master.Master
|
||||
|
||||
import org.apache.spark.util.AkkaUtils
|
||||
|
||||
/**
|
||||
* The main class used to talk to a Spark deploy cluster. Takes a master URL, an app description,
|
||||
|
@ -45,24 +42,26 @@ private[spark] class Client(
|
|||
actorSystem: ActorSystem,
|
||||
masterUrls: Array[String],
|
||||
appDescription: ApplicationDescription,
|
||||
listener: ClientListener)
|
||||
listener: ClientListener,
|
||||
conf: SparkConf)
|
||||
extends Logging {
|
||||
|
||||
val REGISTRATION_TIMEOUT = 20.seconds
|
||||
val REGISTRATION_RETRIES = 3
|
||||
|
||||
var masterAddress: Address = null
|
||||
var actor: ActorRef = null
|
||||
var appId: String = null
|
||||
var registered = false
|
||||
var activeMasterUrl: String = null
|
||||
|
||||
class ClientActor extends Actor with Logging {
|
||||
var master: ActorRef = null
|
||||
var masterAddress: Address = null
|
||||
var master: ActorSelection = null
|
||||
var alreadyDisconnected = false // To avoid calling listener.disconnected() multiple times
|
||||
var alreadyDead = false // To avoid calling listener.dead() multiple times
|
||||
|
||||
override def preStart() {
|
||||
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
|
||||
try {
|
||||
registerWithMaster()
|
||||
} catch {
|
||||
|
@ -76,7 +75,7 @@ private[spark] class Client(
|
|||
def tryRegisterAllMasters() {
|
||||
for (masterUrl <- masterUrls) {
|
||||
logInfo("Connecting to master " + masterUrl + "...")
|
||||
val actor = context.actorFor(Master.toAkkaUrl(masterUrl))
|
||||
val actor = context.actorSelection(Master.toAkkaUrl(masterUrl))
|
||||
actor ! RegisterApplication(appDescription)
|
||||
}
|
||||
}
|
||||
|
@ -84,6 +83,7 @@ private[spark] class Client(
|
|||
def registerWithMaster() {
|
||||
tryRegisterAllMasters()
|
||||
|
||||
import context.dispatcher
|
||||
var retries = 0
|
||||
lazy val retryTimer: Cancellable =
|
||||
context.system.scheduler.schedule(REGISTRATION_TIMEOUT, REGISTRATION_TIMEOUT) {
|
||||
|
@ -102,10 +102,19 @@ private[spark] class Client(
|
|||
|
||||
def changeMaster(url: String) {
|
||||
activeMasterUrl = url
|
||||
master = context.actorFor(Master.toAkkaUrl(url))
|
||||
masterAddress = master.path.address
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
context.watch(master) // Doesn't work with remote actors, but useful for testing
|
||||
master = context.actorSelection(Master.toAkkaUrl(activeMasterUrl))
|
||||
masterAddress = activeMasterUrl match {
|
||||
case Master.sparkUrlRegex(host, port) =>
|
||||
Address("akka.tcp", Master.systemName, host, port.toInt)
|
||||
case x =>
|
||||
throw new SparkException("Invalid spark URL: " + x)
|
||||
}
|
||||
}
|
||||
|
||||
private def isPossibleMaster(remoteUrl: Address) = {
|
||||
masterUrls.map(s => Master.toAkkaUrl(s))
|
||||
.map(u => AddressFromURIString(u).hostPort)
|
||||
.contains(remoteUrl.hostPort)
|
||||
}
|
||||
|
||||
override def receive = {
|
||||
|
@ -135,22 +144,16 @@ private[spark] class Client(
|
|||
|
||||
case MasterChanged(masterUrl, masterWebUiUrl) =>
|
||||
logInfo("Master has changed, new master is at " + masterUrl)
|
||||
context.unwatch(master)
|
||||
changeMaster(masterUrl)
|
||||
alreadyDisconnected = false
|
||||
sender ! MasterChangeAcknowledged(appId)
|
||||
|
||||
case Terminated(actor_) if actor_ == master =>
|
||||
logWarning("Connection to master failed; waiting for master to reconnect...")
|
||||
case DisassociatedEvent(_, address, _) if address == masterAddress =>
|
||||
logWarning(s"Connection to $address failed; waiting for master to reconnect...")
|
||||
markDisconnected()
|
||||
|
||||
case RemoteClientDisconnected(transport, address) if address == masterAddress =>
|
||||
logWarning("Connection to master failed; waiting for master to reconnect...")
|
||||
markDisconnected()
|
||||
|
||||
case RemoteClientShutdown(transport, address) if address == masterAddress =>
|
||||
logWarning("Connection to master failed; waiting for master to reconnect...")
|
||||
markDisconnected()
|
||||
case AssociationErrorEvent(cause, _, address, _) if isPossibleMaster(address) =>
|
||||
logWarning(s"Could not connect to $address: $cause")
|
||||
|
||||
case StopClient =>
|
||||
markDead()
|
||||
|
@ -184,7 +187,7 @@ private[spark] class Client(
|
|||
def stop() {
|
||||
if (actor != null) {
|
||||
try {
|
||||
val timeout = Duration.create(System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")
|
||||
val timeout = AkkaUtils.askTimeout(conf)
|
||||
val future = actor.ask(StopClient)(timeout)
|
||||
Await.result(future, timeout)
|
||||
} catch {
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
package org.apache.spark.deploy.client
|
||||
|
||||
import org.apache.spark.util.{Utils, AkkaUtils}
|
||||
import org.apache.spark.{Logging}
|
||||
import org.apache.spark.{SparkConf, SparkContext, Logging}
|
||||
import org.apache.spark.deploy.{Command, ApplicationDescription}
|
||||
|
||||
private[spark] object TestClient {
|
||||
|
@ -45,11 +45,13 @@ private[spark] object TestClient {
|
|||
|
||||
def main(args: Array[String]) {
|
||||
val url = args(0)
|
||||
val (actorSystem, port) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0)
|
||||
val (actorSystem, port) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0,
|
||||
conf = new SparkConf)
|
||||
val desc = new ApplicationDescription(
|
||||
"TestClient", 1, 512, Command("spark.deploy.client.TestExecutor", Seq(), Map()), "dummy-spark-home", "ignored")
|
||||
"TestClient", Some(1), 512, Command("spark.deploy.client.TestExecutor", Seq(), Map()),
|
||||
"dummy-spark-home", "ignored")
|
||||
val listener = new TestListener
|
||||
val client = new Client(actorSystem, Array(url), desc, listener)
|
||||
val client = new Client(actorSystem, Array(url), desc, listener, new SparkConf)
|
||||
client.start()
|
||||
actorSystem.awaitTermination()
|
||||
}
|
||||
|
|
|
@ -28,7 +28,8 @@ private[spark] class ApplicationInfo(
|
|||
val desc: ApplicationDescription,
|
||||
val submitDate: Date,
|
||||
val driver: ActorRef,
|
||||
val appUiUrl: String)
|
||||
val appUiUrl: String,
|
||||
defaultCores: Int)
|
||||
extends Serializable {
|
||||
|
||||
@transient var state: ApplicationState.Value = _
|
||||
|
@ -81,7 +82,9 @@ private[spark] class ApplicationInfo(
|
|||
}
|
||||
}
|
||||
|
||||
def coresLeft: Int = desc.maxCores - coresGranted
|
||||
private val myMaxCores = desc.maxCores.getOrElse(defaultCores)
|
||||
|
||||
def coresLeft: Int = myMaxCores - coresGranted
|
||||
|
||||
private var _retryCount = 0
|
||||
|
||||
|
|
|
@ -17,8 +17,7 @@
|
|||
|
||||
package org.apache.spark.deploy.master
|
||||
|
||||
private[spark] object ApplicationState
|
||||
extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED", "UNKNOWN") {
|
||||
private[spark] object ApplicationState extends Enumeration {
|
||||
|
||||
type ApplicationState = Value
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@ private[spark] class FileSystemPersistenceEngine(
|
|||
(apps, workers)
|
||||
}
|
||||
|
||||
private def serializeIntoFile(file: File, value: Serializable) {
|
||||
private def serializeIntoFile(file: File, value: AnyRef) {
|
||||
val created = file.createNewFile()
|
||||
if (!created) { throw new IllegalStateException("Could not create file: " + file) }
|
||||
|
||||
|
@ -77,13 +77,13 @@ private[spark] class FileSystemPersistenceEngine(
|
|||
out.close()
|
||||
}
|
||||
|
||||
def deserializeFromFile[T <: Serializable](file: File)(implicit m: Manifest[T]): T = {
|
||||
def deserializeFromFile[T](file: File)(implicit m: Manifest[T]): T = {
|
||||
val fileData = new Array[Byte](file.length().asInstanceOf[Int])
|
||||
val dis = new DataInputStream(new FileInputStream(file))
|
||||
dis.readFully(fileData)
|
||||
dis.close()
|
||||
|
||||
val clazz = m.erasure.asInstanceOf[Class[T]]
|
||||
val clazz = m.runtimeClass.asInstanceOf[Class[T]]
|
||||
val serializer = serialization.serializerFor(clazz)
|
||||
serializer.fromBinary(fileData).asInstanceOf[T]
|
||||
}
|
||||
|
|
|
@ -17,21 +17,19 @@
|
|||
|
||||
package org.apache.spark.deploy.master
|
||||
|
||||
import java.util.Date
|
||||
import java.text.SimpleDateFormat
|
||||
import java.util.Date
|
||||
|
||||
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
|
||||
import scala.concurrent.Await
|
||||
import scala.concurrent.duration._
|
||||
|
||||
import akka.actor._
|
||||
import akka.actor.Terminated
|
||||
import akka.dispatch.Await
|
||||
import akka.pattern.ask
|
||||
import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientDisconnected, RemoteClientShutdown}
|
||||
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
|
||||
import akka.serialization.SerializationExtension
|
||||
import akka.util.duration._
|
||||
import akka.util.{Duration, Timeout}
|
||||
|
||||
import org.apache.spark.{Logging, SparkException}
|
||||
import org.apache.spark.{SparkConf, SparkContext, Logging, SparkException}
|
||||
import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
|
||||
import org.apache.spark.deploy.DeployMessages._
|
||||
import org.apache.spark.deploy.master.MasterMessages._
|
||||
|
@ -40,12 +38,16 @@ import org.apache.spark.metrics.MetricsSystem
|
|||
import org.apache.spark.util.{AkkaUtils, Utils}
|
||||
|
||||
private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Actor with Logging {
|
||||
import context.dispatcher // to use Akka's scheduler.schedule()
|
||||
|
||||
val conf = new SparkConf
|
||||
|
||||
val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs
|
||||
val WORKER_TIMEOUT = System.getProperty("spark.worker.timeout", "60").toLong * 1000
|
||||
val RETAINED_APPLICATIONS = System.getProperty("spark.deploy.retainedApplications", "200").toInt
|
||||
val REAPER_ITERATIONS = System.getProperty("spark.dead.worker.persistence", "15").toInt
|
||||
val RECOVERY_DIR = System.getProperty("spark.deploy.recoveryDirectory", "")
|
||||
val RECOVERY_MODE = System.getProperty("spark.deploy.recoveryMode", "NONE")
|
||||
val WORKER_TIMEOUT = conf.get("spark.worker.timeout", "60").toLong * 1000
|
||||
val RETAINED_APPLICATIONS = conf.get("spark.deploy.retainedApplications", "200").toInt
|
||||
val REAPER_ITERATIONS = conf.get("spark.dead.worker.persistence", "15").toInt
|
||||
val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")
|
||||
val RECOVERY_MODE = conf.get("spark.deploy.recoveryMode", "NONE")
|
||||
|
||||
var nextAppNumber = 0
|
||||
val workers = new HashSet[WorkerInfo]
|
||||
|
@ -61,12 +63,10 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
val waitingApps = new ArrayBuffer[ApplicationInfo]
|
||||
val completedApps = new ArrayBuffer[ApplicationInfo]
|
||||
|
||||
var firstApp: Option[ApplicationInfo] = None
|
||||
|
||||
Utils.checkHost(host, "Expected hostname")
|
||||
|
||||
val masterMetricsSystem = MetricsSystem.createMetricsSystem("master")
|
||||
val applicationMetricsSystem = MetricsSystem.createMetricsSystem("applications")
|
||||
val masterMetricsSystem = MetricsSystem.createMetricsSystem("master", conf)
|
||||
val applicationMetricsSystem = MetricsSystem.createMetricsSystem("applications", conf)
|
||||
val masterSource = new MasterSource(this)
|
||||
|
||||
val webUi = new MasterWebUI(this, webUiPort)
|
||||
|
@ -88,12 +88,18 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
// As a temporary workaround before better ways of configuring memory, we allow users to set
|
||||
// a flag that will perform round-robin scheduling across the nodes (spreading out each app
|
||||
// among all the nodes) instead of trying to consolidate each app onto a small # of nodes.
|
||||
val spreadOutApps = System.getProperty("spark.deploy.spreadOut", "true").toBoolean
|
||||
val spreadOutApps = conf.getBoolean("spark.deploy.spreadOut", true)
|
||||
|
||||
// Default maxCores for applications that don't specify it (i.e. pass Int.MaxValue)
|
||||
val defaultCores = conf.getInt("spark.deploy.defaultCores", Int.MaxValue)
|
||||
if (defaultCores < 1) {
|
||||
throw new SparkException("spark.deploy.defaultCores must be positive")
|
||||
}
|
||||
|
||||
override def preStart() {
|
||||
logInfo("Starting Spark master at " + masterUrl)
|
||||
// Listen for remote client disconnection events, since they don't go through Akka's watch()
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
|
||||
webUi.start()
|
||||
masterWebUiUrl = "http://" + masterPublicAddress + ":" + webUi.boundPort.get
|
||||
context.system.scheduler.schedule(0 millis, WORKER_TIMEOUT millis, self, CheckForWorkerTimeOut)
|
||||
|
@ -105,7 +111,7 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
persistenceEngine = RECOVERY_MODE match {
|
||||
case "ZOOKEEPER" =>
|
||||
logInfo("Persisting recovery state to ZooKeeper")
|
||||
new ZooKeeperPersistenceEngine(SerializationExtension(context.system))
|
||||
new ZooKeeperPersistenceEngine(SerializationExtension(context.system), conf)
|
||||
case "FILESYSTEM" =>
|
||||
logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
|
||||
new FileSystemPersistenceEngine(RECOVERY_DIR, SerializationExtension(context.system))
|
||||
|
@ -113,13 +119,12 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
new BlackHolePersistenceEngine()
|
||||
}
|
||||
|
||||
leaderElectionAgent = context.actorOf(Props(
|
||||
RECOVERY_MODE match {
|
||||
leaderElectionAgent = RECOVERY_MODE match {
|
||||
case "ZOOKEEPER" =>
|
||||
new ZooKeeperLeaderElectionAgent(self, masterUrl)
|
||||
context.actorOf(Props(classOf[ZooKeeperLeaderElectionAgent], self, masterUrl, conf))
|
||||
case _ =>
|
||||
new MonarchyLeaderAgent(self)
|
||||
}))
|
||||
context.actorOf(Props(classOf[MonarchyLeaderAgent], self))
|
||||
}
|
||||
}
|
||||
|
||||
override def preRestart(reason: Throwable, message: Option[Any]) {
|
||||
|
@ -142,9 +147,7 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
RecoveryState.ALIVE
|
||||
else
|
||||
RecoveryState.RECOVERING
|
||||
|
||||
logInfo("I have been elected leader! New state: " + state)
|
||||
|
||||
if (state == RecoveryState.RECOVERING) {
|
||||
beginRecovery(storedApps, storedWorkers)
|
||||
context.system.scheduler.scheduleOnce(WORKER_TIMEOUT millis) { completeRecovery() }
|
||||
|
@ -156,7 +159,7 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
System.exit(0)
|
||||
}
|
||||
|
||||
case RegisterWorker(id, host, workerPort, cores, memory, webUiPort, publicAddress) => {
|
||||
case RegisterWorker(id, workerHost, workerPort, cores, memory, workerWebUiPort, publicAddress) => {
|
||||
logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
|
||||
host, workerPort, cores, Utils.megabytesToString(memory)))
|
||||
if (state == RecoveryState.STANDBY) {
|
||||
|
@ -164,9 +167,9 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
} else if (idToWorker.contains(id)) {
|
||||
sender ! RegisterWorkerFailed("Duplicate worker ID")
|
||||
} else {
|
||||
val worker = new WorkerInfo(id, host, port, cores, memory, sender, webUiPort, publicAddress)
|
||||
val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
|
||||
sender, workerWebUiPort, publicAddress)
|
||||
registerWorker(worker)
|
||||
context.watch(sender) // This doesn't work with remote actors but helps for testing
|
||||
persistenceEngine.addWorker(worker)
|
||||
sender ! RegisteredWorker(masterUrl, masterWebUiUrl)
|
||||
schedule()
|
||||
|
@ -181,7 +184,6 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
val app = createApplication(description, sender)
|
||||
registerApplication(app)
|
||||
logInfo("Registered app " + description.name + " with ID " + app.id)
|
||||
context.watch(sender) // This doesn't work with remote actors but helps for testing
|
||||
persistenceEngine.addApplication(app)
|
||||
sender ! RegisteredApplication(app.id, masterUrl)
|
||||
schedule()
|
||||
|
@ -257,23 +259,9 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
if (canCompleteRecovery) { completeRecovery() }
|
||||
}
|
||||
|
||||
case Terminated(actor) => {
|
||||
// The disconnected actor could've been either a worker or an app; remove whichever of
|
||||
// those we have an entry for in the corresponding actor hashmap
|
||||
actorToWorker.get(actor).foreach(removeWorker)
|
||||
actorToApp.get(actor).foreach(finishApplication)
|
||||
if (state == RecoveryState.RECOVERING && canCompleteRecovery) { completeRecovery() }
|
||||
}
|
||||
|
||||
case RemoteClientDisconnected(transport, address) => {
|
||||
// The disconnected client could've been either a worker or an app; remove whichever it was
|
||||
addressToWorker.get(address).foreach(removeWorker)
|
||||
addressToApp.get(address).foreach(finishApplication)
|
||||
if (state == RecoveryState.RECOVERING && canCompleteRecovery) { completeRecovery() }
|
||||
}
|
||||
|
||||
case RemoteClientShutdown(transport, address) => {
|
||||
case DisassociatedEvent(_, address, _) => {
|
||||
// The disconnected client could've been either a worker or an app; remove whichever it was
|
||||
logInfo(s"$address got disassociated, removing it.")
|
||||
addressToWorker.get(address).foreach(removeWorker)
|
||||
addressToApp.get(address).foreach(finishApplication)
|
||||
if (state == RecoveryState.RECOVERING && canCompleteRecovery) { completeRecovery() }
|
||||
|
@ -444,7 +432,8 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
def createApplication(desc: ApplicationDescription, driver: ActorRef): ApplicationInfo = {
|
||||
val now = System.currentTimeMillis()
|
||||
val date = new Date(now)
|
||||
new ApplicationInfo(now, newApplicationId(date), desc, date, driver, desc.appUiUrl)
|
||||
new ApplicationInfo(
|
||||
now, newApplicationId(date), desc, date, driver, desc.appUiUrl, defaultCores)
|
||||
}
|
||||
|
||||
def registerApplication(app: ApplicationInfo): Unit = {
|
||||
|
@ -459,14 +448,6 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
|
|||
idToApp(app.id) = app
|
||||
actorToApp(app.driver) = app
|
||||
addressToApp(appAddress) = app
|
||||
if (firstApp == None) {
|
||||
firstApp = Some(app)
|
||||
}
|
||||
// TODO: What is firstApp?? Can we remove it?
|
||||
val workersAlive = workers.filter(_.state == WorkerState.ALIVE).toArray
|
||||
if (workersAlive.size > 0 && !workersAlive.exists(_.memoryFree >= app.desc.memoryPerSlave)) {
|
||||
logWarning("Could not find any workers with enough memory for " + firstApp.get.id)
|
||||
}
|
||||
waitingApps += app
|
||||
}
|
||||
|
||||
|
@@ -523,41 +504,42 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
removeWorker(worker)
} else {
if (worker.lastHeartbeat < currentTime - ((REAPER_ITERATIONS + 1) * WORKER_TIMEOUT))
workers -= worker // we've seen this DEAD worker in the UI, etc. for long enough; cull it
workers -= worker // we've seen this DEAD worker in the UI, etc. for long enough; cull it
}
}
}
}

private[spark] object Master {
private val systemName = "sparkMaster"
val systemName = "sparkMaster"
private val actorName = "Master"
private val sparkUrlRegex = "spark://([^:]+):([0-9]+)".r
val sparkUrlRegex = "spark://([^:]+):([0-9]+)".r

def main(argStrings: Array[String]) {
val args = new MasterArguments(argStrings)
val (actorSystem, _, _) = startSystemAndActor(args.host, args.port, args.webUiPort)
val conf = new SparkConf
val args = new MasterArguments(argStrings, conf)
val (actorSystem, _, _) = startSystemAndActor(args.host, args.port, args.webUiPort, conf)
actorSystem.awaitTermination()
}

/** Returns an `akka://...` URL for the Master actor given a sparkUrl `spark://host:ip`. */
/** Returns an `akka.tcp://...` URL for the Master actor given a sparkUrl `spark://host:ip`. */
def toAkkaUrl(sparkUrl: String): String = {
sparkUrl match {
case sparkUrlRegex(host, port) =>
"akka://%s@%s:%s/user/%s".format(systemName, host, port, actorName)
"akka.tcp://%s@%s:%s/user/%s".format(systemName, host, port, actorName)
case _ =>
throw new SparkException("Invalid master URL: " + sparkUrl)
}
}

def startSystemAndActor(host: String, port: Int, webUiPort: Int): (ActorSystem, Int, Int) = {
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port)
val actor = actorSystem.actorOf(Props(new Master(host, boundPort, webUiPort)), name = actorName)
val timeoutDuration = Duration.create(
System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")
implicit val timeout = Timeout(timeoutDuration)
val respFuture = actor ? RequestWebUIPort // ask pattern
val resp = Await.result(respFuture, timeoutDuration).asInstanceOf[WebUIPortResponse]
def startSystemAndActor(host: String, port: Int, webUiPort: Int, conf: SparkConf)
: (ActorSystem, Int, Int) =
{
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port, conf = conf)
val actor = actorSystem.actorOf(Props(classOf[Master], host, boundPort, webUiPort), actorName)
val timeout = AkkaUtils.askTimeout(conf)
val respFuture = actor.ask(RequestWebUIPort)(timeout)
val resp = Await.result(respFuture, timeout).asInstanceOf[WebUIPortResponse]
(actorSystem, boundPort, resp.webUIBoundPort)
}
}
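Aside: with systemName and sparkUrlRegex now public, other components (for example the client's changeMaster hunk earlier in this commit) can derive the Akka address themselves. A self-contained example of the translation, with a hypothetical host name:

    val sparkUrlRegex = "spark://([^:]+):([0-9]+)".r
    val sparkUrl = "spark://master-host.example.com:7077"   // illustrative master URL

    val akkaUrl = sparkUrl match {
      case sparkUrlRegex(host, port) =>
        "akka.tcp://%s@%s:%s/user/%s".format("sparkMaster", host, port, "Master")
      case _ =>
        throw new IllegalArgumentException("Invalid master URL: " + sparkUrl)
    }
    // akkaUrl == "akka.tcp://sparkMaster@master-host.example.com:7077/user/Master"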
|
|
|
@ -18,16 +18,17 @@
|
|||
package org.apache.spark.deploy.master
|
||||
|
||||
import org.apache.spark.util.{Utils, IntParam}
|
||||
import org.apache.spark.SparkConf
|
||||
|
||||
/**
|
||||
* Command-line parser for the master.
|
||||
*/
|
||||
private[spark] class MasterArguments(args: Array[String]) {
|
||||
private[spark] class MasterArguments(args: Array[String], conf: SparkConf) {
|
||||
var host = Utils.localHostName()
|
||||
var port = 7077
|
||||
var webUiPort = 8080
|
||||
|
||||
// Check for settings in environment variables
|
||||
|
||||
// Check for settings in environment variables
|
||||
if (System.getenv("SPARK_MASTER_HOST") != null) {
|
||||
host = System.getenv("SPARK_MASTER_HOST")
|
||||
}
|
||||
|
@ -37,8 +38,8 @@ private[spark] class MasterArguments(args: Array[String]) {
|
|||
if (System.getenv("SPARK_MASTER_WEBUI_PORT") != null) {
|
||||
webUiPort = System.getenv("SPARK_MASTER_WEBUI_PORT").toInt
|
||||
}
|
||||
if (System.getProperty("master.ui.port") != null) {
|
||||
webUiPort = System.getProperty("master.ui.port").toInt
|
||||
if (conf.contains("master.ui.port")) {
|
||||
webUiPort = conf.get("master.ui.port").toInt
|
||||
}
|
||||
|
||||
parse(args.toList)
|
||||
|
|
|
@ -17,9 +17,7 @@
|
|||
|
||||
package org.apache.spark.deploy.master
|
||||
|
||||
private[spark] object RecoveryState
|
||||
extends Enumeration("STANDBY", "ALIVE", "RECOVERING", "COMPLETING_RECOVERY") {
|
||||
|
||||
private[spark] object RecoveryState extends Enumeration {
|
||||
type MasterState = Value
|
||||
|
||||
val STANDBY, ALIVE, RECOVERING, COMPLETING_RECOVERY = Value
|
||||
|
|
|
@ -18,12 +18,12 @@
|
|||
package org.apache.spark.deploy.master
|
||||
|
||||
import scala.collection.JavaConversions._
|
||||
import scala.concurrent.ops._
|
||||
|
||||
import org.apache.spark.Logging
|
||||
import org.apache.zookeeper._
|
||||
import org.apache.zookeeper.data.Stat
|
||||
import org.apache.zookeeper.Watcher.Event.KeeperState
|
||||
import org.apache.zookeeper.data.Stat
|
||||
|
||||
import org.apache.spark.{SparkConf, Logging}
|
||||
|
||||
/**
|
||||
* Provides a Scala-side interface to the standard ZooKeeper client, with the addition of retry
|
||||
|
@ -33,10 +33,11 @@ import org.apache.zookeeper.Watcher.Event.KeeperState
|
|||
* informed via zkDown().
|
||||
*
|
||||
* Additionally, all commands sent to ZooKeeper will be retried until they either fail too many
|
||||
* times or a semantic exception is thrown (e.g.., "node already exists").
|
||||
* times or a semantic exception is thrown (e.g., "node already exists").
|
||||
*/
|
||||
private[spark] class SparkZooKeeperSession(zkWatcher: SparkZooKeeperWatcher) extends Logging {
|
||||
val ZK_URL = System.getProperty("spark.deploy.zookeeper.url", "")
|
||||
private[spark] class SparkZooKeeperSession(zkWatcher: SparkZooKeeperWatcher,
|
||||
conf: SparkConf) extends Logging {
|
||||
val ZK_URL = conf.get("spark.deploy.zookeeper.url", "")
|
||||
|
||||
val ZK_ACL = ZooDefs.Ids.OPEN_ACL_UNSAFE
|
||||
val ZK_TIMEOUT_MILLIS = 30000
|
||||
|
@ -103,6 +104,7 @@ private[spark] class SparkZooKeeperSession(zkWatcher: SparkZooKeeperWatcher) ext
|
|||
connectToZooKeeper()
|
||||
case KeeperState.Disconnected =>
|
||||
logWarning("ZooKeeper disconnected, will retry...")
|
||||
case s => // Do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -179,7 +181,7 @@ private[spark] class SparkZooKeeperSession(zkWatcher: SparkZooKeeperWatcher) ext
} catch {
case e: KeeperException.NoNodeException => throw e
case e: KeeperException.NodeExistsException => throw e
case e if n > 0 =>
case e: Exception if n > 0 =>
logError("ZooKeeper exception, " + n + " more retries...", e)
Thread.sleep(RETRY_WAIT_MILLIS)
retry(fn, n-1)
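Aside (generic sketch of the retry policy above, not the class's actual signature): "semantic" ZooKeeper failures propagate immediately, anything else is retried a bounded number of times with a pause in between.

    import org.apache.zookeeper.KeeperException

    def retry[T](n: Int, waitMillis: Long)(fn: => T): T =
      try fn catch {
        case e: KeeperException.NoNodeException => throw e
        case e: KeeperException.NodeExistsException => throw e
        case e: Exception if n > 0 =>
          Thread.sleep(waitMillis)
          retry(n - 1, waitMillis)(fn)
      }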
@ -17,9 +17,7 @@
|
|||
|
||||
package org.apache.spark.deploy.master
|
||||
|
||||
private[spark] object WorkerState
|
||||
extends Enumeration("ALIVE", "DEAD", "DECOMMISSIONED", "UNKNOWN") {
|
||||
|
||||
private[spark] object WorkerState extends Enumeration {
|
||||
type WorkerState = Value
|
||||
|
||||
val ALIVE, DEAD, DECOMMISSIONED, UNKNOWN = Value
|
||||
|
|
|
@ -21,16 +21,17 @@ import akka.actor.ActorRef
|
|||
import org.apache.zookeeper._
|
||||
import org.apache.zookeeper.Watcher.Event.EventType
|
||||
|
||||
import org.apache.spark.{SparkConf, Logging}
|
||||
import org.apache.spark.deploy.master.MasterMessages._
|
||||
import org.apache.spark.Logging
|
||||
|
||||
private[spark] class ZooKeeperLeaderElectionAgent(val masterActor: ActorRef, masterUrl: String)
|
||||
private[spark] class ZooKeeperLeaderElectionAgent(val masterActor: ActorRef,
|
||||
masterUrl: String, conf: SparkConf)
|
||||
extends LeaderElectionAgent with SparkZooKeeperWatcher with Logging {
|
||||
|
||||
val WORKING_DIR = System.getProperty("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"
|
||||
val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"
|
||||
|
||||
private val watcher = new ZooKeeperWatcher()
|
||||
private val zk = new SparkZooKeeperSession(this)
|
||||
private val zk = new SparkZooKeeperSession(this, conf)
|
||||
private var status = LeadershipStatus.NOT_LEADER
|
||||
private var myLeaderFile: String = _
|
||||
private var leaderUrl: String = _
|
||||
|
@ -105,7 +106,7 @@ private[spark] class ZooKeeperLeaderElectionAgent(val masterActor: ActorRef, mas
|
|||
// We found a different master file pointing to this process.
|
||||
// This can happen in the following two cases:
|
||||
// (1) The master process was restarted on the same node.
|
||||
// (2) The ZK server died between creating the node and returning the name of the node.
|
||||
// (2) The ZK server died between creating the file and returning the name of the file.
|
||||
// For this case, we will end up creating a second file, and MUST explicitly delete the
|
||||
// first one, since our ZK session is still open.
|
||||
// Note that this deletion will cause a NodeDeleted event to be fired so we check again for
|
||||
|
|
|
@ -17,19 +17,19 @@
|
|||
|
||||
package org.apache.spark.deploy.master
|
||||
|
||||
import org.apache.spark.Logging
|
||||
import org.apache.spark.{SparkConf, Logging}
|
||||
import org.apache.zookeeper._
|
||||
|
||||
import akka.serialization.Serialization
|
||||
|
||||
class ZooKeeperPersistenceEngine(serialization: Serialization)
|
||||
class ZooKeeperPersistenceEngine(serialization: Serialization, conf: SparkConf)
|
||||
extends PersistenceEngine
|
||||
with SparkZooKeeperWatcher
|
||||
with Logging
|
||||
{
|
||||
val WORKING_DIR = System.getProperty("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
|
||||
val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
|
||||
|
||||
val zk = new SparkZooKeeperSession(this)
|
||||
val zk = new SparkZooKeeperSession(this, conf)
|
||||
|
||||
zk.connect()
|
||||
|
||||
|
@ -70,15 +70,15 @@ class ZooKeeperPersistenceEngine(serialization: Serialization)
|
|||
(apps, workers)
|
||||
}
|
||||
|
||||
private def serializeIntoFile(path: String, value: Serializable) {
|
||||
private def serializeIntoFile(path: String, value: AnyRef) {
|
||||
val serializer = serialization.findSerializerFor(value)
|
||||
val serialized = serializer.toBinary(value)
|
||||
zk.create(path, serialized, CreateMode.PERSISTENT)
|
||||
}
|
||||
|
||||
def deserializeFromFile[T <: Serializable](filename: String)(implicit m: Manifest[T]): T = {
|
||||
def deserializeFromFile[T](filename: String)(implicit m: Manifest[T]): T = {
|
||||
val fileData = zk.getData("/spark/master_status/" + filename)
|
||||
val clazz = m.erasure.asInstanceOf[Class[T]]
|
||||
val clazz = m.runtimeClass.asInstanceOf[Class[T]]
|
||||
val serializer = serialization.serializerFor(clazz)
|
||||
serializer.fromBinary(fileData).asInstanceOf[T]
|
||||
}
|
||||
|
|
|
@ -17,31 +17,28 @@
|
|||
|
||||
package org.apache.spark.deploy.master.ui
|
||||
|
||||
import scala.concurrent.Await
|
||||
import scala.xml.Node
|
||||
|
||||
import akka.dispatch.Await
|
||||
import akka.pattern.ask
|
||||
import akka.util.duration._
|
||||
|
||||
import javax.servlet.http.HttpServletRequest
|
||||
|
||||
import net.liftweb.json.JsonAST.JValue
|
||||
|
||||
import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState}
|
||||
import org.apache.spark.deploy.JsonProtocol
|
||||
import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState}
|
||||
import org.apache.spark.deploy.master.ExecutorInfo
|
||||
import org.apache.spark.ui.UIUtils
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
private[spark] class ApplicationPage(parent: MasterWebUI) {
|
||||
val master = parent.masterActorRef
|
||||
implicit val timeout = parent.timeout
|
||||
val timeout = parent.timeout
|
||||
|
||||
/** Executor details for a particular application */
|
||||
def renderJson(request: HttpServletRequest): JValue = {
|
||||
val appId = request.getParameter("appId")
|
||||
val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse]
|
||||
val state = Await.result(stateFuture, 30 seconds)
|
||||
val state = Await.result(stateFuture, timeout)
|
||||
val app = state.activeApps.find(_.id == appId).getOrElse({
|
||||
state.completedApps.find(_.id == appId).getOrElse(null)
|
||||
})
|
||||
|
@ -52,7 +49,7 @@ private[spark] class ApplicationPage(parent: MasterWebUI) {
|
|||
def render(request: HttpServletRequest): Seq[Node] = {
|
||||
val appId = request.getParameter("appId")
|
||||
val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse]
|
||||
val state = Await.result(stateFuture, 30 seconds)
|
||||
val state = Await.result(stateFuture, timeout)
|
||||
val app = state.activeApps.find(_.id == appId).getOrElse({
|
||||
state.completedApps.find(_.id == appId).getOrElse(null)
|
||||
})
|
||||
|
|
|
@ -17,37 +17,33 @@
|
|||
|
||||
package org.apache.spark.deploy.master.ui
|
||||
|
||||
import javax.servlet.http.HttpServletRequest
|
||||
|
||||
import scala.concurrent.Await
|
||||
import scala.xml.Node
|
||||
|
||||
import akka.dispatch.Await
|
||||
import akka.pattern.ask
|
||||
import akka.util.duration._
|
||||
|
||||
import javax.servlet.http.HttpServletRequest
|
||||
import net.liftweb.json.JsonAST.JValue
|
||||
|
||||
import org.apache.spark.deploy.DeployWebUI
|
||||
import org.apache.spark.deploy.{DeployWebUI, JsonProtocol}
|
||||
import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState}
|
||||
import org.apache.spark.deploy.JsonProtocol
|
||||
import org.apache.spark.deploy.master.{ApplicationInfo, WorkerInfo}
|
||||
import org.apache.spark.ui.UIUtils
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
private[spark] class IndexPage(parent: MasterWebUI) {
|
||||
val master = parent.masterActorRef
|
||||
implicit val timeout = parent.timeout
|
||||
val timeout = parent.timeout
|
||||
|
||||
def renderJson(request: HttpServletRequest): JValue = {
|
||||
val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse]
|
||||
val state = Await.result(stateFuture, 30 seconds)
|
||||
val state = Await.result(stateFuture, timeout)
|
||||
JsonProtocol.writeMasterState(state)
|
||||
}
|
||||
|
||||
/** Index view listing applications and executors */
|
||||
def render(request: HttpServletRequest): Seq[Node] = {
|
||||
val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse]
|
||||
val state = Await.result(stateFuture, 30 seconds)
|
||||
val state = Await.result(stateFuture, timeout)
|
||||
|
||||
val workerHeaders = Seq("Id", "Address", "State", "Cores", "Memory")
|
||||
val workers = state.workers.sortBy(_.id)
|
||||
|
|
|
@@ -17,25 +17,21 @@
 
 package org.apache.spark.deploy.master.ui
 
-import akka.util.Duration
-
 import javax.servlet.http.HttpServletRequest
 
 import org.eclipse.jetty.server.{Handler, Server}
 
-import org.apache.spark.{Logging}
+import org.apache.spark.Logging
 import org.apache.spark.deploy.master.Master
 import org.apache.spark.ui.JettyUtils
 import org.apache.spark.ui.JettyUtils._
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{AkkaUtils, Utils}
 
 /**
  * Web UI server for the standalone master.
  */
 private[spark]
 class MasterWebUI(val master: Master, requestedPort: Int) extends Logging {
-  implicit val timeout = Duration.create(
-    System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds")
+  val timeout = AkkaUtils.askTimeout(master.conf)
   val host = Utils.localHostName()
   val port = requestedPort
@ -17,23 +17,22 @@
|
|||
|
||||
package org.apache.spark.deploy.worker
|
||||
|
||||
import java.io.File
|
||||
import java.text.SimpleDateFormat
|
||||
import java.util.Date
|
||||
import java.io.File
|
||||
|
||||
import scala.collection.mutable.HashMap
|
||||
import scala.concurrent.duration._
|
||||
|
||||
import akka.actor._
|
||||
import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientShutdown, RemoteClientDisconnected}
|
||||
import akka.util.duration._
|
||||
|
||||
import org.apache.spark.Logging
|
||||
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}
|
||||
import org.apache.spark.{Logging, SparkConf, SparkException}
|
||||
import org.apache.spark.deploy.{ExecutorDescription, ExecutorState}
|
||||
import org.apache.spark.deploy.DeployMessages._
|
||||
import org.apache.spark.deploy.master.Master
|
||||
import org.apache.spark.deploy.worker.ui.WorkerWebUI
|
||||
import org.apache.spark.metrics.MetricsSystem
|
||||
import org.apache.spark.util.{Utils, AkkaUtils}
|
||||
import org.apache.spark.util.{AkkaUtils, Utils}
|
||||
|
||||
/**
|
||||
* @param masterUrls Each url should look like spark://host:port.
|
||||
|
@ -45,8 +44,10 @@ private[spark] class Worker(
|
|||
cores: Int,
|
||||
memory: Int,
|
||||
masterUrls: Array[String],
|
||||
workDirPath: String = null)
|
||||
workDirPath: String = null,
|
||||
val conf: SparkConf)
|
||||
extends Actor with Logging {
|
||||
import context.dispatcher
|
||||
|
||||
Utils.checkHost(host, "Expected hostname")
|
||||
assert (port > 0)
|
||||
|
@ -54,7 +55,7 @@ private[spark] class Worker(
|
|||
val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For worker and executor IDs
|
||||
|
||||
// Send a heartbeat every (heartbeat timeout) / 4 milliseconds
|
||||
val HEARTBEAT_MILLIS = System.getProperty("spark.worker.timeout", "60").toLong * 1000 / 4
|
||||
val HEARTBEAT_MILLIS = conf.get("spark.worker.timeout", "60").toLong * 1000 / 4
|
||||
|
||||
val REGISTRATION_TIMEOUT = 20.seconds
|
||||
val REGISTRATION_RETRIES = 3
|
||||
|
@ -63,7 +64,8 @@ private[spark] class Worker(
|
|||
var masterIndex = 0
|
||||
|
||||
val masterLock: Object = new Object()
|
||||
var master: ActorRef = null
|
||||
var master: ActorSelection = null
|
||||
var masterAddress: Address = null
|
||||
var activeMasterUrl: String = ""
|
||||
var activeMasterWebUiUrl : String = ""
|
||||
@volatile var registered = false
|
||||
|
@ -82,7 +84,7 @@ private[spark] class Worker(
|
|||
var coresUsed = 0
|
||||
var memoryUsed = 0
|
||||
|
||||
val metricsSystem = MetricsSystem.createMetricsSystem("worker")
|
||||
val metricsSystem = MetricsSystem.createMetricsSystem("worker", conf)
|
||||
val workerSource = new WorkerSource(this)
|
||||
|
||||
def coresFree: Int = cores - coresUsed
|
||||
|
@ -114,7 +116,7 @@ private[spark] class Worker(
|
|||
logInfo("Spark home: " + sparkHome)
|
||||
createWorkDir()
|
||||
webUi = new WorkerWebUI(this, workDir, Some(webUiPort))
|
||||
|
||||
context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
|
||||
webUi.start()
|
||||
registerWithMaster()
|
||||
|
||||
|
@ -126,9 +128,13 @@ private[spark] class Worker(
|
|||
masterLock.synchronized {
|
||||
activeMasterUrl = url
|
||||
activeMasterWebUiUrl = uiUrl
|
||||
master = context.actorFor(Master.toAkkaUrl(activeMasterUrl))
|
||||
context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
|
||||
context.watch(master) // Doesn't work with remote actors, but useful for testing
|
||||
master = context.actorSelection(Master.toAkkaUrl(activeMasterUrl))
|
||||
masterAddress = activeMasterUrl match {
|
||||
case Master.sparkUrlRegex(_host, _port) =>
|
||||
Address("akka.tcp", Master.systemName, _host, _port.toInt)
|
||||
case x =>
|
||||
throw new SparkException("Invalid spark URL: " + x)
|
||||
}
|
||||
connected = true
|
||||
}
|
||||
}
|
||||
|
@ -136,7 +142,7 @@ private[spark] class Worker(
|
|||
def tryRegisterAllMasters() {
|
||||
for (masterUrl <- masterUrls) {
|
||||
logInfo("Connecting to master " + masterUrl + "...")
|
||||
val actor = context.actorFor(Master.toAkkaUrl(masterUrl))
|
||||
val actor = context.actorSelection(Master.toAkkaUrl(masterUrl))
|
||||
actor ! RegisterWorker(workerId, host, port, cores, memory, webUi.boundPort.get,
|
||||
publicAddress)
|
||||
}
|
||||
|
@ -175,7 +181,6 @@ private[spark] class Worker(
|
|||
|
||||
case MasterChanged(masterUrl, masterWebUiUrl) =>
|
||||
logInfo("Master has changed, new master is at " + masterUrl)
|
||||
context.unwatch(master)
|
||||
changeMaster(masterUrl, masterWebUiUrl)
|
||||
|
||||
val execs = executors.values.
|
||||
|
@ -234,13 +239,8 @@ private[spark] class Worker(
|
|||
}
|
||||
}
|
||||
|
||||
case Terminated(actor_) if actor_ == master =>
|
||||
masterDisconnected()
|
||||
|
||||
case RemoteClientDisconnected(transport, address) if address == master.path.address =>
|
||||
masterDisconnected()
|
||||
|
||||
case RemoteClientShutdown(transport, address) if address == master.path.address =>
|
||||
case x: DisassociatedEvent if x.remoteAddress == masterAddress =>
|
||||
logInfo(s"$x Disassociated !")
|
||||
masterDisconnected()
|
||||
|
||||
case RequestWorkerState => {
|
||||
|
@ -267,6 +267,7 @@ private[spark] class Worker(
|
|||
}
|
||||
|
||||
private[spark] object Worker {
|
||||
|
||||
def main(argStrings: Array[String]) {
|
||||
val args = new WorkerArguments(argStrings)
|
||||
val (actorSystem, _) = startSystemAndActor(args.host, args.port, args.webUiPort, args.cores,
|
||||
|
@ -275,13 +276,16 @@ private[spark] object Worker {
|
|||
}
|
||||
|
||||
def startSystemAndActor(host: String, port: Int, webUiPort: Int, cores: Int, memory: Int,
|
||||
masterUrls: Array[String], workDir: String, workerNumber: Option[Int] = None)
|
||||
: (ActorSystem, Int) = {
|
||||
masterUrls: Array[String], workDir: String, workerNumber: Option[Int] = None)
|
||||
: (ActorSystem, Int) =
|
||||
{
|
||||
// The LocalSparkCluster runs multiple local sparkWorkerX actor systems
|
||||
val conf = new SparkConf
|
||||
val systemName = "sparkWorker" + workerNumber.map(_.toString).getOrElse("")
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port)
|
||||
val actor = actorSystem.actorOf(Props(new Worker(host, boundPort, webUiPort, cores, memory,
|
||||
masterUrls, workDir)), name = "Worker")
|
||||
val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port,
|
||||
conf = conf)
|
||||
actorSystem.actorOf(Props(classOf[Worker], host, boundPort, webUiPort, cores, memory,
|
||||
masterUrls, workDir, conf), name = "Worker")
|
||||
(actorSystem, boundPort)
|
||||
}
|
||||
|
||||
|
|
|
@@ -21,9 +21,10 @@ import javax.servlet.http.HttpServletRequest
 
 import scala.xml.Node
 
-import akka.dispatch.Await
+import scala.concurrent.duration._
+import scala.concurrent.Await
 
 import akka.pattern.ask
-import akka.util.duration._
 
 import net.liftweb.json.JsonAST.JValue
 

@@ -41,13 +42,13 @@ private[spark] class IndexPage(parent: WorkerWebUI) {
 
   def renderJson(request: HttpServletRequest): JValue = {
     val stateFuture = (workerActor ? RequestWorkerState)(timeout).mapTo[WorkerStateResponse]
-    val workerState = Await.result(stateFuture, 30 seconds)
+    val workerState = Await.result(stateFuture, timeout)
     JsonProtocol.writeWorkerState(workerState)
   }
 
   def render(request: HttpServletRequest): Seq[Node] = {
     val stateFuture = (workerActor ? RequestWorkerState)(timeout).mapTo[WorkerStateResponse]
-    val workerState = Await.result(stateFuture, 30 seconds)
+    val workerState = Await.result(stateFuture, timeout)
 
     val executorHeaders = Seq("ExecutorID", "Cores", "Memory", "Job Details", "Logs")
     val runningExecutorTable =
@@ -17,20 +17,16 @@
 
 package org.apache.spark.deploy.worker.ui
 
-import akka.util.{Duration, Timeout}
-
-import java.io.{FileInputStream, File}
+import java.io.File
 
 import javax.servlet.http.HttpServletRequest
 
 import org.eclipse.jetty.server.{Handler, Server}
 
+import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.deploy.worker.Worker
-import org.apache.spark.{Logging}
-import org.apache.spark.ui.JettyUtils
+import org.apache.spark.ui.{JettyUtils, UIUtils}
 import org.apache.spark.ui.JettyUtils._
-import org.apache.spark.ui.UIUtils
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{AkkaUtils, Utils}
 
 /**
  * Web UI server for the standalone worker.

@@ -38,11 +34,10 @@ import org.apache.spark.util.Utils
 private[spark]
 class WorkerWebUI(val worker: Worker, val workDir: File, requestedPort: Option[Int] = None)
   extends Logging {
-  implicit val timeout = Timeout(
-    Duration.create(System.getProperty("spark.akka.askTimeout", "10").toLong, "seconds"))
+  val timeout = AkkaUtils.askTimeout(worker.conf)
   val host = Utils.localHostName()
   val port = requestedPort.getOrElse(
-    System.getProperty("worker.ui.port", WorkerWebUI.DEFAULT_PORT).toInt)
+    worker.conf.get("worker.ui.port", WorkerWebUI.DEFAULT_PORT).toInt)
 
   var server: Option[Server] = None
   var boundPort: Option[Int] = None

@@ -145,12 +140,12 @@ class WorkerWebUI(val worker: Worker, val workDir: File, requestedPort: Option[I
       <body>
         {linkToMaster}
         <div>
-          <div style="float:left;width:40%">{backButton}</div>
+          <div style="float:left; margin-right:10px">{backButton}</div>
           <div style="float:left;">{range}</div>
-          <div style="float:right;">{nextButton}</div>
+          <div style="float:right; margin-left:10px">{nextButton}</div>
         </div>
         <br />
-        <div style="height:500px;overflow:auto;padding:5px;">
+        <div style="height:500px; overflow:auto; padding:5px;">
           <pre>{logText}</pre>
         </div>
       </body>
@@ -19,15 +19,14 @@ package org.apache.spark.executor
 
 import java.nio.ByteBuffer
 
-import akka.actor.{ActorRef, Actor, Props, Terminated}
-import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientShutdown, RemoteClientDisconnected}
+import akka.actor._
+import akka.remote._
 
-import org.apache.spark.Logging
+import org.apache.spark.{SparkConf, SparkContext, Logging}
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
 import org.apache.spark.util.{Utils, AkkaUtils}
 
 
 private[spark] class CoarseGrainedExecutorBackend(
     driverUrl: String,
     executorId: String,

@@ -40,14 +39,13 @@ private[spark] class CoarseGrainedExecutorBackend(
   Utils.checkHostPort(hostPort, "Expected hostport")
 
   var executor: Executor = null
-  var driver: ActorRef = null
+  var driver: ActorSelection = null
 
   override def preStart() {
     logInfo("Connecting to driver: " + driverUrl)
-    driver = context.actorFor(driverUrl)
+    driver = context.actorSelection(driverUrl)
     driver ! RegisterExecutor(executorId, hostPort, cores)
-    context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent])
-    context.watch(driver) // Doesn't work with remote actors, but useful for testing
+    context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
   }
 
   override def receive = {

@@ -77,8 +75,8 @@ private[spark] class CoarseGrainedExecutorBackend(
       executor.killTask(taskId)
     }
 
-    case Terminated(_) | RemoteClientDisconnected(_, _) | RemoteClientShutdown(_, _) =>
-      logError("Driver terminated or disconnected! Shutting down.")
+    case x: DisassociatedEvent =>
+      logError(s"Driver $x disassociated! Shutting down.")
       System.exit(1)
 
     case StopExecutor =>

@@ -99,12 +97,13 @@ private[spark] object CoarseGrainedExecutorBackend {
 
     // Create a new ActorSystem to run the backend, because we can't create a SparkEnv / Executor
     // before getting started with all our system properties, etc
-    val (actorSystem, boundPort) = AkkaUtils.createActorSystem("sparkExecutor", hostname, 0)
+    val (actorSystem, boundPort) = AkkaUtils.createActorSystem("sparkExecutor", hostname, 0,
+      indestructible = true, conf = new SparkConf)
+    // set it
     val sparkHostPort = hostname + ":" + boundPort
-    System.setProperty("spark.hostPort", sparkHostPort)
-    val actor = actorSystem.actorOf(
-      Props(new CoarseGrainedExecutorBackend(driverUrl, executorId, sparkHostPort, cores)),
+    // conf.set("spark.hostPort", sparkHostPort)
+    actorSystem.actorOf(
+      Props(classOf[CoarseGrainedExecutorBackend], driverUrl, executorId, sparkHostPort, cores),
       name = "Executor")
     actorSystem.awaitTermination()
   }
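Note: the Worker and executor-backend changes above all follow the same Akka migration -- remote peers are looked up with actorSelection instead of actorFor, and lost connections show up as DisassociatedEvent lifecycle events instead of the old RemoteClient* messages. A minimal, hypothetical sketch of that pattern (the actor class and URL parameter are illustrative, not part of this commit):

import akka.actor.{Actor, ActorSelection}
import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent}

// Hypothetical watcher actor illustrating the Akka 2.2-style remoting used in the diff above.
class RemotePeerWatcher(peerUrl: String) extends Actor {
  // actorSelection replaces the ActorRef that context.actorFor used to return
  private var peer: ActorSelection = _

  override def preStart() {
    peer = context.actorSelection(peerUrl)
    // Remoting lifecycle events replace the RemoteClientLifeCycleEvent subscription
    context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
    peer ! "hello"
  }

  def receive = {
    case e: DisassociatedEvent =>
      // plays the role of the old RemoteClientDisconnected / RemoteClientShutdown cases
      context.stop(self)
    case other =>
      // application-level messages from the peer would be handled here
  }
}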
@ -48,8 +48,6 @@ private[spark] class Executor(
|
|||
|
||||
private val EMPTY_BYTE_BUFFER = ByteBuffer.wrap(new Array[Byte](0))
|
||||
|
||||
initLogging()
|
||||
|
||||
// No ip or host:port - just hostname
|
||||
Utils.checkHost(slaveHostname, "Expected executed slave to be a hostname")
|
||||
// must not have port specified.
|
||||
|
@ -58,16 +56,17 @@ private[spark] class Executor(
|
|||
// Make sure the local hostname we report matches the cluster scheduler's name for this host
|
||||
Utils.setCustomHostname(slaveHostname)
|
||||
|
||||
// Set spark.* system properties from executor arg
|
||||
for ((key, value) <- properties) {
|
||||
System.setProperty(key, value)
|
||||
}
|
||||
// Set spark.* properties from executor arg
|
||||
val conf = new SparkConf(false)
|
||||
conf.setAll(properties)
|
||||
|
||||
// If we are in yarn mode, systems can have different disk layouts so we must set it
|
||||
// to what Yarn on this system said was available. This will be used later when SparkEnv
|
||||
// created.
|
||||
if (java.lang.Boolean.valueOf(System.getenv("SPARK_YARN_MODE"))) {
|
||||
System.setProperty("spark.local.dir", getYarnLocalDirs())
|
||||
if (java.lang.Boolean.valueOf(
|
||||
System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))))
|
||||
{
|
||||
conf.set("spark.local.dir", getYarnLocalDirs())
|
||||
}
|
||||
|
||||
// Create our ClassLoader and set it on this thread
|
||||
|
@ -108,7 +107,7 @@ private[spark] class Executor(
|
|||
// Initialize Spark environment (using system properties read above)
|
||||
private val env = {
|
||||
if (!isLocal) {
|
||||
val _env = SparkEnv.createFromSystemProperties(executorId, slaveHostname, 0,
|
||||
val _env = SparkEnv.create(conf, executorId, slaveHostname, 0,
|
||||
isDriver = false, isLocal = false)
|
||||
SparkEnv.set(_env)
|
||||
_env.metricsSystem.registerSource(executorSource)
|
||||
|
@ -121,7 +120,7 @@ private[spark] class Executor(
|
|||
// Akka's message frame size. If task result is bigger than this, we use the block manager
|
||||
// to send the result back.
|
||||
private val akkaFrameSize = {
|
||||
env.actorSystem.settings.config.getBytes("akka.remote.netty.message-frame-size")
|
||||
env.actorSystem.settings.config.getBytes("akka.remote.netty.tcp.maximum-frame-size")
|
||||
}
|
||||
|
||||
// Start worker thread pool
|
||||
|
@ -142,11 +141,6 @@ private[spark] class Executor(
|
|||
val tr = runningTasks.get(taskId)
|
||||
if (tr != null) {
|
||||
tr.kill()
|
||||
// We remove the task also in the finally block in TaskRunner.run.
|
||||
// The reason we need to remove it here is because killTask might be called before the task
|
||||
// is even launched, and never reaching that finally block. ConcurrentHashMap's remove is
|
||||
// idempotent.
|
||||
runningTasks.remove(taskId)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -168,6 +162,8 @@ private[spark] class Executor(
|
|||
class TaskRunner(execBackend: ExecutorBackend, taskId: Long, serializedTask: ByteBuffer)
|
||||
extends Runnable {
|
||||
|
||||
object TaskKilledException extends Exception
|
||||
|
||||
@volatile private var killed = false
|
||||
@volatile private var task: Task[Any] = _
|
||||
|
||||
|
@ -201,9 +197,11 @@ private[spark] class Executor(
|
|||
// If this task has been killed before we deserialized it, let's quit now. Otherwise,
|
||||
// continue executing the task.
|
||||
if (killed) {
|
||||
logInfo("Executor killed task " + taskId)
|
||||
execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
|
||||
return
|
||||
// Throw an exception rather than returning, because returning within a try{} block
|
||||
// causes a NonLocalReturnControl exception to be thrown. The NonLocalReturnControl
|
||||
// exception will be caught by the catch block, leading to an incorrect ExceptionFailure
|
||||
// for the task.
|
||||
throw TaskKilledException
|
||||
}
|
||||
|
||||
attemptedTask = Some(task)
|
||||
|
@ -217,23 +215,25 @@ private[spark] class Executor(
|
|||
|
||||
// If the task has been killed, let's fail it.
|
||||
if (task.killed) {
|
||||
logInfo("Executor killed task " + taskId)
|
||||
execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
|
||||
return
|
||||
throw TaskKilledException
|
||||
}
|
||||
|
||||
val resultSer = SparkEnv.get.serializer.newInstance()
|
||||
val beforeSerialization = System.currentTimeMillis()
|
||||
val valueBytes = resultSer.serialize(value)
|
||||
val afterSerialization = System.currentTimeMillis()
|
||||
|
||||
for (m <- task.metrics) {
|
||||
m.hostname = Utils.localHostName()
|
||||
m.executorDeserializeTime = (taskStart - startTime).toInt
|
||||
m.executorRunTime = (taskFinish - taskStart).toInt
|
||||
m.jvmGCTime = gcTime - startGCTime
|
||||
m.resultSerializationTime = (afterSerialization - beforeSerialization).toInt
|
||||
}
|
||||
// TODO I'd also like to track the time it takes to serialize the task results, but that is
|
||||
// huge headache, b/c we need to serialize the task metrics first. If TaskMetrics had a
|
||||
// custom serialized format, we could just change the relevants bytes in the byte buffer
|
||||
|
||||
val accumUpdates = Accumulators.values
|
||||
|
||||
val directResult = new DirectTaskResult(value, accumUpdates, task.metrics.getOrElse(null))
|
||||
val directResult = new DirectTaskResult(valueBytes, accumUpdates, task.metrics.getOrElse(null))
|
||||
val serializedDirectResult = ser.serialize(directResult)
|
||||
logInfo("Serialized size of result for " + taskId + " is " + serializedDirectResult.limit)
|
||||
val serializedResult = {
|
||||
|
@ -257,6 +257,11 @@ private[spark] class Executor(
|
|||
execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))
|
||||
}
|
||||
|
||||
case TaskKilledException => {
|
||||
logInfo("Executor killed task " + taskId)
|
||||
execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
|
||||
}
|
||||
|
||||
case t: Throwable => {
|
||||
val serviceTime = (System.currentTimeMillis() - taskStart).toInt
|
||||
val metrics = attemptedTask.flatMap(t => t.metrics)
|
||||
|
@ -299,7 +304,7 @@ private[spark] class Executor(
|
|||
* new classes defined by the REPL as the user types code
|
||||
*/
|
||||
private def addReplClassLoaderIfNeeded(parent: ClassLoader): ClassLoader = {
|
||||
val classUri = System.getProperty("spark.repl.class.uri")
|
||||
val classUri = conf.get("spark.repl.class.uri", null)
|
||||
if (classUri != null) {
|
||||
logInfo("Using REPL class URI: " + classUri)
|
||||
try {
|
||||
|
@ -327,12 +332,12 @@ private[spark] class Executor(
|
|||
// Fetch missing dependencies
|
||||
for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
|
||||
logInfo("Fetching " + name + " with timestamp " + timestamp)
|
||||
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory))
|
||||
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf)
|
||||
currentFiles(name) = timestamp
|
||||
}
|
||||
for ((name, timestamp) <- newJars if currentJars.getOrElse(name, -1L) < timestamp) {
|
||||
logInfo("Fetching " + name + " with timestamp " + timestamp)
|
||||
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory))
|
||||
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf)
|
||||
currentJars(name) = timestamp
|
||||
// Add it to our class loader
|
||||
val localName = name.split("/").last
|
||||
|
|
|
@ -43,6 +43,11 @@ class TaskMetrics extends Serializable {
|
|||
*/
|
||||
var jvmGCTime: Long = _
|
||||
|
||||
/**
|
||||
* Amount of time spent serializing the task result
|
||||
*/
|
||||
var resultSerializationTime: Long = _
|
||||
|
||||
/**
|
||||
* If this task reads from shuffle output, metrics on getting shuffle data will be collected here
|
||||
*/
|
||||
|
@ -61,50 +66,53 @@ object TaskMetrics {
|
|||
|
||||
class ShuffleReadMetrics extends Serializable {
|
||||
/**
|
||||
* Time when shuffle finishs
|
||||
* Absolute time when this task finished reading shuffle data
|
||||
*/
|
||||
var shuffleFinishTime: Long = _
|
||||
|
||||
/**
|
||||
* Total number of blocks fetched in a shuffle (remote or local)
|
||||
* Number of blocks fetched in this shuffle by this task (remote or local)
|
||||
*/
|
||||
var totalBlocksFetched: Int = _
|
||||
|
||||
/**
|
||||
* Number of remote blocks fetched in a shuffle
|
||||
* Number of remote blocks fetched in this shuffle by this task
|
||||
*/
|
||||
var remoteBlocksFetched: Int = _
|
||||
|
||||
/**
|
||||
* Local blocks fetched in a shuffle
|
||||
* Number of local blocks fetched in this shuffle by this task
|
||||
*/
|
||||
var localBlocksFetched: Int = _
|
||||
|
||||
/**
|
||||
* Total time that is spent blocked waiting for shuffle to fetch data
|
||||
* Time the task spent waiting for remote shuffle blocks. This only includes the time
|
||||
* blocking on shuffle input data. For instance if block B is being fetched while the task is
|
||||
* still not finished processing block A, it is not considered to be blocking on block B.
|
||||
*/
|
||||
var fetchWaitTime: Long = _
|
||||
|
||||
/**
|
||||
* The total amount of time for all the shuffle fetches. This adds up time from overlapping
|
||||
* shuffles, so can be longer than task time
|
||||
* Total time spent fetching remote shuffle blocks. This aggregates the time spent fetching all
|
||||
* input blocks. Since block fetches are both pipelined and parallelized, this can
|
||||
* exceed fetchWaitTime and executorRunTime.
|
||||
*/
|
||||
var remoteFetchTime: Long = _
|
||||
|
||||
/**
|
||||
* Total number of remote bytes read from a shuffle
|
||||
* Total number of remote bytes read from the shuffle by this task
|
||||
*/
|
||||
var remoteBytesRead: Long = _
|
||||
}
|
||||
|
||||
class ShuffleWriteMetrics extends Serializable {
|
||||
/**
|
||||
* Number of bytes written for a shuffle
|
||||
* Number of bytes written for the shuffle by this task
|
||||
*/
|
||||
var shuffleBytesWritten: Long = _
|
||||
|
||||
/**
|
||||
* Time spent blocking on writes to disk or buffer cache, in nanoseconds.
|
||||
* Time the task spent blocking on writes to disk or buffer cache, in nanoseconds
|
||||
*/
|
||||
var shuffleWriteTime: Long = _
|
||||
}
|
||||
|
|
|
@@ -22,6 +22,7 @@ import java.io.{InputStream, OutputStream}
 import com.ning.compress.lzf.{LZFInputStream, LZFOutputStream}
 
 import org.xerial.snappy.{SnappyInputStream, SnappyOutputStream}
+import org.apache.spark.{SparkEnv, SparkConf}
 
 
 /**

@@ -37,15 +38,15 @@ trait CompressionCodec {
 
 
 private[spark] object CompressionCodec {
 
-  def createCodec(): CompressionCodec = {
-    createCodec(System.getProperty(
+  def createCodec(conf: SparkConf): CompressionCodec = {
+    createCodec(conf, conf.get(
       "spark.io.compression.codec", classOf[LZFCompressionCodec].getName))
   }
 
-  def createCodec(codecName: String): CompressionCodec = {
-    Class.forName(codecName, true, Thread.currentThread.getContextClassLoader)
-      .newInstance().asInstanceOf[CompressionCodec]
+  def createCodec(conf: SparkConf, codecName: String): CompressionCodec = {
+    val ctor = Class.forName(codecName, true, Thread.currentThread.getContextClassLoader)
+      .getConstructor(classOf[SparkConf])
+    ctor.newInstance(conf).asInstanceOf[CompressionCodec]
   }
 }
 

@@ -53,7 +54,7 @@ private[spark] object CompressionCodec {
 /**
  * LZF implementation of [[org.apache.spark.io.CompressionCodec]].
  */
-class LZFCompressionCodec extends CompressionCodec {
+class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec {
 
   override def compressedOutputStream(s: OutputStream): OutputStream = {
     new LZFOutputStream(s).setFinishBlockOnFlush(true)

@@ -67,10 +68,10 @@ class LZFCompressionCodec extends CompressionCodec {
  * Snappy implementation of [[org.apache.spark.io.CompressionCodec]].
  * Block size can be configured by spark.io.compression.snappy.block.size.
  */
-class SnappyCompressionCodec extends CompressionCodec {
+class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec {
 
   override def compressedOutputStream(s: OutputStream): OutputStream = {
-    val blockSize = System.getProperty("spark.io.compression.snappy.block.size", "32768").toInt
+    val blockSize = conf.get("spark.io.compression.snappy.block.size", "32768").toInt
     new SnappyOutputStream(s, blockSize)
   }
 
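Note: a hedged usage sketch of the reworked codec factory (not part of the diff; it assumes the post-change API above, and lives in the org.apache.spark.io package because CompressionCodec is private[spark]):

package org.apache.spark.io

import org.apache.spark.SparkConf

object CodecExample {
  def main(args: Array[String]) {
    // Codec selection and per-codec settings now come from a SparkConf
    // instead of JVM system properties.
    val conf = new SparkConf(false)
      .set("spark.io.compression.codec", classOf[SnappyCompressionCodec].getName)
      .set("spark.io.compression.snappy.block.size", "65536")

    // createCodec instantiates the configured class reflectively through its
    // single SparkConf constructor, as shown in the hunk above.
    val codec = CompressionCodec.createCodec(conf)
    println("Created codec: " + codec.getClass.getName)
  }
}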
@@ -26,7 +26,6 @@ import scala.util.matching.Regex
 import org.apache.spark.Logging
 
 private[spark] class MetricsConfig(val configFile: Option[String]) extends Logging {
-  initLogging()
 
   val DEFAULT_PREFIX = "*"
   val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r
@@ -24,7 +24,7 @@ import java.util.concurrent.TimeUnit
 
 import scala.collection.mutable
 
-import org.apache.spark.Logging
+import org.apache.spark.{SparkConf, Logging}
 import org.apache.spark.metrics.sink.{MetricsServlet, Sink}
 import org.apache.spark.metrics.source.Source
 

@@ -62,10 +62,10 @@ import org.apache.spark.metrics.source.Source
  *
  * [options] is the specific property of this source or sink.
  */
-private[spark] class MetricsSystem private (val instance: String) extends Logging {
-  initLogging()
+private[spark] class MetricsSystem private (val instance: String,
+    conf: SparkConf) extends Logging {
 
-  val confFile = System.getProperty("spark.metrics.conf")
+  val confFile = conf.get("spark.metrics.conf", null)
   val metricsConfig = new MetricsConfig(Option(confFile))
 
   val sinks = new mutable.ArrayBuffer[Sink]

@@ -159,5 +159,6 @@ private[spark] object MetricsSystem {
     }
   }
 
-  def createMetricsSystem(instance: String): MetricsSystem = new MetricsSystem(instance)
+  def createMetricsSystem(instance: String, conf: SparkConf): MetricsSystem =
+    new MetricsSystem(instance, conf)
 }
@ -31,13 +31,13 @@ import scala.collection.mutable.SynchronizedMap
|
|||
import scala.collection.mutable.SynchronizedQueue
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
import akka.dispatch.{Await, Promise, ExecutionContext, Future}
|
||||
import akka.util.Duration
|
||||
import akka.util.duration._
|
||||
import scala.concurrent.{Await, Promise, ExecutionContext, Future}
|
||||
import scala.concurrent.duration.Duration
|
||||
import scala.concurrent.duration._
|
||||
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
|
||||
private[spark] class ConnectionManager(port: Int) extends Logging {
|
||||
private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Logging {
|
||||
|
||||
class MessageStatus(
|
||||
val message: Message,
|
||||
|
@ -54,22 +54,22 @@ private[spark] class ConnectionManager(port: Int) extends Logging {
|
|||
private val selector = SelectorProvider.provider.openSelector()
|
||||
|
||||
private val handleMessageExecutor = new ThreadPoolExecutor(
|
||||
System.getProperty("spark.core.connection.handler.threads.min","20").toInt,
|
||||
System.getProperty("spark.core.connection.handler.threads.max","60").toInt,
|
||||
System.getProperty("spark.core.connection.handler.threads.keepalive","60").toInt, TimeUnit.SECONDS,
|
||||
conf.get("spark.core.connection.handler.threads.min", "20").toInt,
|
||||
conf.get("spark.core.connection.handler.threads.max", "60").toInt,
|
||||
conf.get("spark.core.connection.handler.threads.keepalive", "60").toInt, TimeUnit.SECONDS,
|
||||
new LinkedBlockingDeque[Runnable]())
|
||||
|
||||
private val handleReadWriteExecutor = new ThreadPoolExecutor(
|
||||
System.getProperty("spark.core.connection.io.threads.min","4").toInt,
|
||||
System.getProperty("spark.core.connection.io.threads.max","32").toInt,
|
||||
System.getProperty("spark.core.connection.io.threads.keepalive","60").toInt, TimeUnit.SECONDS,
|
||||
conf.get("spark.core.connection.io.threads.min", "4").toInt,
|
||||
conf.get("spark.core.connection.io.threads.max", "32").toInt,
|
||||
conf.get("spark.core.connection.io.threads.keepalive", "60").toInt, TimeUnit.SECONDS,
|
||||
new LinkedBlockingDeque[Runnable]())
|
||||
|
||||
// Use a different, yet smaller, thread pool - infrequently used with very short lived tasks : which should be executed asap
|
||||
private val handleConnectExecutor = new ThreadPoolExecutor(
|
||||
System.getProperty("spark.core.connection.connect.threads.min","1").toInt,
|
||||
System.getProperty("spark.core.connection.connect.threads.max","8").toInt,
|
||||
System.getProperty("spark.core.connection.connect.threads.keepalive","60").toInt, TimeUnit.SECONDS,
|
||||
conf.get("spark.core.connection.connect.threads.min", "1").toInt,
|
||||
conf.get("spark.core.connection.connect.threads.max", "8").toInt,
|
||||
conf.get("spark.core.connection.connect.threads.keepalive", "60").toInt, TimeUnit.SECONDS,
|
||||
new LinkedBlockingDeque[Runnable]())
|
||||
|
||||
private val serverChannel = ServerSocketChannel.open()
|
||||
|
@ -594,7 +594,7 @@ private[spark] class ConnectionManager(port: Int) extends Logging {
|
|||
private[spark] object ConnectionManager {
|
||||
|
||||
def main(args: Array[String]) {
|
||||
val manager = new ConnectionManager(9999)
|
||||
val manager = new ConnectionManager(9999, new SparkConf)
|
||||
manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => {
|
||||
println("Received [" + msg + "] from [" + id + "]")
|
||||
None
|
||||
|
|
|
@@ -25,8 +25,8 @@ import scala.io.Source
 import java.nio.ByteBuffer
 import java.net.InetAddress
 
-import akka.dispatch.Await
-import akka.util.duration._
+import scala.concurrent.Await
+import scala.concurrent.duration._
 
 private[spark] object ConnectionManagerTest extends Logging{
   def main(args: Array[String]) {
@@ -19,19 +19,19 @@ package org.apache.spark.network
 
 import java.nio.ByteBuffer
 import java.net.InetAddress
+import org.apache.spark.SparkConf
 
 private[spark] object ReceiverTest {
 
   def main(args: Array[String]) {
-    val manager = new ConnectionManager(9999)
+    val manager = new ConnectionManager(9999, new SparkConf)
     println("Started connection manager with id = " + manager.id)
 
-    manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => {
+    manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => {
       /*println("Received [" + msg + "] from [" + id + "] at " + System.currentTimeMillis)*/
-      val buffer = ByteBuffer.wrap("response".getBytes())
+      val buffer = ByteBuffer.wrap("response".getBytes)
       Some(Message.createBufferMessage(buffer, msg.id))
     })
-    Thread.currentThread.join()
+    Thread.currentThread.join()
   }
 }
@@ -19,29 +19,29 @@ package org.apache.spark.network
 
 import java.nio.ByteBuffer
 import java.net.InetAddress
+import org.apache.spark.SparkConf
 
 private[spark] object SenderTest {
 
   def main(args: Array[String]) {
-
+
     if (args.length < 2) {
       println("Usage: SenderTest <target host> <target port>")
       System.exit(1)
     }
-
+
     val targetHost = args(0)
     val targetPort = args(1).toInt
     val targetConnectionManagerId = new ConnectionManagerId(targetHost, targetPort)
 
-    val manager = new ConnectionManager(0)
+    val manager = new ConnectionManager(0, new SparkConf)
     println("Started connection manager with id = " + manager.id)
 
-    manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => {
+    manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => {
       println("Received [" + msg + "] from [" + id + "]")
       None
     })
 
-    val size = 100 * 1024 * 1024
+    val size = 100 * 1024 * 1024
     val buffer = ByteBuffer.allocate(size).put(Array.tabulate[Byte](size)(x => x.toByte))
     buffer.flip
 

@@ -50,7 +50,7 @@ private[spark] object SenderTest {
     val count = 100
     (0 until count).foreach(i => {
       val dataMessage = Message.createBufferMessage(buffer.duplicate)
-      val startTime = System.currentTimeMillis
+      val startTime = System.currentTimeMillis
       /*println("Started timer at " + startTime)*/
       val responseStr = manager.sendMessageReliablySync(targetConnectionManagerId, dataMessage) match {
         case Some(response) =>
@@ -23,20 +23,20 @@ import io.netty.buffer.ByteBuf
 import io.netty.channel.ChannelHandlerContext
 import io.netty.util.CharsetUtil
 
-import org.apache.spark.Logging
+import org.apache.spark.{SparkContext, SparkConf, Logging}
 import org.apache.spark.network.ConnectionManagerId
 
 import scala.collection.JavaConverters._
 import org.apache.spark.storage.BlockId
 
 
-private[spark] class ShuffleCopier extends Logging {
+private[spark] class ShuffleCopier(conf: SparkConf) extends Logging {
 
   def getBlock(host: String, port: Int, blockId: BlockId,
       resultCollectCallback: (BlockId, Long, ByteBuf) => Unit) {
 
     val handler = new ShuffleCopier.ShuffleClientHandler(resultCollectCallback)
-    val connectTimeout = System.getProperty("spark.shuffle.netty.connect.timeout", "60000").toInt
+    val connectTimeout = conf.get("spark.shuffle.netty.connect.timeout", "60000").toInt
     val fc = new FileClient(handler, connectTimeout)
 
     try {

@@ -104,10 +104,10 @@ private[spark] object ShuffleCopier extends Logging {
     val threads = if (args.length > 3) args(3).toInt else 10
 
     val copiers = Executors.newFixedThreadPool(80)
-    val tasks = (for (i <- Range(0, threads)) yield {
+    val tasks = (for (i <- Range(0, threads)) yield {
       Executors.callable(new Runnable() {
         def run() {
-          val copier = new ShuffleCopier()
+          val copier = new ShuffleCopier(new SparkConf)
           copier.getBlock(host, port, blockId, echoResultCollectCallBack)
         }
       })
@@ -21,6 +21,7 @@ import java.util.concurrent.atomic.AtomicLong
 
 import scala.collection.mutable.ArrayBuffer
+import scala.concurrent.ExecutionContext.Implicits.global
+import scala.reflect.ClassTag
 
 import org.apache.spark.{ComplexFutureAction, FutureAction, Logging}
 

@@ -28,7 +29,7 @@ import org.apache.spark.{ComplexFutureAction, FutureAction, Logging}
  * A set of asynchronous RDD actions available through an implicit conversion.
  * Import `org.apache.spark.SparkContext._` at the top of your program to use these functions.
  */
-class AsyncRDDActions[T: ClassManifest](self: RDD[T]) extends Serializable with Logging {
+class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Logging {
 
   /**
    * Returns a future for counting the number of elements in the RDD.

@@ -17,6 +17,8 @@
 
 package org.apache.spark.rdd
 
+import scala.reflect.ClassTag
+
 import org.apache.spark.{SparkContext, SparkEnv, Partition, TaskContext}
 import org.apache.spark.storage.{BlockId, BlockManager}
 

@@ -25,7 +27,7 @@ private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends P
 }
 
 private[spark]
-class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[BlockId])
+class BlockRDD[T: ClassTag](sc: SparkContext, @transient blockIds: Array[BlockId])
   extends RDD[T](sc, Nil) {
 
   @transient lazy val locations_ = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)

@@ -18,6 +18,7 @@
 package org.apache.spark.rdd
 
 import java.io.{ObjectOutputStream, IOException}
+import scala.reflect.ClassTag
 import org.apache.spark._
 
 

@@ -43,7 +44,7 @@ class CartesianPartition(
 }
 
 private[spark]
-class CartesianRDD[T: ClassManifest, U:ClassManifest](
+class CartesianRDD[T: ClassTag, U: ClassTag](
     sc: SparkContext,
     var rdd1 : RDD[T],
     var rdd2 : RDD[U])
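Note: the RDD signature changes above swap Scala 2.9's ClassManifest context bounds for scala.reflect.ClassTag. A small, hypothetical example of the same change applied to user code (Buffer is illustrative, not a Spark class):

import scala.reflect.ClassTag

// Under Scala 2.10, ClassTag supplies the runtime class evidence that
// ClassManifest used to provide, e.g. for building arrays of an erased type.
class Buffer[T: ClassTag](capacity: Int) {
  private val items = new Array[T](capacity)  // requires the ClassTag evidence
  def update(i: Int, value: T) { items(i) = value }
  def apply(i: Int): T = items(i)
}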
Some files were not shown because too many files have changed in this diff.