SPARK-1004. PySpark on YARN
This reopens https://github.com/apache/incubator-spark/pull/640 against the new repo.

Author: Sandy Ryza <sandy@cloudera.com>

Closes #30 from sryza/sandy-spark-1004 and squashes the following commits:

89889d4 [Sandy Ryza] Move unzipping py4j to the generate-resources phase so that it gets included in the jar the first time
5165a02 [Sandy Ryza] Fix docs
fd0df79 [Sandy Ryza] PySpark on YARN
commit ff5be9a41e (parent 7025dda8fa)
@@ -46,6 +46,7 @@ export PYSPARK_PYTHON
 # Add the PySpark classes to the Python path:
 export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
+export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH

 # Load the PySpark shell.py script when ./pyspark is used interactively:
 export OLD_PYTHONSTARTUP=$PYTHONSTARTUP
@@ -45,6 +45,7 @@ rem Figure out which Python to use.
 if "x%PYSPARK_PYTHON%"=="x" set PYSPARK_PYTHON=python

 set PYTHONPATH=%FWDIR%python;%PYTHONPATH%
+set PYTHONPATH=%FWDIR%python\lib\py4j-0.8.1-src.zip;%PYTHONPATH%

 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%FWDIR%python\pyspark\shell.py
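The two launcher hunks above only put the py4j source zip on PYTHONPATH; nothing unpacks it when the shell starts. That is enough because the interpreter can import pure-Python packages directly from a zip archive on sys.path. A minimal, self-contained illustration using a throwaway demo package rather than py4j itself:

import sys
import zipfile

# Build a tiny zipped package on the fly, then import from it, the same way
# py4j-0.8.1-src.zip is imported once it is on PYTHONPATH.
with zipfile.ZipFile("demo.zip", "w") as zf:
    zf.writestr("demopkg/__init__.py", "VERSION = '0.8.1'\n")

sys.path.insert(0, "demo.zip")
import demopkg

print(demopkg.VERSION)  # -> 0.8.1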
core/pom.xml (+42)
@@ -294,6 +294,48 @@
         </environmentVariables>
       </configuration>
     </plugin>
+    <!-- Unzip py4j so we can include its files in the jar -->
+    <plugin>
+      <groupId>org.codehaus.mojo</groupId>
+      <artifactId>exec-maven-plugin</artifactId>
+      <version>1.2.1</version>
+      <executions>
+        <execution>
+          <phase>generate-resources</phase>
+          <goals>
+            <goal>exec</goal>
+          </goals>
+        </execution>
+      </executions>
+      <configuration>
+        <executable>unzip</executable>
+        <workingDirectory>../python</workingDirectory>
+        <arguments>
+          <argument>-o</argument>
+          <argument>lib/py4j*.zip</argument>
+          <argument>-d</argument>
+          <argument>build</argument>
+        </arguments>
+      </configuration>
+    </plugin>
   </plugins>
+
+  <resources>
+    <resource>
+      <directory>src/main/resources</directory>
+    </resource>
+    <resource>
+      <directory>../python</directory>
+      <includes>
+        <include>pyspark/*.py</include>
+      </includes>
+    </resource>
+    <resource>
+      <directory>../python/build</directory>
+      <includes>
+        <include>py4j/*.py</include>
+      </includes>
+    </resource>
+  </resources>
 </build>
</project>
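The exec-maven-plugin execution above amounts to running `unzip -o lib/py4j*.zip -d build` from the python/ directory during generate-resources, so the py4j/*.py files exist under ../python/build before the <resources> section copies them (together with pyspark/*.py) into the core jar. A rough Python equivalent of that extraction step, with the same relative paths:

import glob
import zipfile

# Mirror `unzip -o lib/py4j*.zip -d build` as run from the python/ directory:
# extract every py4j source zip into build/ so the packaging step can pick up
# py4j/*.py as ordinary jar resources.
for archive in glob.glob("lib/py4j*.zip"):
    with zipfile.ZipFile(archive) as zf:
        zf.extractall("build")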
@@ -78,12 +78,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
       serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1)))

       // Create and start the worker
-      val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME")
-      val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/worker.py"))
+      val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.worker"))
       val workerEnv = pb.environment()
       workerEnv.putAll(envVars)
-      val pythonPath = sparkHome + "/python/" + File.pathSeparator + workerEnv.get("PYTHONPATH")
-      workerEnv.put("PYTHONPATH", pythonPath)
       val worker = pb.start()

       // Redirect the worker's stderr to ours
@@ -154,12 +151,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String

     try {
       // Create and start the daemon
-      val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME")
-      val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/daemon.py"))
+      val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.daemon"))
       val workerEnv = pb.environment()
       workerEnv.putAll(envVars)
-      val pythonPath = sparkHome + "/python/" + File.pathSeparator + workerEnv.get("PYTHONPATH")
-      workerEnv.put("PYTHONPATH", pythonPath)
       daemon = pb.start()

       // Redirect the stderr to ours
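Both hunks replace an absolute $SPARK_HOME/python/pyspark/worker.py (or daemon.py) path with a `python -m` invocation, so the worker and daemon are located through whatever PYTHONPATH arrives in envVars rather than through SPARK_HOME on the executor. A minimal sketch of that launch pattern; the path below is hypothetical, and the real worker additionally speaks Spark's socket protocol once started:

import os
import subprocess
import sys

env = dict(os.environ)
# Hypothetical location of the pyspark package on the executor; in the code
# above this effectively comes from the envVars the JVM passes in.
env["PYTHONPATH"] = "/opt/spark/python" + os.pathsep + env.get("PYTHONPATH", "")
# "-m" makes the interpreter resolve pyspark.worker via PYTHONPATH instead of
# requiring a hard-coded $SPARK_HOME/python/pyspark/worker.py path.
subprocess.Popen([sys.executable, "-m", "pyspark.worker"], env=env)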
@@ -63,6 +63,9 @@ All of PySpark's library dependencies, including [Py4J](http://py4j.sourceforge.
 Standalone PySpark applications should be run using the `bin/pyspark` script, which automatically configures the Java and Python environment using the settings in `conf/spark-env.sh` or `.cmd`.
 The script automatically adds the `bin/pyspark` package to the `PYTHONPATH`.

+# Running PySpark on YARN
+
+To run PySpark against a YARN cluster, simply set the MASTER environment variable to "yarn-client".

 # Interactive Use
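The new doc section selects YARN client mode through the MASTER environment variable before launching `bin/pyspark`. For a standalone script, a roughly equivalent way to ask for the same master programmatically is sketched below; this is only illustrative, and the route documented in this commit remains setting the env var and going through `bin/pyspark`:

from pyspark import SparkConf, SparkContext

# Illustrative equivalent of MASTER=yarn-client for a standalone script.
conf = SparkConf().setMaster("yarn-client").setAppName("pyspark-on-yarn-example")
sc = SparkContext(conf=conf)
print(sc.parallelize(range(10)).sum())  # 45
sc.stop()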
python/.gitignore (vendored, +3)
@@ -1,2 +1,5 @@
 *.pyc
 docs/
+pyspark.egg-info
+build/
+dist/
@@ -1 +0,0 @@
-b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
@@ -49,13 +49,6 @@ Hive:
       Main entry point for accessing data stored in Apache Hive..
 """

-
-
-import sys
-import os
-sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "python/lib/py4j-0.8.1-src.zip"))
-
-
 from pyspark.conf import SparkConf
 from pyspark.context import SparkContext
 from pyspark.sql import SQLContext
@@ -24,10 +24,11 @@ from threading import Thread
 from py4j.java_gateway import java_import, JavaGateway, GatewayClient


-SPARK_HOME = os.environ["SPARK_HOME"]
-
-
 def launch_gateway():
+    SPARK_HOME = os.environ["SPARK_HOME"]
+
+    set_env_vars_for_yarn()
+
     # Launch the Py4j gateway using Spark's run command so that we pick up the
     # proper classpath and settings from spark-env.sh
     on_windows = platform.system() == "Windows"
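A side effect of this hunk is that os.environ["SPARK_HOME"] is no longer read at import time but only when launch_gateway() runs (and tests.py below stops importing SPARK_HOME from this module). A tiny sketch of that pattern, with an illustrative function name; the motivation stated here is a reading of the change, not something spelled out in the commit:

import os

# Deferred lookup: importing this module never touches SPARK_HOME; only the
# call does, so the module stays importable in environments where SPARK_HOME
# may not be set when Python starts.
def launch_gateway_sketch():
    spark_home = os.environ["SPARK_HOME"]
    return spark_home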
@@ -70,3 +71,27 @@ def launch_gateway():
     java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext")
     java_import(gateway.jvm, "scala.Tuple2")
     return gateway
+
+def set_env_vars_for_yarn():
+    # Add the spark jar, which includes the pyspark files, to the python path
+    env_map = parse_env(os.environ.get("SPARK_YARN_USER_ENV", ""))
+    if "PYTHONPATH" in env_map:
+        env_map["PYTHONPATH"] += ":spark.jar"
+    else:
+        env_map["PYTHONPATH"] = "spark.jar"
+
+    os.environ["SPARK_YARN_USER_ENV"] = ",".join(k + '=' + v for (k, v) in env_map.items())
+
+def parse_env(env_str):
+    # Turns a comma-separated list of env settings into a dict that maps env
+    # vars to their values.
+    env = {}
+    for var_str in env_str.split(","):
+        parts = var_str.split("=")
+        if len(parts) == 2:
+            env[parts[0]] = parts[1]
+        elif len(var_str) > 0:
+            print "Invalid entry in SPARK_YARN_USER_ENV: " + var_str
+            sys.exit(1)
+
+    return env
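set_env_vars_for_yarn() treats SPARK_YARN_USER_ENV as a comma-separated list of VAR=value pairs and appends spark.jar, which per the comment above now bundles the pyspark files, to the user's PYTHONPATH entry, creating one if absent. A small standalone walk-through of that round trip, with made-up values:

# What the helpers above do to SPARK_YARN_USER_ENV, using made-up values.
before = "PYTHONPATH=/opt/libs,SOME_VAR=1"

env_map = {}
for entry in before.split(","):          # what parse_env() does
    name, value = entry.split("=")
    env_map[name] = value

env_map["PYTHONPATH"] += ":spark.jar"    # what set_env_vars_for_yarn() does

after = ",".join(k + "=" + v for k, v in env_map.items())
print(after)  # -> PYTHONPATH=/opt/libs:spark.jar,SOME_VAR=1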
@@ -30,10 +30,12 @@ import unittest

 from pyspark.context import SparkContext
 from pyspark.files import SparkFiles
-from pyspark.java_gateway import SPARK_HOME
 from pyspark.serializers import read_int


+SPARK_HOME = os.environ["SPARK_HOME"]
+
+
 class PySparkTestCase(unittest.TestCase):

     def setUp(self):
@@ -34,3 +34,6 @@ this="$config_bin/$script"
 export SPARK_PREFIX=`dirname "$this"`/..
 export SPARK_HOME=${SPARK_PREFIX}
 export SPARK_CONF_DIR="$SPARK_HOME/conf"
+# Add the PySpark classes to the PYTHONPATH:
+export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
+export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH