Simplify PySpark installation.

- Bundle Py4J binaries, since it's hard to install
- Use Spark's `run` script to launch the Py4J
  gateway, inheriting the settings in spark-env.sh

With these changes, (hopefully) nothing more than
running `sbt/sbt package` will be necessary to run
PySpark.
Josh Rosen 2012-12-27 22:47:37 -08:00
parent ac32447cd3
commit 665466dfff
13 changed files with 78 additions and 47 deletions

View file

@@ -32,30 +32,11 @@ The `pyspark/pyspark/examples` directory contains a few complete
examples.
## Installing PySpark
PySpark requires a development version of Py4J, a Python library for
interacting with Java processes. It can be installed from
https://github.com/bartdag/py4j; make sure to install a version that
contains at least the commits through b7924aabe9.
PySpark requires the `argparse` module, which is included in Python 2.7
and is available for Python 2.6 through `pip` or `easy_install`.
PySpark uses the `PYTHONPATH` environment variable to search for Python
classes; Py4J should be on this path, along with any libraries used by
PySpark programs. `PYTHONPATH` will be automatically shipped to worker
machines, but the files that it points to must be present on each
machine.
PySpark requires the Spark assembly JAR, which can be created by running
`sbt/sbt assembly` in the Spark directory.
Additionally, `SPARK_HOME` should be set to the location of the Spark
To use PySpark, `SPARK_HOME` should be set to the location of the Spark
package.
## Running PySpark
The easiest way to run PySpark is to use the `run-pyspark` and
`pyspark-shell` scripts, which are included in the `pyspark` directory.
These scripts automatically load the `conf/spark-env.sh` file, set
`SPARK_HOME`, and add the `pyspark` package to the `PYTHONPATH`.
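
For illustration only (not part of this commit): once `sbt/sbt package` has been run and `SPARK_HOME` is set, a minimal PySpark program launched through `run-pyspark` might look like the sketch below. The `SparkContext("local", jobName)` constructor and the RDD methods are assumed from `pyspark/pyspark/context.py` and `rdd.py` as they stand at this commit; check those files for the exact signatures.

from pyspark.context import SparkContext

# Connects to the JVM through the Py4J gateway that launch_gateway() starts
# (see pyspark/pyspark/java_gateway.py below).
sc = SparkContext("local", "ReadmeExample")
doubled = sc.parallelize(range(10)).map(lambda x: x * 2).collect()
print doubled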

View file

@@ -0,0 +1,27 @@
Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
- The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

View file

@@ -0,0 +1 @@
b7924aabe9c5e63f0a4d8bbd17019534c7ec014e

BIN
pyspark/lib/py4j0.7.egg Normal file

Binary file not shown.

BIN
pyspark/lib/py4j0.7.jar Normal file

Binary file not shown.

View file

@@ -1,3 +1,3 @@
-#!/bin/sh
+#!/usr/bin/env bash
 FWDIR="`dirname $0`"
 exec $FWDIR/run-pyspark $FWDIR/pyspark/shell.py "$@"

View file

@@ -0,0 +1,3 @@
import sys
import os
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
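
These three lines (presumably `pyspark/pyspark/__init__.py`) are what let the bundled egg stand in for a separately installed Py4J. A quick sanity-check sketch, assuming `SPARK_HOME` points at a tree where `sbt/sbt package` has been run:

import os
import sys

# Mirror the path manipulation performed above at import time.
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
import py4j
# Should resolve from inside the bundled egg, not from a system-wide install.
print py4j.__file__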

View file

@@ -1,19 +1,36 @@
-import glob
 import os
-from py4j.java_gateway import java_import, JavaGateway
+from subprocess import Popen, PIPE
+from threading import Thread
+from py4j.java_gateway import java_import, JavaGateway, GatewayClient
 SPARK_HOME = os.environ["SPARK_HOME"]
-assembly_jar = glob.glob(os.path.join(SPARK_HOME, "core/target") + \
-    "/spark-core-assembly-*.jar")[0]
-# TODO: what if multiple assembly jars are found?
 def launch_gateway():
-    gateway = JavaGateway.launch_gateway(classpath=assembly_jar,
-                                         javaopts=["-Xmx256m"], die_on_exit=True)
+    # Launch the Py4j gateway using Spark's run command so that we pick up the
+    # proper classpath and SPARK_MEM settings from spark-env.sh
+    command = [os.path.join(SPARK_HOME, "run"), "py4j.GatewayServer",
+               "--die-on-broken-pipe", "0"]
+    proc = Popen(command, stdout=PIPE, stdin=PIPE)
+    # Determine which ephemeral port the server started on:
+    port = int(proc.stdout.readline())
+    # Create a thread to echo output from the GatewayServer, which is required
+    # for Java log output to show up:
+    class EchoOutputThread(Thread):
+        def __init__(self, stream):
+            Thread.__init__(self)
+            self.daemon = True
+            self.stream = stream
+        def run(self):
+            while True:
+                line = self.stream.readline()
+                print line,
+    EchoOutputThread(proc.stdout).start()
+    # Connect to the gateway
+    gateway = JavaGateway(GatewayClient(port=port))
     # Import the classes used by PySpark
     java_import(gateway.jvm, "spark.api.java.*")
     java_import(gateway.jvm, "spark.api.python.*")
     java_import(gateway.jvm, "scala.Tuple2")

View file

@@ -1,7 +1,7 @@
 """
 An interactive shell.
 """
-import argparse # argparse is available for Python < 2.7 through easy_install.
+import optparse # I prefer argparse, but it's not included with Python < 2.7
 import code
 import sys
@@ -21,10 +21,13 @@ def main(master='local', ipython=False):
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("master", help="Spark master host (default='local')",
-                        nargs='?', type=str, default="local")
-    parser.add_argument("-i", "--ipython", help="Run IPython shell",
-                        action="store_true")
-    args = parser.parse_args()
-    main(args.master, args.ipython)
+    usage = "usage: %prog [options] master"
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("-i", "--ipython", help="Run IPython shell",
+                      action="store_true")
+    (options, args) = parser.parse_args()
+    if len(sys.argv) > 1:
+        master = args[0]
+    else:
+        master = 'local'
+    main(master, options.ipython)
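
Because `optparse` (unlike `argparse`) has no notion of positional arguments, the master has to be pulled out of the leftover `args` list. The `len(sys.argv) > 1` guard is only approximate: running the shell with just `-i` leaves `args` empty even though `sys.argv` has two entries. A more defensive variant, shown here only as a sketch and not part of this commit, keys off `args` itself:

import optparse

parser = optparse.OptionParser(usage="usage: %prog [options] master")
parser.add_option("-i", "--ipython", help="Run IPython shell",
                  action="store_true")
(options, args) = parser.parse_args()
# Fall back to a local master only when no positional argument was given.
master = args[0] if args else "local"
print master, bool(options.ipython)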

View file

@@ -1,7 +0,0 @@
# The Python API relies on some new features from the Py4J development branch.
# pip can't install Py4J from git because the setup.py file for the Python
# package is not at the root of the git repository. It may be possible to
# install Py4J from git once https://github.com/pypa/pip/pull/526 is merged.
# git+git://github.com/bartdag/py4j.git@b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
argparse

View file

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 # Figure out where the Scala framework is installed
 FWDIR="$(cd `dirname $0`; cd ../; pwd)"

run
View file

@@ -40,6 +40,7 @@ CORE_DIR="$FWDIR/core"
 REPL_DIR="$FWDIR/repl"
 EXAMPLES_DIR="$FWDIR/examples"
 BAGEL_DIR="$FWDIR/bagel"
+PYSPARK_DIR="$FWDIR/pyspark"
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH"
@@ -61,6 +62,9 @@ for jar in `find $REPL_DIR/lib -name '*jar'`; do
   CLASSPATH+=":$jar"
 done
 CLASSPATH+=":$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"
+for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do
+  CLASSPATH+=":$jar"
+done
 export CLASSPATH # Needed for spark-shell
 # Figure out whether to run our class with java or with the scala launcher.

View file

@@ -34,6 +34,7 @@ set CORE_DIR=%FWDIR%core
 set REPL_DIR=%FWDIR%repl
 set EXAMPLES_DIR=%FWDIR%examples
 set BAGEL_DIR=%FWDIR%bagel
+set PYSPARK_DIR=%FWDIR%pyspark
 rem Build up classpath
 set CLASSPATH=%SPARK_CLASSPATH%;%MESOS_CLASSPATH%;%FWDIR%conf;%CORE_DIR%\target\scala-%SCALA_VERSION%\classes
@@ -42,6 +43,7 @@ set CLASSPATH=%CLASSPATH%;%REPL_DIR%\target\scala-%SCALA_VERSION%\classes;%EXAMP
 for /R "%FWDIR%\lib_managed\jars" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 for /R "%FWDIR%\lib_managed\bundles" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 for /R "%REPL_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
+for /R "%PYSPARK_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes
 rem Figure out whether to run our class with java or with the scala launcher.