[SPARK-13848][SPARK-5185] Update to Py4J 0.9.2 in order to fix classloading issue

This patch upgrades Py4J from 0.9.1 to 0.9.2 in order to include a patch which modifies Py4J to use the current thread's ContextClassLoader when performing reflection / class loading. This is necessary in order to fix [SPARK-5185](https://issues.apache.org/jira/browse/SPARK-5185), a longstanding issue affecting the use of `--jars` and `--packages` in PySpark.

In order to demonstrate that the fix works, I removed the workarounds which were added as part of [SPARK-6027](https://issues.apache.org/jira/browse/SPARK-6027) / #4779 and other patches.
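
For reference, the gist of the change on the Python side looks like this (a minimal sketch, assuming a live SparkContext `sc` and the relevant streaming assembly on the classpath):

    # Old workaround (removed in this patch): load the helper through the
    # thread's context classloader, because Py4J's reflection could not see
    # classes added via --jars / --packages.
    helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
        .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
    helper = helperClass.newInstance()

    # With Py4J 0.9.2, which resolves classes via the context classloader itself,
    # the helper can be constructed directly through the JVM view:
    helper = sc._jvm.org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper()

    # A missing assembly now surfaces as a TypeError ("'JavaPackage' object is
    # not callable") rather than a Py4JJavaError wrapping a ClassNotFoundException,
    # which is what the updated callers check for.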

Py4J diff: https://github.com/bartdag/py4j/compare/0.9.1...0.9.2

/cc zsxwing tdas davies brkyvz

Author: Josh Rosen <joshrosen@databricks.com>

Closes #11687 from JoshRosen/py4j-0.9.2.
Josh Rosen, 2016-03-14 12:22:02 -07:00
commit 07cb323e7a (parent 6a4bfcd62b)
21 changed files with 38 additions and 65 deletions


@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.1 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.2 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)


@@ -67,7 +67,7 @@ export PYSPARK_PYTHON
 # Add the PySpark classes to the Python path:
 export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
-export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.1-src.zip:$PYTHONPATH"
+export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.2-src.zip:$PYTHONPATH"
 # Load the PySpark shell.py script when ./pyspark is used interactively:
 export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"


@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
 )
 set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
-set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9.1-src.zip;%PYTHONPATH%
+set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9.2-src.zip;%PYTHONPATH%
 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py


@@ -314,7 +314,7 @@
     <dependency>
       <groupId>net.sf.py4j</groupId>
       <artifactId>py4j</artifactId>
-      <version>0.9.1</version>
+      <version>0.9.2</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>


@@ -32,7 +32,7 @@ private[spark] object PythonUtils {
     val pythonPath = new ArrayBuffer[String]
     for (sparkHome <- sys.env.get("SPARK_HOME")) {
       pythonPath += Seq(sparkHome, "python", "lib", "pyspark.zip").mkString(File.separator)
-      pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.9.1-src.zip").mkString(File.separator)
+      pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.9.2-src.zip").mkString(File.separator)
     }
     pythonPath ++= SparkContext.jarOfObject(this)
     pythonPath.mkString(File.pathSeparator)


@@ -153,7 +153,7 @@ pmml-agent-1.2.7.jar
 pmml-model-1.2.7.jar
 pmml-schema-1.2.7.jar
 protobuf-java-2.5.0.jar
-py4j-0.9.1.jar
+py4j-0.9.2.jar
 pyrolite-4.9.jar
 reflectasm-1.07-shaded.jar
 scala-compiler-2.11.7.jar


@@ -144,7 +144,7 @@ pmml-agent-1.2.7.jar
 pmml-model-1.2.7.jar
 pmml-schema-1.2.7.jar
 protobuf-java-2.5.0.jar
-py4j-0.9.1.jar
+py4j-0.9.2.jar
 pyrolite-4.9.jar
 reflectasm-1.07-shaded.jar
 scala-compiler-2.11.7.jar


@@ -145,7 +145,7 @@ pmml-agent-1.2.7.jar
 pmml-model-1.2.7.jar
 pmml-schema-1.2.7.jar
 protobuf-java-2.5.0.jar
-py4j-0.9.1.jar
+py4j-0.9.2.jar
 pyrolite-4.9.jar
 reflectasm-1.07-shaded.jar
 scala-compiler-2.11.7.jar


@@ -151,7 +151,7 @@ pmml-agent-1.2.7.jar
 pmml-model-1.2.7.jar
 pmml-schema-1.2.7.jar
 protobuf-java-2.5.0.jar
-py4j-0.9.1.jar
+py4j-0.9.2.jar
 pyrolite-4.9.jar
 reflectasm-1.07-shaded.jar
 scala-compiler-2.11.7.jar


@@ -152,7 +152,7 @@ pmml-agent-1.2.7.jar
 pmml-model-1.2.7.jar
 pmml-schema-1.2.7.jar
 protobuf-java-2.5.0.jar
-py4j-0.9.1.jar
+py4j-0.9.2.jar
 pyrolite-4.9.jar
 reflectasm-1.07-shaded.jar
 scala-compiler-2.11.7.jar


@@ -615,7 +615,7 @@ object KafkaUtils {
 /**
  * This is a helper class that wraps the KafkaUtils.createStream() into more
  * Python-friendly class and function so that it can be easily
- * instantiated and called from Python's KafkaUtils (see SPARK-6027).
+ * instantiated and called from Python's KafkaUtils.
  *
  * The zero-arg constructor helps instantiate this class from the Class object
  * classOf[KafkaUtilsPythonHelper].newInstance(), and the createStream()


@@ -7,7 +7,7 @@ SPHINXBUILD = sphinx-build
 PAPER =
 BUILDDIR = _build
-export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.9.1-src.zip)
+export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.9.2-src.zip)
 # User-friendly check for sphinx-build
 ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)


@@ -111,12 +111,9 @@ class FlumeUtils(object):
     @staticmethod
     def _get_helper(sc):
         try:
-            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-                .loadClass("org.apache.spark.streaming.flume.FlumeUtilsPythonHelper")
-            return helperClass.newInstance()
-        except Py4JJavaError as e:
-            # TODO: use --jar once it also work on driver
-            if 'ClassNotFoundException' in str(e.java_exception):
+            return sc._jvm.org.apache.spark.streaming.flume.FlumeUtilsPythonHelper()
+        except TypeError as e:
+            if str(e) == "'JavaPackage' object is not callable":
                 FlumeUtils._printErrorMsg(sc)
             raise


@@ -192,13 +192,9 @@ class KafkaUtils(object):
     @staticmethod
     def _get_helper(sc):
         try:
-            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027)
-            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
-            return helperClass.newInstance()
-        except Py4JJavaError as e:
-            # TODO: use --jar once it also work on driver
-            if 'ClassNotFoundException' in str(e.java_exception):
+            return sc._jvm.org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper()
+        except TypeError as e:
+            if str(e) == "'JavaPackage' object is not callable":
                 KafkaUtils._printErrorMsg(sc)
             raise


@@ -74,16 +74,14 @@ class KinesisUtils(object):
         try:
             # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils
-            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
-                .loadClass("org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper")
-            helper = helperClass.newInstance()
-            jstream = helper.createStream(ssc._jssc, kinesisAppName, streamName, endpointUrl,
-                                          regionName, initialPositionInStream, jduration, jlevel,
-                                          awsAccessKeyId, awsSecretKey)
-        except Py4JJavaError as e:
-            if 'ClassNotFoundException' in str(e.java_exception):
+            helper = ssc._jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper()
+        except TypeError as e:
+            if str(e) == "'JavaPackage' object is not callable":
                 KinesisUtils._printErrorMsg(ssc.sparkContext)
             raise
+        jstream = helper.createStream(ssc._jssc, kinesisAppName, streamName, endpointUrl,
+                                      regionName, initialPositionInStream, jduration, jlevel,
+                                      awsAccessKeyId, awsSecretKey)
         stream = DStream(jstream, ssc, NoOpSerializer())
         return stream.map(lambda v: decoder(v))


@@ -38,18 +38,15 @@ class MQTTUtils(object):
         :param storageLevel: RDD storage level.
         :return: A DStream object
         """
-        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
         try:
-            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-                .loadClass("org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper")
-            helper = helperClass.newInstance()
-            jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
-        except Py4JJavaError as e:
-            if 'ClassNotFoundException' in str(e.java_exception):
+            helper = ssc._jvm.org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper()
+        except TypeError as e:
+            if str(e) == "'JavaPackage' object is not callable":
                 MQTTUtils._printErrorMsg(ssc.sparkContext)
             raise
+        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
+        jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
         return DStream(jstream, ssc, UTF8Deserializer())

     @staticmethod


@@ -1006,10 +1006,7 @@ class KafkaStreamTests(PySparkStreamingTestCase):
     def setUp(self):
         super(KafkaStreamTests, self).setUp()
-        kafkaTestUtilsClz = self.ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
-            .loadClass("org.apache.spark.streaming.kafka.KafkaTestUtils")
-        self._kafkaTestUtils = kafkaTestUtilsClz.newInstance()
+        self._kafkaTestUtils = self.ssc._jvm.org.apache.spark.streaming.kafka.KafkaTestUtils()
         self._kafkaTestUtils.setup()

     def tearDown(self):
@@ -1271,10 +1268,7 @@ class FlumeStreamTests(PySparkStreamingTestCase):
     def setUp(self):
         super(FlumeStreamTests, self).setUp()
-        utilsClz = self.ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-            .loadClass("org.apache.spark.streaming.flume.FlumeTestUtils")
-        self._utils = utilsClz.newInstance()
+        self._utils = self.ssc._jvm.org.apache.spark.streaming.flume.FlumeTestUtils()

     def tearDown(self):
         if self._utils is not None:
@@ -1339,10 +1333,7 @@ class FlumePollingStreamTests(PySparkStreamingTestCase):
     maxAttempts = 5

     def setUp(self):
-        utilsClz = \
-            self.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-            .loadClass("org.apache.spark.streaming.flume.PollingFlumeTestUtils")
-        self._utils = utilsClz.newInstance()
+        self._utils = self.sc._jvm.org.apache.spark.streaming.flume.PollingFlumeTestUtils()

     def tearDown(self):
         if self._utils is not None:
@@ -1419,10 +1410,7 @@ class MQTTStreamTests(PySparkStreamingTestCase):
     def setUp(self):
         super(MQTTStreamTests, self).setUp()
-        MQTTTestUtilsClz = self.ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-            .loadClass("org.apache.spark.streaming.mqtt.MQTTTestUtils")
-        self._MQTTTestUtils = MQTTTestUtilsClz.newInstance()
+        self._MQTTTestUtils = self.ssc._jvm.org.apache.spark.streaming.mqtt.MQTTTestUtils()
         self._MQTTTestUtils.setup()

     def tearDown(self):
@@ -1498,10 +1486,7 @@ class KinesisStreamTests(PySparkStreamingTestCase):
         import random
         kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
-        kinesisTestUtilsClz = \
-            self.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
-            .loadClass("org.apache.spark.streaming.kinesis.KinesisTestUtils")
-        kinesisTestUtils = kinesisTestUtilsClz.newInstance()
+        kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils()
         try:
             kinesisTestUtils.createStream()
             aWSCredentials = kinesisTestUtils.getAWSCredentials()


@@ -27,4 +27,4 @@ fi
 export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}"
 # Add the PySpark classes to the PYTHONPATH:
 export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
-export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.1-src.zip:${PYTHONPATH}"
+export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.2-src.zip:${PYTHONPATH}"


@@ -1087,9 +1087,9 @@ private[spark] class Client(
       val pyArchivesFile = new File(pyLibPath, "pyspark.zip")
       require(pyArchivesFile.exists(),
         "pyspark.zip not found; cannot run pyspark application in YARN mode.")
-      val py4jFile = new File(pyLibPath, "py4j-0.9.1-src.zip")
+      val py4jFile = new File(pyLibPath, "py4j-0.9.2-src.zip")
       require(py4jFile.exists(),
-        "py4j-0.9.1-src.zip not found; cannot run pyspark application in YARN mode.")
+        "py4j-0.9.2-src.zip not found; cannot run pyspark application in YARN mode.")
       Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath())
     }
   }


@@ -154,7 +154,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite {
     // needed locations.
     val sparkHome = sys.props("spark.test.home")
     val pythonPath = Seq(
-        s"$sparkHome/python/lib/py4j-0.9.1-src.zip",
+        s"$sparkHome/python/lib/py4j-0.9.2-src.zip",
         s"$sparkHome/python")
     val extraEnv = Map(
       "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator),