# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import re
import sys
import inspect
from py4j.protocol import Py4JJavaError

__all__ = []


def _exception_message(excp):
    """Return the message from an exception as either a str or unicode object. Supports both
    Python 2 and Python 3.

    >>> msg = "Exception message"
    >>> excp = Exception(msg)
    >>> msg == _exception_message(excp)
    True

    >>> msg = u"unicöde"
    >>> excp = Exception(msg)
    >>> msg == _exception_message(excp)
    True
    """
    if isinstance(excp, Py4JJavaError):
        # 'Py4JJavaError' doesn't expose the Java-side stack trace through its 'message'
        # attribute in Python 2. We would normally call 'str' on the exception, but
        # 'Py4JJavaError' has an issue handling non-ascii strings, so we work around it
        # by calling '__str__()' directly. Please see SPARK-23517.
        return excp.__str__()
    if hasattr(excp, "message"):
        return excp.message
    return str(excp)
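# A rough illustration of the Py4JJavaError branch above (a sketch only; it assumes an
# active SparkContext bound to a hypothetical variable `sc` and is not executed here):
#
#     try:
#         sc._jvm.java.lang.String(None)
#     except Py4JJavaError as e:
#         _exception_message(e)
#
# In Python 2, `e.message` is an empty string, while the value returned here starts with
# "An error occurred while calling None.java.lang.String." and carries the Java-side
# stack trace (a java.lang.NullPointerException in this case). See SPARK-23517.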


def _get_argspec(f):
    """
    Get the argspec of a function. Supports both Python 2 and Python 3.
    """
    # `getargspec` is deprecated since Python 3.0 (it is incompatible with function annotations).
    # See SPARK-23569.
    if sys.version_info[0] < 3:
        argspec = inspect.getargspec(f)
    else:
        argspec = inspect.getfullargspec(f)
    return argspec
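# A minimal sketch (illustrative only, not executed as part of this module) of what both
# branches of `_get_argspec` yield for a plain function:
#
#     def add(a, b=1):
#         return a + b
#
#     _get_argspec(add).args   # ['a', 'b'], via `getargspec` on Python 2
#                              # and `getfullargspec` on Python 3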


class VersionUtils(object):
    """
    Provides utility methods to determine Spark versions from a given input string.
    """
    @staticmethod
    def majorMinorVersion(sparkVersion):
        """
        Given a Spark version string, return the (major version number, minor version number).
        E.g., for 2.0.1-SNAPSHOT, return (2, 0).

        >>> sparkVersion = "2.4.0"
        >>> VersionUtils.majorMinorVersion(sparkVersion)
        (2, 4)
        >>> sparkVersion = "2.3.0-SNAPSHOT"
        >>> VersionUtils.majorMinorVersion(sparkVersion)
        (2, 3)
        """
        m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion)
        if m is not None:
            return (int(m.group(1)), int(m.group(2)))
        else:
            raise ValueError("Spark tried to parse '%s' as a Spark" % sparkVersion +
                             " version string, but it could not find the major and minor" +
                             " version numbers.")


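# A small illustrative sketch (not executed as part of this module) of how
# `VersionUtils.majorMinorVersion` treats inputs that lack numeric major and minor parts:
#
#     VersionUtils.majorMinorVersion("2.4")              # -> (2, 4)
#     VersionUtils.majorMinorVersion("x.y.z-SNAPSHOT")   # raises ValueError
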
if __name__ == "__main__":
    import doctest
    (failure_count, test_count) = doctest.testmod()
    if failure_count:
        sys.exit(-1)