[SPARK-22874][PYSPARK][SQL] Modify the pandas version check to use LooseVersion.

## What changes were proposed in this pull request?

Currently we check the pandas version by catching the `ImportError` raised by version-specific imports. Instead, we can compare the `LooseVersion` of the version strings, the same way we already check the pyarrow version.
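For illustration, the comparison this change adopts reduces to the pattern below (a minimal sketch mirroring the new `require_minimum_pandas_version` added in `pyspark/sql/utils.py`; see the last file in this diff):

```python
from distutils.version import LooseVersion

import pandas

# LooseVersion compares version components numerically, so '0.10.0' > '0.9.0',
# whereas a plain string comparison would get that ordering wrong.
if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'):
    raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process")
```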

## How was this patch tested?

Existing tests.

Author: Takuya UESHIN <ueshin@databricks.com>

Closes #20054 from ueshin/issues/SPARK-22874.
Authored by Takuya UESHIN on 2017-12-22 20:09:51 +09:00; committed by hyukjinkwon
parent 8df1da396f
commit 13190a4f60
6 changed files with 38 additions and 36 deletions

python/pyspark/sql/dataframe.py

@@ -1906,9 +1906,9 @@ class DataFrame(object):
         if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
             try:
                 from pyspark.sql.types import _check_dataframe_localize_timestamps
-                from pyspark.sql.utils import _require_minimum_pyarrow_version
+                from pyspark.sql.utils import require_minimum_pyarrow_version
                 import pyarrow
-                _require_minimum_pyarrow_version()
+                require_minimum_pyarrow_version()
                 tables = self._collectAsArrow()
                 if tables:
                     table = pyarrow.concat_tables(tables)
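For context, this guard sits on the Arrow-enabled `toPandas()` path. A hypothetical session (the `spark` session and a DataFrame `df` are assumed to already exist) would hit it like so:

```python
# Hypothetical usage: with the Arrow path enabled, toPandas() now calls
# require_minimum_pyarrow_version() before collecting.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pdf = df.toPandas()  # raises ImportError if pyarrow is missing or too old
```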

python/pyspark/sql/session.py

@@ -493,15 +493,14 @@ class SparkSession(object):
         data types will be used to coerce the data in Pandas to Arrow conversion.
         """
         from pyspark.serializers import ArrowSerializer, _create_batch
-        from pyspark.sql.types import from_arrow_schema, to_arrow_type, \
-            _old_pandas_exception_message, TimestampType
-        from pyspark.sql.utils import _require_minimum_pyarrow_version
-        try:
-            from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
-        except ImportError as e:
-            raise ImportError(_old_pandas_exception_message(e))
-        _require_minimum_pyarrow_version()
+        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
+        from pyspark.sql.utils import require_minimum_pandas_version, \
+            require_minimum_pyarrow_version
+        require_minimum_pandas_version()
+        require_minimum_pyarrow_version()
+        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

         # Determine arrow types to coerce data when creating batches
         if isinstance(schema, StructType):

python/pyspark/sql/tests.py

@@ -53,7 +53,8 @@ _have_old_pandas = False
 try:
     import pandas
     try:
-        import pandas.api
+        from pyspark.sql.utils import require_minimum_pandas_version
+        require_minimum_pandas_version()
         _have_pandas = True
     except:
         _have_old_pandas = True

@@ -2600,7 +2601,7 @@ class SQLTests(ReusedSQLTestCase):
     @unittest.skipIf(not _have_old_pandas, "Old Pandas not installed")
     def test_to_pandas_old(self):
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(ImportError, 'Pandas \(.*\) must be installed'):
+            with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'):
                 self._to_pandas()

     @unittest.skipIf(not _have_pandas, "Pandas not installed")

@@ -2643,7 +2644,7 @@ class SQLTests(ReusedSQLTestCase):
         pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)],
                             "d": [pd.Timestamp.now().date()]})
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(ImportError, 'Pandas \(.*\) must be installed'):
+            with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'):
                 self.spark.createDataFrame(pdf)
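The regexp change tracks the new error message. A quick sanity check of the old and new patterns against the respective messages (both messages appear elsewhere in this diff):

```python
import re

# Old message, built by _old_pandas_exception_message (removed in this patch):
old_msg = "note: Pandas (>=0.19.2) must be installed and available on calling Python process"
# New message, raised by require_minimum_pandas_version:
new_msg = "Pandas >= 0.19.2 must be installed on calling Python process"

assert re.search(r'Pandas \(.*\) must be installed', old_msg)  # old pattern, old message
assert re.search(r'Pandas >= .* must be installed', new_msg)   # new pattern, new message
```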

python/pyspark/sql/types.py

@@ -1678,13 +1678,6 @@ def from_arrow_schema(arrow_schema):
                        for field in arrow_schema])


-def _old_pandas_exception_message(e):
-    """ Create an error message for importing old Pandas.
-    """
-    msg = "note: Pandas (>=0.19.2) must be installed and available on calling Python process"
-    return "%s\n%s" % (_exception_message(e), msg)
-
-
 def _check_dataframe_localize_timestamps(pdf, timezone):
     """
     Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone

@@ -1693,10 +1686,10 @@ def _check_dataframe_localize_timestamps(pdf, timezone):
     :param timezone: the timezone to convert. if None then use local timezone
     :return pandas.DataFrame where any timezone aware columns have been converted to tz-naive
     """
-    try:
-        from pandas.api.types import is_datetime64tz_dtype
-    except ImportError as e:
-        raise ImportError(_old_pandas_exception_message(e))
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    from pandas.api.types import is_datetime64tz_dtype
     tz = timezone or 'tzlocal()'
     for column, series in pdf.iteritems():
         # TODO: handle nested timestamps, such as ArrayType(TimestampType())?

@@ -1714,10 +1707,10 @@ def _check_series_convert_timestamps_internal(s, timezone):
     :param timezone: the timezone to convert. if None then use local timezone
     :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone
     """
-    try:
-        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
-    except ImportError as e:
-        raise ImportError(_old_pandas_exception_message(e))
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64_dtype(s.dtype):
         tz = timezone or 'tzlocal()'

@@ -1737,11 +1730,11 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):
     :param to_timezone: the timezone to convert to. if None then use local timezone
     :return pandas.Series where if it is a timestamp, has been converted to tz-naive
     """
-    try:
-        import pandas as pd
-        from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
-    except ImportError as e:
-        raise ImportError(_old_pandas_exception_message(e))
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    import pandas as pd
+    from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
     from_tz = from_timezone or 'tzlocal()'
     to_tz = to_timezone or 'tzlocal()'
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?

python/pyspark/sql/udf.py

@@ -37,9 +37,9 @@ def _create_udf(f, returnType, evalType):
     if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF or \
            evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF:
         import inspect
-        from pyspark.sql.utils import _require_minimum_pyarrow_version
+        from pyspark.sql.utils import require_minimum_pyarrow_version

-        _require_minimum_pyarrow_version()
+        require_minimum_pyarrow_version()
         argspec = inspect.getargspec(f)
         if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF and len(argspec.args) == 0 and \
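Both pandas UDF eval types go through `_create_udf`, so an ordinary `pandas_udf` declaration triggers the check. A minimal sketch, assuming a Spark 2.3-era environment with pyarrow installed:

```python
# Sketch: declaring a scalar pandas UDF routes through _create_udf,
# which now calls require_minimum_pyarrow_version() up front.
from pyspark.sql.functions import pandas_udf

@pandas_udf('double')  # raises ImportError here if pyarrow is missing or too old
def plus_one(v):
    return v + 1
```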

python/pyspark/sql/utils.py

@@ -112,7 +112,16 @@ def toJArray(gateway, jtype, arr):
     return jarr


-def _require_minimum_pyarrow_version():
+def require_minimum_pandas_version():
+    """ Raise ImportError if minimum version of Pandas is not installed
+    """
+    from distutils.version import LooseVersion
+    import pandas
+    if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'):
+        raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process")
+
+
+def require_minimum_pyarrow_version():
     """ Raise ImportError if minimum version of pyarrow is not installed
     """
     from distutils.version import LooseVersion
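Callers across this patch all follow the same pattern, which in isolation might look like this (a sketch mirroring the call sites above):

```python
# Sketch of the call-site pattern adopted in this patch: run the version
# guard first, then import the pandas submodules that need pandas >= 0.19.2.
from pyspark.sql.utils import require_minimum_pandas_version

require_minimum_pandas_version()  # ImportError if pandas is missing or < 0.19.2
from pandas.api.types import is_datetime64tz_dtype
```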