From 31557d475958e18c7e26525d5ba6f7448bad800f Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 17 Aug 2021 10:29:16 -0700 Subject: [PATCH] [SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string This PR proposes to fix `Series.astype` when converting datetime type to StringDtype, to match the behavior of pandas 1.3. In pandas < 1.3, ```python >>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string") 0 2020-10-27 00:00:01 1 NaT Name: datetime, dtype: string ``` This is changed to ```python >>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string") 0 2020-10-27 00:00:01 1 <NA> Name: datetime, dtype: string ``` in pandas >= 1.3, so we follow the behavior of the latest pandas. This change is needed because pandas-on-Spark always follows the behavior of the latest pandas. Yes, the behavior now matches the latest pandas when converting datetime to nullable string (StringDtype). Unit tests passed. Closes #33735 from itholic/SPARK-36387. Authored-by: itholic Signed-off-by: Takuya UESHIN (cherry picked from commit c0441bb7e83e83e3240bf7e2991de34b01a182f5) Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/data_type_ops/base.py | 2 +- .../pandas/data_type_ops/datetime_ops.py | 19 ++++--------------- python/pyspark/pandas/tests/test_series.py | 8 +++++--- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py index c69715f18e..b4c8c3ea29 100644 --- a/python/pyspark/pandas/data_type_ops/base.py +++ b/python/pyspark/pandas/data_type_ops/base.py @@ -155,7 +155,7 @@ def _as_string_type( index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None) ) -> IndexOpsLike: """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`, - representing null Spark column. + representing null Spark column. Note that `null_str` is for non-extension dtypes only. 
""" spark_type = StringType() if isinstance(dtype, extension_dtypes): diff --git a/python/pyspark/pandas/data_type_ops/datetime_ops.py b/python/pyspark/pandas/data_type_ops/datetime_ops.py index 071c22ed24..63d817bc88 100644 --- a/python/pyspark/pandas/data_type_ops/datetime_ops.py +++ b/python/pyspark/pandas/data_type_ops/datetime_ops.py @@ -23,7 +23,7 @@ import numpy as np import pandas as pd from pandas.api.types import CategoricalDtype -from pyspark.sql import functions as F, Column +from pyspark.sql import Column from pyspark.sql.types import BooleanType, LongType, StringType, TimestampType from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex @@ -33,10 +33,11 @@ from pyspark.pandas.data_type_ops.base import ( _as_bool_type, _as_categorical_type, _as_other_type, + _as_string_type, _sanitize_list_like, ) from pyspark.pandas.spark import functions as SF -from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type +from pyspark.pandas.typedef import pandas_on_spark_type class DatetimeOps(DataTypeOps): @@ -133,18 +134,6 @@ class DatetimeOps(DataTypeOps): elif isinstance(spark_type, BooleanType): return _as_bool_type(index_ops, dtype) elif isinstance(spark_type, StringType): - if isinstance(dtype, extension_dtypes): - # seems like a pandas' bug? 
- scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise( - index_ops.spark.column.cast(spark_type) - ) - else: - null_str = str(pd.NaT) - casted = index_ops.spark.column.cast(spark_type) - scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted) - return index_ops._with_new_scol( - scol.alias(index_ops._internal.data_spark_column_names[0]), - field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type), - ) + return _as_string_type(index_ops, dtype, null_str=str(pd.NaT)) else: return _as_other_type(index_ops, dtype, spark_type) diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index d9ba3c769f..58c87ed865 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -1556,16 +1556,18 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): if extension_object_dtypes_available: from pandas import StringDtype + # The behavior of casting datetime to nullable string is changed from pandas 1.3. if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: self._check_extension( psser.astype("M").astype("string"), pser.astype("M").astype("string") ) self._check_extension( psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype()) ) + else: + expected = ps.Series(["2020-10-27 00:00:01", None], name="x", dtype="string") + self._check_extension(psser.astype("M").astype("string"), expected) + self._check_extension(psser.astype("M").astype(StringDtype()), expected) with self.assertRaisesRegex(TypeError, "not understood"): psser.astype("int63")