[SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string

### What changes were proposed in this pull request?

This PR proposes to fix `Series.astype` when converting a datetime type to the nullable `StringDtype`, to match the behavior of pandas >= 1.3.

In pandas < 1.3,
```python
>>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("datetime64[ns]").astype("string")
0    2020-10-27 00:00:01
1                    NaT
Name: datetime, dtype: string
```

This is changed to

```python
>>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("datetime64[ns]").astype("string")
0    2020-10-27 00:00:01
1                   <NA>
Name: datetime, dtype: string
```

in pandas >= 1.3, so pandas-on-Spark follows the behavior of the latest pandas.
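
After this fix, pandas-on-Spark produces the same result. A minimal illustration (the values and series name mirror the updated unit test below):

```python
>>> import pyspark.pandas as ps
>>> psser = ps.Series(["2020-10-27 00:00:01", None], name="datetime")
>>> psser.astype("datetime64[ns]").astype("string")
0    2020-10-27 00:00:01
1                   <NA>
Name: datetime, dtype: string
```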

### Why are the changes needed?

Because pandas-on-Spark should always follow the behavior of the latest pandas.

### Does this PR introduce _any_ user-facing change?

Yes. When converting datetime to nullable string (`StringDtype`), the behavior now matches the latest pandas.

### How was this patch tested?

Unit tests passed.

Closes #33735 from itholic/SPARK-36387.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
itholic authored 2021-08-17 10:29:16 -07:00, committed by Takuya UESHIN
commit c0441bb7e8 (parent 8bfb4f1e72)
3 changed files with 10 additions and 19 deletions

python/pyspark/pandas/data_type_ops/base.py:

```diff
@@ -155,7 +155,7 @@ def _as_string_type(
     index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
 ) -> IndexOpsLike:
     """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
-    representing null Spark column.
+    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
     """
     spark_type = StringType()
     if isinstance(dtype, extension_dtypes):
```
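
For orientation, the body of `_as_string_type` is only partially visible in this hunk. The sketch below is reconstructed from the hunk's context and from the code removed from `datetime_ops.py` further down; treat it as an approximation, not the verbatim implementation:

```python
# Approximate reconstruction of _as_string_type; names mirror the visible
# context, but the exact body may differ from the real base.py source.
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.pandas.typedef import extension_dtypes


def _as_string_type_sketch(index_ops, dtype, *, null_str=str(None)):
    spark_type = StringType()
    if isinstance(dtype, extension_dtypes):
        # Extension dtypes (e.g. StringDtype) keep nulls as nulls, so a
        # missing value renders as <NA>; `null_str` is not applied here.
        scol = index_ops.spark.column.cast(spark_type)
    else:
        # Non-extension (plain str) casts substitute the literal `null_str`
        # for nulls, e.g. "NaT" when called from DatetimeOps below.
        casted = index_ops.spark.column.cast(spark_type)
        scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
    )
```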

python/pyspark/pandas/data_type_ops/datetime_ops.py:

```diff
@@ -23,7 +23,7 @@ import numpy as np
 import pandas as pd
 from pandas.api.types import CategoricalDtype

-from pyspark.sql import functions as F, Column
+from pyspark.sql import Column
 from pyspark.sql.types import BooleanType, LongType, StringType, TimestampType

 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
@@ -33,10 +33,11 @@ from pyspark.pandas.data_type_ops.base import (
     _as_bool_type,
     _as_categorical_type,
     _as_other_type,
+    _as_string_type,
     _sanitize_list_like,
 )
 from pyspark.pandas.spark import functions as SF
-from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.typedef import pandas_on_spark_type


 class DatetimeOps(DataTypeOps):
@@ -133,18 +134,6 @@ class DatetimeOps(DataTypeOps):
         elif isinstance(spark_type, BooleanType):
             return _as_bool_type(index_ops, dtype)
         elif isinstance(spark_type, StringType):
-            if isinstance(dtype, extension_dtypes):
-                # seems like a pandas' bug?
-                scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise(
-                    index_ops.spark.column.cast(spark_type)
-                )
-            else:
-                null_str = str(pd.NaT)
-                casted = index_ops.spark.column.cast(spark_type)
-                scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
-            return index_ops._with_new_scol(
-                scol.alias(index_ops._internal.data_spark_column_names[0]),
-                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
-            )
+            return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
         else:
             return _as_other_type(index_ops, dtype, spark_type)
```
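
The net effect of delegating to `_as_string_type` with `null_str=str(pd.NaT)`: the nullable `StringDtype` path no longer injects a literal `"NaT"`, while the plain `str` path still does. A hedged before/after illustration:

```python
>>> import pyspark.pandas as ps
>>> psser = ps.Series(["2020-10-27 00:00:01", None], name="x").astype("datetime64[ns]")
>>> psser.astype("string")   # extension dtype: nulls stay null
0    2020-10-27 00:00:01
1                   <NA>
Name: x, dtype: string
>>> psser.astype(str)        # plain str: null becomes the literal "NaT"
0    2020-10-27 00:00:01
1                    NaT
Name: x, dtype: object
```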

python/pyspark/pandas/tests/test_series.py:

```diff
@@ -1556,16 +1556,18 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         if extension_object_dtypes_available:
             from pandas import StringDtype

+            # The behavior of casting datetime to nullable string is changed from pandas 1.3.
             if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-                # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-                pass
-            else:
                 self._check_extension(
                     psser.astype("M").astype("string"), pser.astype("M").astype("string")
                 )
                 self._check_extension(
                     psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype())
                 )
+            else:
+                expected = ps.Series(["2020-10-27 00:00:01", None], name="x", dtype="string")
+                self._check_extension(psser.astype("M").astype("string"), expected)
+                self._check_extension(psser.astype("M").astype(StringDtype()), expected)

         with self.assertRaisesRegex(TypeError, "not understood"):
             psser.astype("int63")
```