[SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string

This PR proposes to fix `Series.astype` when converting datetime type to StringDtype, to match the behavior of pandas 1.3.

In pandas < 1.3,
```python
>>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string")
0    2020-10-27 00:00:01
1                    NaT
Name: datetime, dtype: string
```

This is changed to

```python
>>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string")
0    2020-10-27 00:00:01
1                   <NA>
Name: datetime, dtype: string
```

in pandas >= 1.3, so we follow the behavior of the latest pandas.

This is because pandas-on-Spark always follows the behavior of the latest pandas.

Yes, the behavior is changed to match the latest pandas when converting datetime to nullable string (StringDtype).

Unit tests passed.

Closes #33735 from itholic/SPARK-36387.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
(cherry picked from commit c0441bb7e8)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
itholic 2021-08-17 10:29:16 -07:00 committed by Hyukjin Kwon
parent 0fc8c393b4
commit 31557d4759
3 changed files with 10 additions and 19 deletions

View file

@ -155,7 +155,7 @@ def _as_string_type(
index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
) -> IndexOpsLike:
"""Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
representing null Spark column.
representing null Spark column. Note that `null_str` is for non-extension dtypes only.
"""
spark_type = StringType()
if isinstance(dtype, extension_dtypes):

View file

@ -23,7 +23,7 @@ import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from pyspark.sql import functions as F, Column
from pyspark.sql import Column
from pyspark.sql.types import BooleanType, LongType, StringType, TimestampType
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
@ -33,10 +33,11 @@ from pyspark.pandas.data_type_ops.base import (
_as_bool_type,
_as_categorical_type,
_as_other_type,
_as_string_type,
_sanitize_list_like,
)
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
from pyspark.pandas.typedef import pandas_on_spark_type
class DatetimeOps(DataTypeOps):
@ -133,18 +134,6 @@ class DatetimeOps(DataTypeOps):
elif isinstance(spark_type, BooleanType):
return _as_bool_type(index_ops, dtype)
elif isinstance(spark_type, StringType):
if isinstance(dtype, extension_dtypes):
# seems like a pandas' bug?
scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise(
index_ops.spark.column.cast(spark_type)
)
else:
null_str = str(pd.NaT)
casted = index_ops.spark.column.cast(spark_type)
scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
return index_ops._with_new_scol(
scol.alias(index_ops._internal.data_spark_column_names[0]),
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
else:
return _as_other_type(index_ops, dtype, spark_type)

View file

@ -1556,16 +1556,18 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
if extension_object_dtypes_available:
from pandas import StringDtype
# The behavior of casting datetime to nullable string is changed from pandas 1.3.
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self._check_extension(
psser.astype("M").astype("string"), pser.astype("M").astype("string")
)
self._check_extension(
psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype())
)
else:
expected = ps.Series(["2020-10-27 00:00:01", None], name="x", dtype="string")
self._check_extension(psser.astype("M").astype("string"), expected)
self._check_extension(psser.astype("M").astype(StringDtype()), expected)
with self.assertRaisesRegex(TypeError, "not understood"):
psser.astype("int63")