[SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string
### What changes were proposed in this pull request?

This PR proposes to fix `Series.astype` when converting the datetime type to `StringDtype`, to match the behavior of pandas 1.3.

In pandas < 1.3:

```python
>>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string")
0    2020-10-27 00:00:01
1                    NaT
Name: datetime, dtype: string
```

In pandas >= 1.3 this is changed to:

```python
>>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string")
0    2020-10-27 00:00:01
1                   <NA>
Name: datetime, dtype: string
```

so we follow the behavior of the latest pandas.

### Why are the changes needed?

Because pandas-on-Spark always follows the behavior of the latest pandas.

### Does this PR introduce _any_ user-facing change?

Yes, the behavior when converting datetime to nullable string (`StringDtype`) is changed to match the latest pandas.

### How was this patch tested?

Unit tests passed.

Closes #33735 from itholic/SPARK-36387.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
parent 8bfb4f1e72
commit c0441bb7e8
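Before the file-by-file changes, a minimal sketch of the fixed behavior from the pandas-on-Spark side. This snippet is illustrative, not part of the patch; it assumes a running Spark session, and `.astype("M")` first casts the strings to datetime, as in the tests below:

```python
import pyspark.pandas as ps

psser = ps.Series(["2020-10-27 00:00:01", None], name="datetime")

# Cast to datetime, then to the nullable StringDtype. With this fix, the
# null is kept as a real missing value and prints as <NA> (the pandas >= 1.3
# behavior), instead of being replaced by the literal string "NaT".
print(psser.astype("M").astype("string"))
# 0    2020-10-27 00:00:01
# 1                   <NA>
# Name: datetime, dtype: string
```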
python/pyspark/pandas/data_type_ops/base.py:

```diff
@@ -155,7 +155,7 @@ def _as_string_type(
     index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
 ) -> IndexOpsLike:
     """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
-    representing null Spark column.
+    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
     """
     spark_type = StringType()
     if isinstance(dtype, extension_dtypes):
```
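For context, the hunk does not show `_as_string_type` in full. Below is a best-guess reconstruction assembled from this hunk's context lines plus the code removed from `datetime_ops.py` further down; the `else` branch and the return are inferred, so details may differ from the actual `base.py` source:

```python
from typing import Union

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

from pyspark.pandas._typing import Dtype, IndexOpsLike
from pyspark.pandas.typedef import extension_dtypes


def _as_string_type(
    index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
    """
    spark_type = StringType()
    if isinstance(dtype, extension_dtypes):
        # Extension (nullable) dtypes keep real nulls; no string substitution.
        scol = index_ops.spark.column.cast(spark_type)
    else:
        # Non-extension dtypes render nulls as `null_str` (e.g. "None" or "NaT").
        casted = index_ops.spark.column.cast(spark_type)
        scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
    )
```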
python/pyspark/pandas/data_type_ops/datetime_ops.py:

```diff
@@ -23,7 +23,7 @@ import numpy as np
 import pandas as pd
 from pandas.api.types import CategoricalDtype
 
-from pyspark.sql import functions as F, Column
+from pyspark.sql import Column
 from pyspark.sql.types import BooleanType, LongType, StringType, TimestampType
 
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
@@ -33,10 +33,11 @@ from pyspark.pandas.data_type_ops.base import (
     _as_bool_type,
     _as_categorical_type,
     _as_other_type,
+    _as_string_type,
     _sanitize_list_like,
 )
 from pyspark.pandas.spark import functions as SF
-from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.typedef import pandas_on_spark_type
 
 
 class DatetimeOps(DataTypeOps):
@@ -133,18 +134,6 @@ class DatetimeOps(DataTypeOps):
         elif isinstance(spark_type, BooleanType):
             return _as_bool_type(index_ops, dtype)
         elif isinstance(spark_type, StringType):
-            if isinstance(dtype, extension_dtypes):
-                # seems like a pandas' bug?
-                scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise(
-                    index_ops.spark.column.cast(spark_type)
-                )
-            else:
-                null_str = str(pd.NaT)
-                casted = index_ops.spark.column.cast(spark_type)
-                scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
-            return index_ops._with_new_scol(
-                scol.alias(index_ops._internal.data_spark_column_names[0]),
-                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
-            )
+            return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
         else:
             return _as_other_type(index_ops, dtype, spark_type)
```
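The net effect is that `DatetimeOps.astype` now delegates string casts to the shared `_as_string_type` helper, passing `null_str=str(pd.NaT)` so the non-extension path keeps its old rendering. A hedged illustration of the two resulting null representations (assumes a running Spark session):

```python
import pandas as pd
import pyspark.pandas as ps

psser = ps.Series([pd.Timestamp("2020-10-27 00:00:01"), None], name="x")

# Plain str dtype: the null_str=str(pd.NaT) branch applies, so the null
# is rendered as the literal string "NaT".
print(psser.astype(str))

# Nullable StringDtype: the extension-dtype path keeps a real null,
# which prints as <NA>.
print(psser.astype("string"))
```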
python/pyspark/pandas/tests/test_series.py:

```diff
@@ -1556,16 +1556,18 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         if extension_object_dtypes_available:
             from pandas import StringDtype
 
+            # The behavior of casting datetime to nullable string is changed from pandas 1.3.
             if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-                # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-                pass
-            else:
                 self._check_extension(
                     psser.astype("M").astype("string"), pser.astype("M").astype("string")
                 )
                 self._check_extension(
                     psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype())
                 )
+            else:
+                expected = ps.Series(["2020-10-27 00:00:01", None], name="x", dtype="string")
+                self._check_extension(psser.astype("M").astype("string"), expected)
+                self._check_extension(psser.astype("M").astype(StringDtype()), expected)
 
         with self.assertRaisesRegex(TypeError, "not understood"):
             psser.astype("int63")
```
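The test now gates its oracle on the pandas version: with pandas >= 1.3, the pandas result itself is the expected value, while on older pandas a hand-built Series encodes the new behavior. A condensed sketch of that pattern, where `pser`/`psser` stand in for the test fixtures and are rebuilt here by assumption:

```python
from distutils.version import LooseVersion

import pandas as pd
import pyspark.pandas as ps

pser = pd.Series(["2020-10-27 00:00:01", None], name="x")
psser = ps.from_pandas(pser)

if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
    # pandas >= 1.3 already yields <NA>, so pandas itself is the oracle.
    expected = pser.astype("M").astype("string")
else:
    # Older pandas yields the string "NaT"; build the expected result by hand.
    expected = pd.Series(["2020-10-27 00:00:01", None], name="x", dtype="string")

# Series.equals treats missing values in matching positions as equal.
assert psser.astype("M").astype("string").to_pandas().equals(expected)
```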