[SPARK-36537][PYTHON] Revisit disabled tests for CategoricalDtype

### What changes were proposed in this pull request?

This PR proposes to re-enable the tests that were disabled because of behavior differences with pandas 1.3.

- The `inplace` argument for `CategoricalDtype` functions has been deprecated since pandas 1.3, and it seems to have a bug there. So we manually create the expected results and test against them.
- Fixed `GroupBy.transform`, since it didn't work properly for `CategoricalDtype`.

### Why are the changes needed?

We should enable the tests as much as possible even if pandas has a bug.

We should also follow the behavior of the latest pandas.

### Does this PR introduce _any_ user-facing change?

Yes, `GroupBy.transform` now follows the behavior of the latest pandas.

### How was this patch tested?

Unit tests.

Closes #33817 from itholic/SPARK-36537.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
itholic 2021-08-26 17:43:49 +09:00 committed by Hyukjin Kwon
parent 97e7d6e667
commit fe486185c4
2 changed files with 63 additions and 54 deletions

View file

@ -2256,6 +2256,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
for c in psdf._internal.data_spark_column_names
if c not in groupkey_names
]
return_schema = StructType([field.struct_field for field in data_fields])
sdf = GroupBy._spark_group_map_apply(

View file

@ -74,10 +74,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.categories = ["z", "y", "x"]
psser.cat.categories = ["z", "y", "x"]
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["x", "y", "z"]))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
with self.assertRaises(ValueError):
@ -96,10 +96,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.add_categories(4, inplace=True)
psser.cat.add_categories(4, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3, 4]))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
@ -124,10 +124,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.remove_categories(2, inplace=True)
psser.cat.remove_categories(2, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 3]))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
@ -151,10 +151,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.remove_unused_categories(inplace=True)
psser.cat.remove_unused_categories(inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 3]))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
def test_reorder_categories(self):
@ -180,20 +180,17 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.reorder_categories([1, 2, 3], inplace=True)
psser.cat.reorder_categories([1, 2, 3], inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[3, 2, 1], ordered=True))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2]))
@ -214,10 +211,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.as_ordered(inplace=True)
psser.cat.as_ordered(inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=True))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
# as_unordered
@ -225,6 +222,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.as_unordered(inplace=True)
psser.cat.as_unordered(inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=False))
pdf.a = pser
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
@ -445,13 +447,16 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
dtype = CategoricalDtype(categories=["a", "b", "c", "d"])
def astype(x) -> ps.Series[dtype]:
# The behavior for CategoricalDtype is changed from pandas 1.3
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
ret_dtype = pdf.b.dtype
else:
ret_dtype = dtype
def astype(x) -> ps.Series[ret_dtype]:
return x.astype(dtype)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
self.assert_eq(
psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
@ -670,28 +675,30 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["C", "b", "d", "A"]))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["C", "B", "D", "A"]))
pdf.b = pser
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[0, 1, 3, 2]))
pdf.b = pser
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
self.assertRaisesRegex(
@ -762,19 +769,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["a", "c", "b", "o"]))
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[2, 3, 1, 0]))
pdf.b = pser
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
self.assertRaisesRegex(