From 2dc15d9d8476da327c54577e3bbb261ad7923f2f Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 26 Aug 2021 17:43:49 +0900 Subject: [PATCH] [SPARK-36537][PYTHON] Revisit disabled tests for CategoricalDtype This PR proposes to enable the tests that were disabled because of behavior differences with pandas 1.3. - The `inplace` argument for `CategoricalDtype` functions is deprecated as of pandas 1.3 and appears to have a bug, so we manually create the expected results and test against them. - Fixed `GroupBy.transform` since it didn't work properly for `CategoricalDtype`. We should enable the tests as much as possible even if pandas has a bug. And we should follow the behavior of the latest pandas. Yes, `GroupBy.transform` now follows the behavior of the latest pandas. Unittests. Closes #33817 from itholic/SPARK-36537. Authored-by: itholic Signed-off-by: Hyukjin Kwon (cherry picked from commit fe486185c4a3a05278b1f01884e2b95ed3ca31bc) Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/groupby.py | 1 + .../pyspark/pandas/tests/test_categorical.py | 116 ++++++++++-------- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index c732dff72d..2815a6b766 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -2264,6 +2264,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): for c in psdf._internal.data_spark_column_names if c not in groupkey_names ] + return_schema = StructType([field.struct_field for field in data_fields]) sdf = GroupBy._spark_group_map_apply( diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py index 1fb0d5862a..e55c08ce74 100644 --- a/python/pyspark/pandas/tests/test_categorical.py +++ b/python/pyspark/pandas/tests/test_categorical.py @@ -74,10 +74,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.categories = ["z", "y", "x"] psser.cat.categories = ["z", "y", "x"] if 
LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=["x", "y", "z"])) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) with self.assertRaises(ValueError): @@ -96,10 +96,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.add_categories(4, inplace=True) psser.cat.add_categories(4, inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[1, 2, 3, 4])) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaises(ValueError, lambda: psser.cat.add_categories(4)) @@ -124,10 +124,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.remove_categories(2, inplace=True) psser.cat.remove_categories(2, inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[1, 3])) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4)) @@ -151,10 +151,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.remove_unused_categories(inplace=True) psser.cat.remove_unused_categories(inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. 
dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[1, 3])) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) def test_reorder_categories(self): @@ -180,20 +180,17 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.reorder_categories([1, 2, 3], inplace=True) psser.cat.reorder_categories([1, 2, 3], inplace=True) - if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True) psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[3, 2, 1], ordered=True)) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2])) @@ -214,10 +211,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.as_ordered(inplace=True) psser.cat.as_ordered(inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=True)) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) # as_unordered @@ -225,6 +222,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.as_unordered(inplace=True) psser.cat.as_unordered(inplace=True) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # Bug in pandas 1.3. 
dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=False)) + pdf.a = pser + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) @@ -445,13 +447,16 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): dtype = CategoricalDtype(categories=["a", "b", "c", "d"]) - def astype(x) -> ps.Series[dtype]: + # The behavior for CategoricalDtype is changed from pandas 1.3 + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + ret_dtype = pdf.b.dtype + else: + ret_dtype = dtype + + def astype(x) -> ps.Series[ret_dtype]: return x.astype(dtype) - if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - elif LooseVersion(pd.__version__) >= LooseVersion("1.2"): + if LooseVersion(pd.__version__) >= LooseVersion("1.2"): self.assert_eq( psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True), pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True), @@ -670,28 +675,30 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True) psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=["C", "b", "d", "A"])) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.rename_categories(lambda x: x.upper(), inplace=True) psser.cat.rename_categories(lambda x: x.upper(), inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. 
+ pser = pser.astype(CategoricalDtype(categories=["C", "B", "D", "A"])) + pdf.b = pser + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.rename_categories([0, 1, 3, 2], inplace=True) psser.cat.rename_categories([0, 1, 3, 2], inplace=True) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[0, 1, 3, 2])) + pdf.b = pser + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaisesRegex( @@ -762,19 +769,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True), ) if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=["a", "c", "b", "o"])) + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False), psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False), if LooseVersion(pd.__version__) >= LooseVersion("1.3"): - # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 - pass - else: - self.assert_eq(pser, psser) + # Bug in pandas 1.3. dtype is not updated properly with `inplace` argument. + pser = pser.astype(CategoricalDtype(categories=[2, 3, 1, 0])) + pdf.b = pser + + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaisesRegex(