From cb075b5301e08b9d9b06f3d33a41b3d63d95378e Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Tue, 3 Aug 2021 14:02:18 +0900 Subject: [PATCH] [SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the incompatible behavior of pandas 1.3 Disable the tests that fail because of incompatible behavior changes in pandas 1.3. Pandas 1.3 has been released. It introduces some behavior changes that we should follow, but that work is not ready yet. No user-facing change. Disabled some tests related to the behavior changes. Closes #33598 from ueshin/issues/SPARK-36367/disable_tests. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon (cherry picked from commit 8cb9cf39b6a1899175aeaefb2a85480f5a514aac) Signed-off-by: Hyukjin Kwon --- .github/workflows/build_and_test.yml | 4 +- python/pyspark/pandas/groupby.py | 8 ++ .../data_type_ops/test_categorical_ops.py | 6 +- .../pyspark/pandas/tests/indexes/test_base.py | 76 +++++++++-------- .../pandas/tests/indexes/test_category.py | 5 +- .../pyspark/pandas/tests/test_categorical.py | 82 +++++++++++++++---- python/pyspark/pandas/tests/test_expanding.py | 51 +++++++----- ...st_ops_on_diff_frames_groupby_expanding.py | 13 ++- ...test_ops_on_diff_frames_groupby_rolling.py | 14 +++- python/pyspark/pandas/tests/test_rolling.py | 52 +++++++----- python/pyspark/pandas/tests/test_series.py | 16 ++-- 11 files changed, 222 insertions(+), 105 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b518f875bf..7fc99ef5d2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -149,7 +149,7 @@ jobs: name: "Build modules: ${{ matrix.modules }}" runs-on: ubuntu-20.04 container: - image: dongjoon/apache-spark-github-action-image:20210602 + image: dongjoon/apache-spark-github-action-image:20210730 strategy: fail-fast: false matrix: @@ -227,8 +227,6 @@ jobs: # Run the tests. 
- name: Run tests run: | - # TODO(SPARK-36345): Install mlflow>=1.0 and sklearn in Python 3.9 of the base image - python3.9 -m pip install 'mlflow>=1.0' sklearn export PATH=$PATH:$HOME/miniconda/bin ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" - name: Upload test results to report diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 70ece9cb0c..faa1de671e 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -20,6 +20,7 @@ A wrapper for GroupedData to behave similar to pandas GroupBy. """ from abc import ABCMeta, abstractmethod +import builtins import sys import inspect from collections import OrderedDict, namedtuple @@ -43,6 +44,7 @@ from typing import ( TYPE_CHECKING, ) +import numpy as np import pandas as pd from pandas.api.types import is_hashable, is_list_like @@ -102,6 +104,12 @@ if TYPE_CHECKING: # to keep it the same as pandas NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +_builtin_table = { + builtins.sum: np.sum, + builtins.max: np.max, + builtins.min: np.min, +} # type: Dict[Callable, Callable] + class GroupBy(Generic[FrameLike], metaclass=ABCMeta): """ diff --git a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py index 6ac9073af5..11871ea2ba 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py @@ -190,8 +190,12 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assert_eq(pser.astype(str), psser.astype(str)) self.assert_eq(pser.astype(bool), psser.astype(bool)) self.assert_eq(pser.astype("category"), psser.astype("category")) + cat_type = CategoricalDtype(categories=[3, 1, 2]) - if LooseVersion(pd.__version__) >= LooseVersion("1.2"): + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass 
+ elif LooseVersion(pd.__version__) >= LooseVersion("1.2"): self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) else: self.assert_eq(pd.Series(data).astype(cat_type), psser.astype(cat_type)) diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index 8238b672c5..39e22bd116 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -1478,25 +1478,30 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): psidx2 = ps.from_pandas(pidx2) self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) - self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1)) self.assert_eq( psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True ) - self.assert_eq( - psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), - pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), - almost=True, - ) self.assert_eq( psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])), almost=True, ) - self.assert_eq( - psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), - pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])), - almost=True, - ) + + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1)) + self.assert_eq( + psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), + pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), + almost=True, + ) + self.assert_eq( + psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), + pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])), + almost=True, + ) # MultiIndex pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]) @@ -1508,30 +1513,37 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): psmidx3 = ps.from_pandas(pmidx3) psmidx4 = ps.from_pandas(pmidx4) - self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2)) - self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1)) - 
self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4)) - self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3)) - self.assert_eq( - psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), - pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), - ) - self.assert_eq( - psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), - pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), - ) - self.assert_eq( - psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), - pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), - ) - self.assert_eq( - psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), - pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), - ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2)) + self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1)) + self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4)) + self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3)) + self.assert_eq( + psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), + pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]), + ) + self.assert_eq( + psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), + pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]), + ) + self.assert_eq( + psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), + pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]), + ) + self.assert_eq( + psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), + pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]), + ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass # Testing if the result is correct after sort=False. # The `sort` argument is added in pandas 0.24. 
- if LooseVersion(pd.__version__) >= LooseVersion("0.24"): + elif LooseVersion(pd.__version__) >= LooseVersion("0.24"): self.assert_eq( psmidx1.union(psmidx2, sort=False).sort_values(), pmidx1.union(pmidx2, sort=False).sort_values(), diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index 37216bd635..f241918893 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -176,7 +176,10 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(kcidx.astype("category"), pcidx.astype("category")) - if LooseVersion(pd.__version__) >= LooseVersion("1.2"): + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + elif LooseVersion(pd.__version__) >= LooseVersion("1.2"): self.assert_eq( kcidx.astype(CategoricalDtype(["b", "c", "a"])), pcidx.astype(CategoricalDtype(["b", "c", "a"])), diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py index 67cdf3c5a1..1335d59d77 100644 --- a/python/pyspark/pandas/tests/test_categorical.py +++ b/python/pyspark/pandas/tests/test_categorical.py @@ -73,7 +73,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.categories = ["z", "y", "x"] psser.cat.categories = ["z", "y", "x"] - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) with self.assertRaises(ValueError): @@ -91,7 +95,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.add_categories(4, inplace=True) psser.cat.add_categories(4, inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + 
pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaises(ValueError, lambda: psser.cat.add_categories(4)) @@ -115,7 +123,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.remove_categories(2, inplace=True) psser.cat.remove_categories(2, inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4)) @@ -138,7 +150,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.remove_unused_categories(inplace=True) psser.cat.remove_unused_categories(inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) def test_reorder_categories(self): @@ -164,12 +180,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.reorder_categories([1, 2, 3], inplace=True) psser.cat.reorder_categories([1, 2, 3], inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True) psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2])) @@ -189,7 +213,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.as_ordered(inplace=True) 
psser.cat.as_ordered(inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) # as_unordered @@ -215,7 +243,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(kcser.astype("category"), pcser.astype("category")) - if LooseVersion(pd.__version__) >= LooseVersion("1.2"): + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + elif LooseVersion(pd.__version__) >= LooseVersion("1.2"): self.assert_eq( kcser.astype(CategoricalDtype(["b", "c", "a"])), pcser.astype(CategoricalDtype(["b", "c", "a"])), @@ -419,7 +450,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): def astype(x) -> ps.Series[dtype]: return x.astype(dtype) - if LooseVersion(pd.__version__) >= LooseVersion("1.2"): + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + elif LooseVersion(pd.__version__) >= LooseVersion("1.2"): self.assert_eq( psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True), pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True), @@ -637,17 +671,29 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True) psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.rename_categories(lambda x: x.upper(), inplace=True) psser.cat.rename_categories(lambda x: x.upper(), inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix 
the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.rename_categories([0, 1, 3, 2], inplace=True) psser.cat.rename_categories([0, 1, 3, 2], inplace=True) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaisesRegex( @@ -717,12 +763,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True), psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True), ) - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False), psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False), - self.assert_eq(pser, psser) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq(pser, psser) self.assert_eq(pdf, psdf) self.assertRaisesRegex( diff --git a/python/pyspark/pandas/tests/test_expanding.py b/python/pyspark/pandas/tests/test_expanding.py index 57b4e48f39..2cd5e5284c 100644 --- a/python/pyspark/pandas/tests/test_expanding.py +++ b/python/pyspark/pandas/tests/test_expanding.py @@ -145,18 +145,24 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils): pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]}) psdf = ps.from_pandas(pdf) - self.assert_eq( - getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(), - getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(), - ) - self.assert_eq( - getattr(psdf.groupby(psdf.a).expanding(2), f)().sum(), - getattr(pdf.groupby(pdf.a).expanding(2), 
f)().sum(), - ) - self.assert_eq( - getattr(psdf.groupby(psdf.a + 1).expanding(2), f)().sort_index(), - getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(), - ) + + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq( + getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(), + getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(), + ) + self.assert_eq( + getattr(psdf.groupby(psdf.a).expanding(2), f)().sum(), + getattr(pdf.groupby(pdf.a).expanding(2), f)().sum(), + ) + self.assert_eq( + getattr(psdf.groupby(psdf.a + 1).expanding(2), f)().sort_index(), + getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(), + ) + self.assert_eq( getattr(psdf.b.groupby(psdf.a).expanding(2), f)().sort_index(), getattr(pdf.b.groupby(pdf.a).expanding(2), f)().sort_index(), @@ -174,15 +180,20 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils): columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) pdf.columns = columns psdf.columns = columns - self.assert_eq( - getattr(psdf.groupby(("a", "x")).expanding(2), f)().sort_index(), - getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index(), - ) - self.assert_eq( - getattr(psdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(), - getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(), - ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq( + getattr(psdf.groupby(("a", "x")).expanding(2), f)().sort_index(), + getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index(), + ) + + self.assert_eq( + getattr(psdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(), + getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), f)().sort_index(), + ) def test_groupby_expanding_count(self): # The behaviour of ExpandingGroupby.count are different 
between pandas>=1.0.0 and lower, diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py index c6a2852e5b..223adeaa48 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py @@ -52,10 +52,15 @@ class OpsOnDiffFramesGroupByExpandingTest(PandasOnSparkTestCase, TestUtils): psdf = ps.from_pandas(pdf) kkey = ps.from_pandas(pkey) - self.assert_eq( - getattr(psdf.groupby(kkey).expanding(2), f)().sort_index(), - getattr(pdf.groupby(pkey).expanding(2), f)().sort_index(), - ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq( + getattr(psdf.groupby(kkey).expanding(2), f)().sort_index(), + getattr(pdf.groupby(pkey).expanding(2), f)().sort_index(), + ) + self.assert_eq( getattr(psdf.groupby(kkey)["b"].expanding(2), f)().sort_index(), getattr(pdf.groupby(pkey)["b"].expanding(2), f)().sort_index(), diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py index 306a08196b..4f97769b8e 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from distutils.version import LooseVersion import pandas as pd @@ -49,10 +50,15 @@ class OpsOnDiffFramesGroupByRollingTest(PandasOnSparkTestCase, TestUtils): psdf = ps.from_pandas(pdf) kkey = ps.from_pandas(pkey) - self.assert_eq( - getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(), - getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(), - ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq( + getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(), + getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(), + ) + self.assert_eq( getattr(psdf.groupby(kkey)["b"].rolling(2), f)().sort_index(), getattr(pdf.groupby(pkey)["b"].rolling(2), f)().sort_index(), diff --git a/python/pyspark/pandas/tests/test_rolling.py b/python/pyspark/pandas/tests/test_rolling.py index 92373d250a..7409d6988c 100644 --- a/python/pyspark/pandas/tests/test_rolling.py +++ b/python/pyspark/pandas/tests/test_rolling.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from distutils.version import LooseVersion import numpy as np import pandas as pd @@ -110,18 +111,24 @@ class RollingTest(PandasOnSparkTestCase, TestUtils): pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]}) psdf = ps.from_pandas(pdf) - self.assert_eq( - getattr(psdf.groupby(psdf.a).rolling(2), f)().sort_index(), - getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index(), - ) - self.assert_eq( - getattr(psdf.groupby(psdf.a).rolling(2), f)().sum(), - getattr(pdf.groupby(pdf.a).rolling(2), f)().sum(), - ) - self.assert_eq( - getattr(psdf.groupby(psdf.a + 1).rolling(2), f)().sort_index(), - getattr(pdf.groupby(pdf.a + 1).rolling(2), f)().sort_index(), - ) + + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq( + getattr(psdf.groupby(psdf.a).rolling(2), f)().sort_index(), + getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index(), + ) + self.assert_eq( + getattr(psdf.groupby(psdf.a).rolling(2), f)().sum(), + getattr(pdf.groupby(pdf.a).rolling(2), f)().sum(), + ) + self.assert_eq( + getattr(psdf.groupby(psdf.a + 1).rolling(2), f)().sort_index(), + getattr(pdf.groupby(pdf.a + 1).rolling(2), f)().sort_index(), + ) + self.assert_eq( getattr(psdf.b.groupby(psdf.a).rolling(2), f)().sort_index(), getattr(pdf.b.groupby(pdf.a).rolling(2), f)().sort_index(), @@ -139,15 +146,20 @@ class RollingTest(PandasOnSparkTestCase, TestUtils): columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) pdf.columns = columns psdf.columns = columns - self.assert_eq( - getattr(psdf.groupby(("a", "x")).rolling(2), f)().sort_index(), - getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index(), - ) - self.assert_eq( - getattr(psdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index(), - getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index(), - ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the 
behavior to follow pandas >= 1.3 + pass + else: + self.assert_eq( + getattr(psdf.groupby(("a", "x")).rolling(2), f)().sort_index(), + getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index(), + ) + + self.assert_eq( + getattr(psdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index(), + getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), f)().sort_index(), + ) def test_groupby_rolling_count(self): self._test_groupby_rolling_func("count") diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index b42d3cd557..d9ba3c769f 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -1556,12 +1556,16 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): if extension_object_dtypes_available: from pandas import StringDtype - self._check_extension( - psser.astype("M").astype("string"), pser.astype("M").astype("string") - ) - self._check_extension( - psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype()) - ) + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3 + pass + else: + self._check_extension( + psser.astype("M").astype("string"), pser.astype("M").astype("string") + ) + self._check_extension( + psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype()) + ) with self.assertRaisesRegex(TypeError, "not understood"): psser.astype("int63")