[SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the incompatible behavior of pandas 1.3

### What changes were proposed in this pull request?

Disable the tests that fail due to the incompatible behavior of pandas 1.3.

### Why are the changes needed?

pandas 1.3 has been released.
It introduces some behavior changes that we should follow, but the fixes on our side are not ready yet.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Disabled the tests related to the behavior changes.

Closes #33598 from ueshin/issues/SPARK-36367/disable_tests.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 8cb9cf39b6)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
Takuya UESHIN 2021-08-03 14:02:18 +09:00 committed by Hyukjin Kwon
parent c25f1e4347
commit cb075b5301
11 changed files with 222 additions and 105 deletions
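For orientation before the per-file hunks: the failing assertions are not deleted, they are gated on the installed pandas version so they can be re-enabled once SPARK-36367 is addressed. A minimal sketch of the guard pattern repeated throughout the diff (the `skip_on_pandas_13` helper and the `run_assertion` callback are illustrative names, not part of the change):

```python
from distutils.version import LooseVersion

import pandas as pd


def skip_on_pandas_13(run_assertion):
    """Run an assertion only against pandas < 1.3, mirroring the guards added below."""
    if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
        # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
        pass
    else:
        run_assertion()
```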


@@ -149,7 +149,7 @@ jobs:
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ubuntu-20.04
     container:
-      image: dongjoon/apache-spark-github-action-image:20210602
+      image: dongjoon/apache-spark-github-action-image:20210730
     strategy:
       fail-fast: false
       matrix:
@@ -227,8 +227,6 @@
     # Run the tests.
     - name: Run tests
       run: |
-        # TODO(SPARK-36345): Install mlflow>=1.0 and sklearn in Python 3.9 of the base image
-        python3.9 -m pip install 'mlflow>=1.0' sklearn
         export PATH=$PATH:$HOME/miniconda/bin
         ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
     - name: Upload test results to report


@@ -20,6 +20,7 @@ A wrapper for GroupedData to behave similar to pandas GroupBy.
 """
 from abc import ABCMeta, abstractmethod
+import builtins
 import sys
 import inspect
 from collections import OrderedDict, namedtuple
@@ -43,6 +44,7 @@ from typing import (
     TYPE_CHECKING,
 )
 
+import numpy as np
 import pandas as pd
 from pandas.api.types import is_hashable, is_list_like
@@ -102,6 +104,12 @@ if TYPE_CHECKING:
 # to keep it the same as pandas
 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
 
+_builtin_table = {
+    builtins.sum: np.sum,
+    builtins.max: np.max,
+    builtins.min: np.min,
+}  # type: Dict[Callable, Callable]
+
 
 class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
     """


@@ -190,8 +190,12 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         self.assert_eq(pser.astype(str), psser.astype(str))
         self.assert_eq(pser.astype(bool), psser.astype(bool))
         self.assert_eq(pser.astype("category"), psser.astype("category"))
+
         cat_type = CategoricalDtype(categories=[3, 1, 2])
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
         else:
             self.assert_eq(pd.Series(data).astype(cat_type), psser.astype(cat_type))


@@ -1478,20 +1478,25 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
         psidx2 = ps.from_pandas(pidx2)
 
         self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
-        self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
         self.assert_eq(
             psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True
         )
-        self.assert_eq(
-            psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
-            pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
-            almost=True,
-        )
         self.assert_eq(
             psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])),
             pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])),
             almost=True,
         )
+
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
+            self.assert_eq(
+                psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
+                pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
+                almost=True,
+            )
+
         self.assert_eq(
             psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
             pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
@@ -1508,6 +1513,10 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
         psmidx3 = ps.from_pandas(pmidx3)
         psmidx4 = ps.from_pandas(pmidx4)
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
             self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
             self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
@@ -1529,9 +1538,12 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
             pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
         )
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
         # Testing if the result is correct after sort=False.
         # The `sort` argument is added in pandas 0.24.
-        if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
+        elif LooseVersion(pd.__version__) >= LooseVersion("0.24"):
             self.assert_eq(
                 psmidx1.union(psmidx2, sort=False).sort_values(),
                 pmidx1.union(pmidx2, sort=False).sort_values(),


@@ -176,7 +176,10 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
         self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 kcidx.astype(CategoricalDtype(["b", "c", "a"])),
                 pcidx.astype(CategoricalDtype(["b", "c", "a"])),


@@ -73,6 +73,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.categories = ["z", "y", "x"]
         psser.cat.categories = ["z", "y", "x"]
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -91,6 +95,10 @@
         pser.cat.add_categories(4, inplace=True)
         psser.cat.add_categories(4, inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -115,6 +123,10 @@
         pser.cat.remove_categories(2, inplace=True)
         psser.cat.remove_categories(2, inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -138,6 +150,10 @@
         pser.cat.remove_unused_categories(inplace=True)
         psser.cat.remove_unused_categories(inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -164,11 +180,19 @@
         pser.cat.reorder_categories([1, 2, 3], inplace=True)
         psser.cat.reorder_categories([1, 2, 3], inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
 
         pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
         psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -189,6 +213,10 @@
         pser.cat.as_ordered(inplace=True)
         psser.cat.as_ordered(inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -215,7 +243,10 @@
         self.assert_eq(kcser.astype("category"), pcser.astype("category"))
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 kcser.astype(CategoricalDtype(["b", "c", "a"])),
                 pcser.astype(CategoricalDtype(["b", "c", "a"])),
@@ -419,7 +450,10 @@
         def astype(x) -> ps.Series[dtype]:
             return x.astype(dtype)
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
                 pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
@@ -637,16 +671,28 @@
         pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
         psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
 
         pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
         psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
 
         pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
         psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
@@ -717,11 +763,19 @@
             pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
             psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
         )
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
 
         pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
         psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(pser, psser)
             self.assert_eq(pdf, psdf)
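A note on the hunks above: every disabled assertion follows a `cat.*(..., inplace=True)` call, and pandas 1.3 appears to start deprecating the `inplace` parameter on these categorical accessor methods. A small sketch of the non-inplace equivalents such tests could eventually migrate to (an assumption on my part, not part of this PR):

```python
import pandas as pd

pser = pd.Series([1, 2, 3], dtype="category")

# Instead of pser.cat.add_categories(4, inplace=True):
pser = pser.cat.add_categories(4)

# Instead of pser.cat.rename_categories([0, 1, 3, 2], inplace=True):
pser = pser.cat.rename_categories([0, 1, 3, 2])

print(pser.cat.categories.tolist())  # [0, 1, 3, 2]
```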


@@ -145,6 +145,11 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils):
         pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]})
         psdf = ps.from_pandas(pdf)
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(
                 getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(),
                 getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(),
@@ -157,6 +162,7 @@
                 getattr(psdf.groupby(psdf.a + 1).expanding(2), f)().sort_index(),
                 getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(),
             )
+
         self.assert_eq(
             getattr(psdf.b.groupby(psdf.a).expanding(2), f)().sort_index(),
             getattr(pdf.b.groupby(pdf.a).expanding(2), f)().sort_index(),
@@ -174,6 +180,11 @@
         columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
         pdf.columns = columns
         psdf.columns = columns
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(
                 getattr(psdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
                 getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index(),


@@ -52,10 +52,15 @@ class OpsOnDiffFramesGroupByExpandingTest(PandasOnSparkTestCase, TestUtils):
         psdf = ps.from_pandas(pdf)
         kkey = ps.from_pandas(pkey)
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(
                 getattr(psdf.groupby(kkey).expanding(2), f)().sort_index(),
                 getattr(pdf.groupby(pkey).expanding(2), f)().sort_index(),
             )
+
         self.assert_eq(
             getattr(psdf.groupby(kkey)["b"].expanding(2), f)().sort_index(),
             getattr(pdf.groupby(pkey)["b"].expanding(2), f)().sort_index(),


@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from distutils.version import LooseVersion
 
 import pandas as pd
@@ -49,10 +50,15 @@ class OpsOnDiffFramesGroupByRollingTest(PandasOnSparkTestCase, TestUtils):
         psdf = ps.from_pandas(pdf)
         kkey = ps.from_pandas(pkey)
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(
                 getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(),
                 getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(),
             )
+
         self.assert_eq(
             getattr(psdf.groupby(kkey)["b"].rolling(2), f)().sort_index(),
             getattr(pdf.groupby(pkey)["b"].rolling(2), f)().sort_index(),


@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
@@ -110,6 +111,11 @@ class RollingTest(PandasOnSparkTestCase, TestUtils):
         pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]})
         psdf = ps.from_pandas(pdf)
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(
                 getattr(psdf.groupby(psdf.a).rolling(2), f)().sort_index(),
                 getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index(),
@@ -122,6 +128,7 @@
                 getattr(psdf.groupby(psdf.a + 1).rolling(2), f)().sort_index(),
                 getattr(pdf.groupby(pdf.a + 1).rolling(2), f)().sort_index(),
             )
+
         self.assert_eq(
             getattr(psdf.b.groupby(psdf.a).rolling(2), f)().sort_index(),
             getattr(pdf.b.groupby(pdf.a).rolling(2), f)().sort_index(),
@@ -139,6 +146,11 @@
         columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
         pdf.columns = columns
         psdf.columns = columns
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
             self.assert_eq(
                 getattr(psdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
                 getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index(),


@@ -1556,6 +1556,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         if extension_object_dtypes_available:
             from pandas import StringDtype
 
+            if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+                # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+                pass
+            else:
                 self._check_extension(
                     psser.astype("M").astype("string"), pser.astype("M").astype("string")
                 )