[SPARK-35032][PYTHON] Port Koalas Index unit tests into PySpark

### What changes were proposed in this pull request?
Now that the Koalas main code has been merged into the PySpark code base (#32036), this PR ports the Koalas Index unit tests to PySpark.

### Why are the changes needed?
Currently, the pandas-on-Spark modules are not fully tested. Enabling the Index unit tests closes part of that gap.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Enabled the ported Index unit tests by adding them to the pyspark_pandas test module list (see the diff below), so they run as part of the regular PySpark test suite.
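
A single ported suite can also be run on its own with the PySpark test runner. A minimal sketch, assuming it is invoked from the Spark repository root (any other module registered in the diff below can be substituted):

    python/run-tests --testnames pyspark.pandas.tests.indexes.test_category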

Closes #32139 from xinrong-databricks/port.indexes_tests.

Authored-by: Xinrong Meng <xinrong.meng@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Committed 2021-04-16 08:53:30 +09:00
parent ba92de0ae5
commit 4aee19efb4
6 changed files with 2789 additions and 31 deletions

dev/sparktestsupport/modules.py

@@ -611,43 +611,47 @@ pyspark_pandas = Module(
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
         # unittests
-        "pyspark.pandas.tests.test_dataframe",
-        "pyspark.pandas.tests.test_config",
-        "pyspark.pandas.tests.test_default_index",
-        "pyspark.pandas.tests.test_extension",
-        "pyspark.pandas.tests.test_internal",
-        "pyspark.pandas.tests.test_numpy_compat",
-        "pyspark.pandas.tests.test_typedef",
-        "pyspark.pandas.tests.test_utils",
-        "pyspark.pandas.tests.test_dataframe_conversion",
-        "pyspark.pandas.tests.test_dataframe_spark_io",
-        "pyspark.pandas.tests.test_frame_spark",
-        "pyspark.pandas.tests.test_ops_on_diff_frames",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
-        "pyspark.pandas.tests.test_series",
-        "pyspark.pandas.tests.test_series_conversion",
-        "pyspark.pandas.tests.test_series_datetime",
-        "pyspark.pandas.tests.test_series_string",
-        "pyspark.pandas.tests.test_categorical",
-        "pyspark.pandas.tests.test_csv",
-        "pyspark.pandas.tests.test_groupby",
-        "pyspark.pandas.tests.test_expanding",
-        "pyspark.pandas.tests.test_indexing",
-        "pyspark.pandas.tests.test_namespace",
-        "pyspark.pandas.tests.test_repr",
-        "pyspark.pandas.tests.test_reshape",
-        "pyspark.pandas.tests.test_rolling",
-        "pyspark.pandas.tests.test_sql",
-        "pyspark.pandas.tests.test_stats",
-        "pyspark.pandas.tests.test_window",
+        "pyspark.pandas.tests.indexes.test_base",
+        "pyspark.pandas.tests.indexes.test_category",
+        "pyspark.pandas.tests.indexes.test_datetime",
         "pyspark.pandas.tests.plot.test_frame_plot",
         "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
         "pyspark.pandas.tests.plot.test_frame_plot_plotly",
         "pyspark.pandas.tests.plot.test_series_plot",
         "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
         "pyspark.pandas.tests.plot.test_series_plot_plotly",
+        "pyspark.pandas.tests.test_categorical",
+        "pyspark.pandas.tests.test_config",
+        "pyspark.pandas.tests.test_csv",
+        "pyspark.pandas.tests.test_dataframe",
+        "pyspark.pandas.tests.test_dataframe_conversion",
+        "pyspark.pandas.tests.test_dataframe_spark_io",
+        "pyspark.pandas.tests.test_default_index",
+        "pyspark.pandas.tests.test_expanding",
+        "pyspark.pandas.tests.test_extension",
+        "pyspark.pandas.tests.test_frame_spark",
+        "pyspark.pandas.tests.test_groupby",
+        "pyspark.pandas.tests.test_indexing",
+        "pyspark.pandas.tests.test_indexops_spark",
+        "pyspark.pandas.tests.test_internal",
+        "pyspark.pandas.tests.test_namespace",
+        "pyspark.pandas.tests.test_numpy_compat",
+        "pyspark.pandas.tests.test_ops_on_diff_frames",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.test_repr",
+        "pyspark.pandas.tests.test_reshape",
+        "pyspark.pandas.tests.test_rolling",
+        "pyspark.pandas.tests.test_series",
+        "pyspark.pandas.tests.test_series_conversion",
+        "pyspark.pandas.tests.test_series_datetime",
+        "pyspark.pandas.tests.test_series_string",
+        "pyspark.pandas.tests.test_sql",
+        "pyspark.pandas.tests.test_stats",
+        "pyspark.pandas.tests.test_typedef",
+        "pyspark.pandas.tests.test_utils",
+        "pyspark.pandas.tests.test_window",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and

python/pyspark/pandas/tests/indexes/__init__.py (new file)

@@ -0,0 +1,16 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

python/pyspark/pandas/tests/indexes/test_base.py (new file; diff suppressed because it is too large)

python/pyspark/pandas/tests/indexes/test_category.py (new file)

@@ -0,0 +1,124 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from distutils.version import LooseVersion

import pandas as pd
from pandas.api.types import CategoricalDtype

import pyspark.pandas as ps
from pyspark.pandas.testing.utils import ReusedSQLTestCase, TestUtils


class CategoricalIndexTest(ReusedSQLTestCase, TestUtils):
    def test_categorical_index(self):
        pidx = pd.CategoricalIndex([1, 2, 3])
        kidx = ps.CategoricalIndex([1, 2, 3])

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

        pidx = pd.Index([1, 2, 3], dtype="category")
        kidx = ps.Index([1, 2, 3], dtype="category")

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

        pdf = pd.DataFrame(
            {
                "a": pd.Categorical([1, 2, 3, 1, 2, 3]),
                "b": pd.Categorical(["a", "b", "c", "a", "b", "c"], categories=["c", "b", "a"]),
            },
            index=pd.Categorical([10, 20, 30, 20, 30, 10], categories=[30, 10, 20], ordered=True),
        )
        kdf = ps.from_pandas(pdf)

        pidx = pdf.set_index("b").index
        kidx = kdf.set_index("b").index

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

        pidx = pdf.set_index(["a", "b"]).index.get_level_values(0)
        kidx = kdf.set_index(["a", "b"]).index.get_level_values(0)

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

    def test_astype(self):
        pidx = pd.Index(["a", "b", "c"])
        kidx = ps.from_pandas(pidx)

        self.assert_eq(kidx.astype("category"), pidx.astype("category"))
        self.assert_eq(
            kidx.astype(CategoricalDtype(["c", "a", "b"])),
            pidx.astype(CategoricalDtype(["c", "a", "b"])),
        )

        pcidx = pidx.astype(CategoricalDtype(["c", "a", "b"]))
        kcidx = kidx.astype(CategoricalDtype(["c", "a", "b"]))

        self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))

        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
            self.assert_eq(
                kcidx.astype(CategoricalDtype(["b", "c", "a"])),
                pcidx.astype(CategoricalDtype(["b", "c", "a"])),
            )
        else:
            self.assert_eq(
                kcidx.astype(CategoricalDtype(["b", "c", "a"])),
                pidx.astype(CategoricalDtype(["b", "c", "a"])),
            )

        self.assert_eq(kcidx.astype(str), pcidx.astype(str))

    def test_factorize(self):
        pidx = pd.CategoricalIndex([1, 2, 3, None])
        kidx = ps.from_pandas(pidx)

        pcodes, puniques = pidx.factorize()
        kcodes, kuniques = kidx.factorize()

        self.assert_eq(kcodes.tolist(), pcodes.tolist())
        self.assert_eq(kuniques, puniques)

        pcodes, puniques = pidx.factorize(na_sentinel=-2)
        kcodes, kuniques = kidx.factorize(na_sentinel=-2)

        self.assert_eq(kcodes.tolist(), pcodes.tolist())
        self.assert_eq(kuniques, puniques)


if __name__ == "__main__":
    import unittest

    from pyspark.pandas.tests.indexes.test_category import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)

python/pyspark/pandas/tests/indexes/test_datetime.py (new file)

@@ -0,0 +1,232 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
from distutils.version import LooseVersion

import pandas as pd

import pyspark.pandas as ps
from pyspark.pandas.testing.utils import ReusedSQLTestCase, TestUtils


class DatetimeIndexTest(ReusedSQLTestCase, TestUtils):
    @property
    def fixed_freqs(self):
        return [
            "D",
            "H",
            "T",  # min
            "S",
            "L",  # ms
            "U",  # us
            # 'N' not supported
        ]

    @property
    def non_fixed_freqs(self):
        return ["W", "Q"]

    @property
    def pidxs(self):
        return [
            pd.DatetimeIndex([0]),
            pd.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"]),
        ] + [
            pd.date_range("2000-01-01", periods=3, freq=freq)
            for freq in (self.fixed_freqs + self.non_fixed_freqs)
        ]

    @property
    def kidxs(self):
        return [ps.from_pandas(pidx) for pidx in self.pidxs]

    @property
    def idx_pairs(self):
        return list(zip(self.kidxs, self.pidxs))

    def _disallow_nanoseconds(self, f):
        self.assertRaises(ValueError, lambda: f(freq="ns"))
        self.assertRaises(ValueError, lambda: f(freq="N"))

    def test_properties(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.year, pidx.year)
            self.assert_eq(kidx.month, pidx.month)
            self.assert_eq(kidx.day, pidx.day)
            self.assert_eq(kidx.hour, pidx.hour)
            self.assert_eq(kidx.minute, pidx.minute)
            self.assert_eq(kidx.second, pidx.second)
            self.assert_eq(kidx.microsecond, pidx.microsecond)
            self.assert_eq(kidx.week, pidx.week)
            self.assert_eq(kidx.weekofyear, pidx.weekofyear)
            self.assert_eq(kidx.dayofweek, pidx.dayofweek)
            self.assert_eq(kidx.weekday, pidx.weekday)
            self.assert_eq(kidx.dayofyear, pidx.dayofyear)
            self.assert_eq(kidx.quarter, pidx.quarter)
            self.assert_eq(kidx.daysinmonth, pidx.daysinmonth)
            self.assert_eq(kidx.days_in_month, pidx.days_in_month)
            self.assert_eq(kidx.is_month_start, pd.Index(pidx.is_month_start))
            self.assert_eq(kidx.is_month_end, pd.Index(pidx.is_month_end))
            self.assert_eq(kidx.is_quarter_start, pd.Index(pidx.is_quarter_start))
            self.assert_eq(kidx.is_quarter_end, pd.Index(pidx.is_quarter_end))
            self.assert_eq(kidx.is_year_start, pd.Index(pidx.is_year_start))
            self.assert_eq(kidx.is_year_end, pd.Index(pidx.is_year_end))
            self.assert_eq(kidx.is_leap_year, pd.Index(pidx.is_leap_year))

            if LooseVersion(pd.__version__) >= LooseVersion("1.2.0"):
                self.assert_eq(kidx.day_of_year, pidx.day_of_year)
                self.assert_eq(kidx.day_of_week, pidx.day_of_week)

    def test_ceil(self):
        for kidx, pidx in self.idx_pairs:
            for freq in self.fixed_freqs:
                self.assert_eq(kidx.ceil(freq), pidx.ceil(freq))

        self._disallow_nanoseconds(self.kidxs[0].ceil)

    def test_floor(self):
        for kidx, pidx in self.idx_pairs:
            for freq in self.fixed_freqs:
                self.assert_eq(kidx.floor(freq), pidx.floor(freq))

        self._disallow_nanoseconds(self.kidxs[0].floor)

    def test_round(self):
        for kidx, pidx in self.idx_pairs:
            for freq in self.fixed_freqs:
                self.assert_eq(kidx.round(freq), pidx.round(freq))

        self._disallow_nanoseconds(self.kidxs[0].round)

    def test_day_name(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.day_name(), pidx.day_name())

    def test_month_name(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.month_name(), pidx.month_name())

    def test_normalize(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.normalize(), pidx.normalize())

    def test_strftime(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(
                kidx.strftime(date_format="%B %d, %Y"), pidx.strftime(date_format="%B %d, %Y")
            )

    def test_indexer_between_time(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00").sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00")),
            )

            self.assert_eq(
                kidx.indexer_between_time(
                    datetime.time(0, 0, 0), datetime.time(0, 1, 0)
                ).sort_values(),
                pd.Index(pidx.indexer_between_time(datetime.time(0, 0, 0), datetime.time(0, 1, 0))),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", True, False).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", True, False)),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", False, True).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", False, True)),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", False, False).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", False, False)),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", True, True).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", True, True)),
            )

    def test_indexer_at_time(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(
                kidx.indexer_at_time("00:00:00").sort_values(),
                pd.Index(pidx.indexer_at_time("00:00:00")),
            )

            self.assert_eq(
                kidx.indexer_at_time(datetime.time(0, 1, 0)).sort_values(),
                pd.Index(pidx.indexer_at_time(datetime.time(0, 1, 0))),
            )

            self.assert_eq(
                kidx.indexer_at_time("00:00:01").sort_values(),
                pd.Index(pidx.indexer_at_time("00:00:01")),
            )

        self.assertRaises(
            NotImplementedError,
            lambda: ps.DatetimeIndex([0]).indexer_at_time("00:00:00", asof=True),
        )

    def test_arithmetic_op_exceptions(self):
        for kidx, pidx in self.idx_pairs:
            py_datetime = pidx.to_pydatetime()
            for other in [1, 0.1, kidx, kidx.to_series().reset_index(drop=True), py_datetime]:
                expected_err_msg = "addition can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx + other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other + kidx)

                expected_err_msg = "multiplication can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx * other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other * kidx)

                expected_err_msg = "division can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx / other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other / kidx)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx // other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other // kidx)

                expected_err_msg = "modulo can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx % other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % kidx)

            expected_err_msg = "datetime subtraction can only be applied to datetime series."

            for other in [1, 0.1]:
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx - other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other - kidx)

            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx - other)
            self.assertRaises(NotImplementedError, lambda: py_datetime - kidx)


if __name__ == "__main__":
    import unittest

    from pyspark.pandas.tests.indexes.test_datetime import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)

python/pyspark/pandas/tests/test_indexops_spark.py (new file)

@@ -0,0 +1,74 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pandas as pd

from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F

from pyspark import pandas as ps
from pyspark.pandas.testing.utils import ReusedSQLTestCase, SQLTestUtils


class SparkIndexOpsMethodsTest(ReusedSQLTestCase, SQLTestUtils):
    @property
    def pser(self):
        return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")

    @property
    def kser(self):
        return ps.from_pandas(self.pser)

    def test_series_transform_negative(self):
        with self.assertRaisesRegex(
            ValueError, "The output of the function.* pyspark.sql.Column.*int"
        ):
            self.kser.spark.transform(lambda scol: 1)

        with self.assertRaisesRegex(AnalysisException, "cannot resolve.*non-existent.*"):
            self.kser.spark.transform(lambda scol: F.col("non-existent"))

    def test_multiindex_transform_negative(self):
        with self.assertRaisesRegex(
            NotImplementedError, "MultiIndex does not support spark.transform yet"
        ):
            midx = pd.MultiIndex(
                [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
                [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 1, 1, 1, 2, 1, 2, 2]],
            )
            s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
            s.index.spark.transform(lambda scol: scol)

    def test_series_apply_negative(self):
        with self.assertRaisesRegex(
            ValueError, "The output of the function.* pyspark.sql.Column.*int"
        ):
            self.kser.spark.apply(lambda scol: 1)

        with self.assertRaisesRegex(AnalysisException, "cannot resolve.*non-existent.*"):
            self.kser.spark.transform(lambda scol: F.col("non-existent"))


if __name__ == "__main__":
    import unittest

    from pyspark.pandas.tests.test_indexops_spark import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)