[SPARK-35032][PYTHON] Port Koalas Index unit tests into PySpark

### What changes were proposed in this pull request?

Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas Index unit tests to PySpark.

### Why are the changes needed?

Currently, the pandas-on-Spark modules are not tested fully. We should enable the Index unit tests.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Enable Index unit tests.

Closes #32139 from xinrong-databricks/port.indexes_tests.

Authored-by: Xinrong Meng <xinrong.meng@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2021-04-16 08:53:30 +09:00 · 2021-04-16 08:53:30 +09:00 · 4aee19efb4
parent ba92de0ae5
commit 4aee19efb4
6 changed files with 2789 additions and 31 deletions
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -611,43 +611,47 @@ pyspark_pandas = Module(
        "pyspark.pandas.spark.utils",
        "pyspark.pandas.typedef.typehints",
        # unittests
-        "pyspark.pandas.tests.test_dataframe",
-        "pyspark.pandas.tests.test_config",
-        "pyspark.pandas.tests.test_default_index",
-        "pyspark.pandas.tests.test_extension",
-        "pyspark.pandas.tests.test_internal",
-        "pyspark.pandas.tests.test_numpy_compat",
-        "pyspark.pandas.tests.test_typedef",
-        "pyspark.pandas.tests.test_utils",
-        "pyspark.pandas.tests.test_dataframe_conversion",
-        "pyspark.pandas.tests.test_dataframe_spark_io",
-        "pyspark.pandas.tests.test_frame_spark",
-        "pyspark.pandas.tests.test_ops_on_diff_frames",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
-        "pyspark.pandas.tests.test_series",
-        "pyspark.pandas.tests.test_series_conversion",
-        "pyspark.pandas.tests.test_series_datetime",
-        "pyspark.pandas.tests.test_series_string",
-        "pyspark.pandas.tests.test_categorical",
-        "pyspark.pandas.tests.test_csv",
-        "pyspark.pandas.tests.test_groupby",
-        "pyspark.pandas.tests.test_expanding",
-        "pyspark.pandas.tests.test_indexing",
-        "pyspark.pandas.tests.test_namespace",
-        "pyspark.pandas.tests.test_repr",
-        "pyspark.pandas.tests.test_reshape",
-        "pyspark.pandas.tests.test_rolling",
-        "pyspark.pandas.tests.test_sql",
-        "pyspark.pandas.tests.test_stats",
-        "pyspark.pandas.tests.test_window",
+        "pyspark.pandas.tests.indexes.test_base",
+        "pyspark.pandas.tests.indexes.test_category",
+        "pyspark.pandas.tests.indexes.test_datetime",
        "pyspark.pandas.tests.plot.test_frame_plot",
        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
        "pyspark.pandas.tests.plot.test_series_plot",
        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_series_plot_plotly",
+        "pyspark.pandas.tests.test_categorical",
+        "pyspark.pandas.tests.test_config",
+        "pyspark.pandas.tests.test_csv",
+        "pyspark.pandas.tests.test_dataframe",
+        "pyspark.pandas.tests.test_dataframe_conversion",
+        "pyspark.pandas.tests.test_dataframe_spark_io",
+        "pyspark.pandas.tests.test_default_index",
+        "pyspark.pandas.tests.test_expanding",
+        "pyspark.pandas.tests.test_extension",
+        "pyspark.pandas.tests.test_frame_spark",
+        "pyspark.pandas.tests.test_groupby",
+        "pyspark.pandas.tests.test_indexing",
+        "pyspark.pandas.tests.test_indexops_spark",
+        "pyspark.pandas.tests.test_internal",
+        "pyspark.pandas.tests.test_namespace",
+        "pyspark.pandas.tests.test_numpy_compat",
+        "pyspark.pandas.tests.test_ops_on_diff_frames",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.test_repr",
+        "pyspark.pandas.tests.test_reshape",
+        "pyspark.pandas.tests.test_rolling",
+        "pyspark.pandas.tests.test_series",
+        "pyspark.pandas.tests.test_series_conversion",
+        "pyspark.pandas.tests.test_series_datetime",
+        "pyspark.pandas.tests.test_series_string",
+        "pyspark.pandas.tests.test_sql",
+        "pyspark.pandas.tests.test_stats",
+        "pyspark.pandas.tests.test_typedef",
+        "pyspark.pandas.tests.test_utils",
+        "pyspark.pandas.tests.test_window",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
--- a/python/pyspark/pandas/tests/indexes/__init__.py
+++ b/python/pyspark/pandas/tests/indexes/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -0,0 +1,124 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from distutils.version import LooseVersion
+
+import pandas as pd
+from pandas.api.types import CategoricalDtype
+
+import pyspark.pandas as ps
+from pyspark.pandas.testing.utils import ReusedSQLTestCase, TestUtils
+
+
+class CategoricalIndexTest(ReusedSQLTestCase, TestUtils):
+    """Compare pandas-on-Spark categorical indexes with their pandas counterparts."""
+
+    def _check_categorical(self, kidx, pidx):
+        # Shared comparison used by test_categorical_index: the index itself plus
+        # its categories, codes and ordered flag. pidx.codes is wrapped in
+        # pd.Index before it is compared with kidx.codes.
+        self.assert_eq(kidx, pidx)
+        self.assert_eq(kidx.categories, pidx.categories)
+        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
+        self.assert_eq(kidx.ordered, pidx.ordered)
+
+    def test_categorical_index(self):
+        # Built through the dedicated CategoricalIndex constructors.
+        expected = pd.CategoricalIndex([1, 2, 3])
+        actual = ps.CategoricalIndex([1, 2, 3])
+        self._check_categorical(actual, expected)
+
+        # Built through the generic Index constructors with a category dtype.
+        expected = pd.Index([1, 2, 3], dtype="category")
+        actual = ps.Index([1, 2, 3], dtype="category")
+        self._check_categorical(actual, expected)
+
+        row_labels = pd.Categorical(
+            [10, 20, 30, 20, 30, 10], categories=[30, 10, 20], ordered=True
+        )
+        pdf = pd.DataFrame(
+            {
+                "a": pd.Categorical([1, 2, 3, 1, 2, 3]),
+                "b": pd.Categorical(["a", "b", "c", "a", "b", "c"], categories=["c", "b", "a"]),
+            },
+            index=row_labels,
+        )
+        kdf = ps.from_pandas(pdf)
+
+        # Index produced by promoting a categorical column.
+        expected = pdf.set_index("b").index
+        actual = kdf.set_index("b").index
+        self._check_categorical(actual, expected)
+
+        # First level taken from a two-level index of categorical columns.
+        expected = pdf.set_index(["a", "b"]).index.get_level_values(0)
+        actual = kdf.set_index(["a", "b"]).index.get_level_values(0)
+        self._check_categorical(actual, expected)
+
+    def test_astype(self):
+        pidx = pd.Index(["a", "b", "c"])
+        kidx = ps.from_pandas(pidx)
+
+        cab_dtype = CategoricalDtype(["c", "a", "b"])
+        bca_dtype = CategoricalDtype(["b", "c", "a"])
+
+        # Plain index -> categorical, with inferred and explicit categories.
+        self.assert_eq(kidx.astype("category"), pidx.astype("category"))
+        self.assert_eq(kidx.astype(cab_dtype), pidx.astype(cab_dtype))
+
+        pcidx = pidx.astype(cab_dtype)
+        kcidx = kidx.astype(cab_dtype)
+
+        # Categorical -> "category" again.
+        self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))
+
+        # Categorical -> categorical with a different category order. For pandas
+        # older than 1.2 the expected value is derived from the plain index
+        # rather than the categorical one.
+        # NOTE(review): presumably works around older pandas behavior - confirm.
+        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+            expected = pcidx.astype(bca_dtype)
+        else:
+            expected = pidx.astype(bca_dtype)
+        self.assert_eq(kcidx.astype(bca_dtype), expected)
+
+        # Categorical -> str.
+        self.assert_eq(kcidx.astype(str), pcidx.astype(str))
+
+    def test_factorize(self):
+        pidx = pd.CategoricalIndex([1, 2, 3, None])
+        kidx = ps.from_pandas(pidx)
+
+        # First with the default arguments, then with an explicit na_sentinel.
+        for kwargs in ({}, {"na_sentinel": -2}):
+            pcodes, puniques = pidx.factorize(**kwargs)
+            kcodes, kuniques = kidx.factorize(**kwargs)
+
+            self.assert_eq(kcodes.tolist(), pcodes.tolist())
+            self.assert_eq(kuniques, puniques)
+
+
+if __name__ == "__main__":
+    import unittest
+    from pyspark.pandas.tests.indexes.test_category import *  # noqa: F401
+
+    # Write XML reports under target/test-reports when xmlrunner is installed;
+    # otherwise pass None so unittest picks its default runner.
+    try:
+        import xmlrunner  # type: ignore[import]
+    except ImportError:
+        runner = None
+    else:
+        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    unittest.main(testRunner=runner, verbosity=2)
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -0,0 +1,232 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+
+from distutils.version import LooseVersion
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.pandas.testing.utils import ReusedSQLTestCase, TestUtils
+
+
+class DatetimeIndexTest(ReusedSQLTestCase, TestUtils):
+    """Check pandas-on-Spark DatetimeIndex results against pandas DatetimeIndex."""
+
+    @property
+    def fixed_freqs(self):
+        """Frequency aliases used by the ceil/floor/round tests."""
+        return [
+            "D",
+            "H",
+            "T",  # min
+            "S",
+            "L",  # ms
+            "U",  # us
+            # 'N' not supported
+        ]
+
+    @property
+    def non_fixed_freqs(self):
+        """Frequency aliases used only to build extra sample indexes."""
+        return ["W", "Q"]
+
+    @property
+    def pidxs(self):
+        """Sample pandas indexes: two literal ones plus one date_range per frequency."""
+        return [
+            pd.DatetimeIndex([0]),
+            pd.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"]),
+        ] + [
+            pd.date_range("2000-01-01", periods=3, freq=freq)
+            for freq in (self.fixed_freqs + self.non_fixed_freqs)
+        ]
+
+    @property
+    def kidxs(self):
+        """pandas-on-Spark versions of ``pidxs``, converted anew on every access."""
+        return [ps.from_pandas(pidx) for pidx in self.pidxs]
+
+    @property
+    def idx_pairs(self):
+        """List of (pandas-on-Spark index, pandas index) pairs in matching order."""
+        return list(zip(self.kidxs, self.pidxs))
+
+    def _disallow_nanoseconds(self, f):
+        """Assert that ``f`` raises ValueError for both nanosecond aliases."""
+        self.assertRaises(ValueError, lambda: f(freq="ns"))
+        self.assertRaises(ValueError, lambda: f(freq="N"))
+
+    def test_properties(self):
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(kidx.year, pidx.year)
+            self.assert_eq(kidx.month, pidx.month)
+            self.assert_eq(kidx.day, pidx.day)
+            self.assert_eq(kidx.hour, pidx.hour)
+            self.assert_eq(kidx.minute, pidx.minute)
+            self.assert_eq(kidx.second, pidx.second)
+            self.assert_eq(kidx.microsecond, pidx.microsecond)
+            self.assert_eq(kidx.week, pidx.week)
+            self.assert_eq(kidx.weekofyear, pidx.weekofyear)
+            self.assert_eq(kidx.dayofweek, pidx.dayofweek)
+            self.assert_eq(kidx.weekday, pidx.weekday)
+            self.assert_eq(kidx.dayofyear, pidx.dayofyear)
+            self.assert_eq(kidx.quarter, pidx.quarter)
+            self.assert_eq(kidx.daysinmonth, pidx.daysinmonth)
+            self.assert_eq(kidx.days_in_month, pidx.days_in_month)
+            # The pandas boolean properties are wrapped in pd.Index before comparison.
+            self.assert_eq(kidx.is_month_start, pd.Index(pidx.is_month_start))
+            self.assert_eq(kidx.is_month_end, pd.Index(pidx.is_month_end))
+            self.assert_eq(kidx.is_quarter_start, pd.Index(pidx.is_quarter_start))
+            self.assert_eq(kidx.is_quarter_end, pd.Index(pidx.is_quarter_end))
+            self.assert_eq(kidx.is_year_start, pd.Index(pidx.is_year_start))
+            self.assert_eq(kidx.is_year_end, pd.Index(pidx.is_year_end))
+            self.assert_eq(kidx.is_leap_year, pd.Index(pidx.is_leap_year))
+
+            # These two aliases are only checked on pandas 1.2.0 or newer.
+            if LooseVersion(pd.__version__) >= LooseVersion("1.2.0"):
+                self.assert_eq(kidx.day_of_year, pidx.day_of_year)
+                self.assert_eq(kidx.day_of_week, pidx.day_of_week)
+
+    def test_ceil(self):
+        for kidx, pidx in self.idx_pairs:
+            for freq in self.fixed_freqs:
+                self.assert_eq(kidx.ceil(freq), pidx.ceil(freq))
+
+        self._disallow_nanoseconds(self.kidxs[0].ceil)
+
+    def test_floor(self):
+        for kidx, pidx in self.idx_pairs:
+            for freq in self.fixed_freqs:
+                self.assert_eq(kidx.floor(freq), pidx.floor(freq))
+
+        self._disallow_nanoseconds(self.kidxs[0].floor)
+
+    def test_round(self):
+        for kidx, pidx in self.idx_pairs:
+            for freq in self.fixed_freqs:
+                self.assert_eq(kidx.round(freq), pidx.round(freq))
+
+        self._disallow_nanoseconds(self.kidxs[0].round)
+
+    def test_day_name(self):
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(kidx.day_name(), pidx.day_name())
+
+    def test_month_name(self):
+        # Must call month_name(); comparing day_name() here would only repeat
+        # test_day_name and leave month_name() untested.
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(kidx.month_name(), pidx.month_name())
+
+    def test_normalize(self):
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(kidx.normalize(), pidx.normalize())
+
+    def test_strftime(self):
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(
+                kidx.strftime(date_format="%B %d, %Y"), pidx.strftime(date_format="%B %d, %Y")
+            )
+
+    def test_indexer_between_time(self):
+        # The pandas-on-Spark result is sorted before it is compared with the
+        # pandas positions wrapped in pd.Index.
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(
+                kidx.indexer_between_time("00:00:00", "00:01:00").sort_values(),
+                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00")),
+            )
+
+            self.assert_eq(
+                kidx.indexer_between_time(
+                    datetime.time(0, 0, 0), datetime.time(0, 1, 0)
+                ).sort_values(),
+                pd.Index(pidx.indexer_between_time(datetime.time(0, 0, 0), datetime.time(0, 1, 0))),
+            )
+
+            # All four combinations of the two trailing boolean arguments.
+            self.assert_eq(
+                kidx.indexer_between_time("00:00:00", "00:01:00", True, False).sort_values(),
+                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", True, False)),
+            )
+
+            self.assert_eq(
+                kidx.indexer_between_time("00:00:00", "00:01:00", False, True).sort_values(),
+                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", False, True)),
+            )
+
+            self.assert_eq(
+                kidx.indexer_between_time("00:00:00", "00:01:00", False, False).sort_values(),
+                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", False, False)),
+            )
+
+            self.assert_eq(
+                kidx.indexer_between_time("00:00:00", "00:01:00", True, True).sort_values(),
+                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", True, True)),
+            )
+
+    def test_indexer_at_time(self):
+        for kidx, pidx in self.idx_pairs:
+            self.assert_eq(
+                kidx.indexer_at_time("00:00:00").sort_values(),
+                pd.Index(pidx.indexer_at_time("00:00:00")),
+            )
+
+            self.assert_eq(
+                kidx.indexer_at_time(datetime.time(0, 1, 0)).sort_values(),
+                pd.Index(pidx.indexer_at_time(datetime.time(0, 1, 0))),
+            )
+
+            self.assert_eq(
+                kidx.indexer_at_time("00:00:01").sort_values(),
+                pd.Index(pidx.indexer_at_time("00:00:01")),
+            )
+
+        # asof=True is expected to be rejected.
+        self.assertRaises(
+            NotImplementedError,
+            lambda: ps.DatetimeIndex([0]).indexer_at_time("00:00:00", asof=True),
+        )
+
+    def test_arithmetic_op_exceptions(self):
+        # The lambdas below capture loop variables, which is safe only because
+        # assertRaisesRegex invokes each one immediately.
+        for kidx, pidx in self.idx_pairs:
+            py_datetime = pidx.to_pydatetime()
+            for other in [1, 0.1, kidx, kidx.to_series().reset_index(drop=True), py_datetime]:
+                expected_err_msg = "addition can not be applied to date times."
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx + other)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other + kidx)
+
+                expected_err_msg = "multiplication can not be applied to date times."
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx * other)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other * kidx)
+
+                expected_err_msg = "division can not be applied to date times."
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx / other)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other / kidx)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx // other)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other // kidx)
+
+                expected_err_msg = "modulo can not be applied to date times."
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx % other)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % kidx)
+
+            expected_err_msg = "datetime subtraction can only be applied to datetime series."
+
+            for other in [1, 0.1]:
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx - other)
+                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other - kidx)
+
+            # NOTE(review): `other` still holds the last loop value (0.1) here, so
+            # this repeats a check made inside the loop; a different operand may
+            # have been intended - confirm before changing.
+            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx - other)
+            self.assertRaises(NotImplementedError, lambda: py_datetime - kidx)
+
+
+if __name__ == "__main__":
+    import unittest
+    from pyspark.pandas.tests.indexes.test_datetime import *  # noqa: F401
+
+    # Write XML reports under target/test-reports when xmlrunner is installed;
+    # otherwise pass None so unittest picks its default runner.
+    try:
+        import xmlrunner  # type: ignore[import]
+    except ImportError:
+        runner = None
+    else:
+        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    unittest.main(testRunner=runner, verbosity=2)
--- a/python/pyspark/pandas/tests/test_indexops_spark.py
+++ b/python/pyspark/pandas/tests/test_indexops_spark.py
@@ -0,0 +1,74 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pandas as pd
+from pyspark.sql.utils import AnalysisException
+from pyspark.sql import functions as F
+
+from pyspark import pandas as ps
+from pyspark.pandas.testing.utils import ReusedSQLTestCase, SQLTestUtils
+
+
+class SparkIndexOpsMethodsTest(ReusedSQLTestCase, SQLTestUtils):
+    """Negative tests for the ``spark.transform`` / ``spark.apply`` accessors."""
+
+    @property
+    def pser(self):
+        """Small named pandas Series used as the fixture."""
+        return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
+
+    @property
+    def kser(self):
+        """pandas-on-Spark version of ``pser``, converted anew on every access."""
+        return ps.from_pandas(self.pser)
+
+    def test_series_transform_negative(self):
+        # The function returns a plain int instead of a Spark Column.
+        with self.assertRaisesRegex(
+            ValueError, "The output of the function.* pyspark.sql.Column.*int"
+        ):
+            self.kser.spark.transform(lambda scol: 1)
+
+        # The function returns a Column that refers to a column that does not exist.
+        with self.assertRaisesRegex(AnalysisException, "cannot resolve.*non-existent.*"):
+            self.kser.spark.transform(lambda scol: F.col("non-existent"))
+
+    def test_multiindex_transform_negative(self):
+        # Build the fixture outside the assertion context so that only the
+        # spark.transform call itself can satisfy the expected error.
+        midx = pd.MultiIndex(
+            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
+            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 1, 1, 1, 2, 1, 2, 2]],
+        )
+        s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "MultiIndex does not support spark.transform yet"
+        ):
+            s.index.spark.transform(lambda scol: scol)
+
+    def test_series_apply_negative(self):
+        # The function returns a plain int instead of a Spark Column.
+        with self.assertRaisesRegex(
+            ValueError, "The output of the function.* pyspark.sql.Column.*int"
+        ):
+            self.kser.spark.apply(lambda scol: 1)
+
+        # Exercise spark.apply here as well; calling spark.transform would only
+        # repeat test_series_transform_negative.
+        with self.assertRaisesRegex(AnalysisException, "cannot resolve.*non-existent.*"):
+            self.kser.spark.apply(lambda scol: F.col("non-existent"))
+
+
+if __name__ == "__main__":
+    import unittest
+    from pyspark.pandas.tests.test_indexops_spark import *  # noqa: F401
+
+    # Write XML reports under target/test-reports when xmlrunner is installed;
+    # otherwise pass None so unittest picks its default runner.
+    try:
+        import xmlrunner  # type: ignore[import]
+    except ImportError:
+        runner = None
+    else:
+        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    unittest.main(testRunner=runner, verbosity=2)