[SPARK-36506][PYTHON] Improve test coverage for series.py and indexes/*.py

### What changes were proposed in this pull request?

This PR proposes improving test coverage for pandas-on-Spark Series & Index code base, which is written in `series.py` and `indexes/*.py` separately.

This PR did the following to improve coverage:
- Add unittest for untested code
- Fix unittest which is not tested properly
- Remove unused code

**NOTE**: This PR does not include only test-only updates; for example, it also adds new deprecation warnings for `__xor__`, `__and__`, and `__or__`.

### Why are the changes needed?

To make the project healthier by improving coverage.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Unittest.

Closes #33844 from itholic/SPARK-36506.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
itholic 2021-09-23 14:23:52 +09:00 committed by Hyukjin Kwon
parent 4ea54e8672
commit 5268904742
6 changed files with 122 additions and 7 deletions

View file

@ -2061,8 +2061,6 @@ class Index(IndexOpsMixin):
) )
if is_other_list_of_tuples: if is_other_list_of_tuples:
other = MultiIndex.from_tuples(other) # type: ignore other = MultiIndex.from_tuples(other) # type: ignore
elif isinstance(other, Series):
other = Index(other)
else: else:
raise TypeError("other must be a MultiIndex or a list of tuples") raise TypeError("other must be a MultiIndex or a list of tuples")
@ -2601,7 +2599,31 @@ class Index(IndexOpsMixin):
def __iter__(self) -> Iterator: def __iter__(self) -> Iterator:
return MissingPandasLikeIndex.__iter__(self) return MissingPandasLikeIndex.__iter__(self)
def __and__(self, other: "Index") -> "Index":
warnings.warn(
"Index.__and__ operating as a set operation is deprecated, "
"in the future this will be a logical operation matching Series.__and__. "
"Use index.intersection(other) instead",
FutureWarning,
)
return self.intersection(other)
def __or__(self, other: "Index") -> "Index":
warnings.warn(
"Index.__or__ operating as a set operation is deprecated, "
"in the future this will be a logical operation matching Series.__or__. "
"Use index.union(other) instead",
FutureWarning,
)
return self.union(other)
def __xor__(self, other: "Index") -> "Index": def __xor__(self, other: "Index") -> "Index":
warnings.warn(
"Index.__xor__ operating as a set operation is deprecated, "
"in the future this will be a logical operation matching Series.__xor__. "
"Use index.symmetric_difference(other) instead",
FutureWarning,
)
return self.symmetric_difference(other) return self.symmetric_difference(other)
def __rxor__(self, other: Any) -> "Index": def __rxor__(self, other: Any) -> "Index":

View file

@ -1963,7 +1963,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
if not isinstance(value, (float, int, str, bool)): if not isinstance(value, (float, int, str, bool)):
raise TypeError("Unsupported type %s" % type(value).__name__) raise TypeError("Unsupported type %s" % type(value).__name__)
if limit is not None: if limit is not None:
raise ValueError("limit parameter for value is not support now") raise NotImplementedError("limit parameter for value is not support now")
scol = F.when(cond, value).otherwise(scol) scol = F.when(cond, value).otherwise(scol)
else: else:
if method in ["ffill", "pad"]: if method in ["ffill", "pad"]:
@ -3597,7 +3597,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
raise ValueError(msg) raise ValueError(msg)
if self._internal.index_level > 1: if self._internal.index_level > 1:
raise ValueError("rank do not support index now") raise NotImplementedError("rank do not support MultiIndex now")
if ascending: if ascending:
asc_func = lambda scol: scol.asc() asc_func = lambda scol: scol.asc()
@ -6258,7 +6258,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
""" """
axis = validate_axis(axis) axis = validate_axis(axis)
if axis == 1: if axis == 1:
raise ValueError("Series does not support columns axis.") raise NotImplementedError("Series does not support columns axis.")
scol = sfun(self) scol = sfun(self)

View file

@ -405,6 +405,14 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
(psidx1 + 1).symmetric_difference(psidx2).sort_values(), (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
(pidx1 + 1).symmetric_difference(pidx2).sort_values(), (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
) )
self.assert_eq(
(psidx1 ^ psidx2).sort_values(),
(pidx1 ^ pidx2).sort_values(),
)
self.assert_eq(
psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
)
pmidx1 = pd.MultiIndex( pmidx1 = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]], [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
@ -1370,6 +1378,11 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
with self.assertRaisesRegex(
IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
):
psidx.unique(level=-2)
def test_asof(self): def test_asof(self):
# Increasing values # Increasing values
pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"]) pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"])
@ -1895,6 +1908,8 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
psmidx.intersection(4) psmidx.intersection(4)
with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
psmidx.intersection(ps.Series([3, 4, 5, 6])) psmidx.intersection(ps.Series([3, 4, 5, 6]))
with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
psmidx.intersection([("c", "z"), ["d", "w"]])
with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"):
psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"):
@ -2186,6 +2201,10 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
with self.assertRaisesRegex(IndexError, err_msg): with self.assertRaisesRegex(IndexError, err_msg):
psmidx.insert(4, ("b", "y")) psmidx.insert(4, ("b", "y"))
err_msg = "index -4 is out of bounds for axis 0 with size 3"
with self.assertRaisesRegex(IndexError, err_msg):
psmidx.insert(-4, ("b", "y"))
def test_astype(self): def test_astype(self):
pidx = pd.Index([10, 20, 15, 30, 45], name="x") pidx = pd.Index([10, 20, 15, 30, 45], name="x")
psidx = ps.Index(pidx) psidx = ps.Index(pidx)
@ -2369,6 +2388,22 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}), lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}),
) )
def test_to_numpy(self):
pidx = pd.Index([1, 2, 3, 4])
psidx = ps.from_pandas(pidx)
self.assert_eq(pidx.to_numpy(copy=True), psidx.to_numpy(copy=True))
def test_drop_level(self):
tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
pmidx = pd.MultiIndex.from_tuples(tuples)
psmidx = ps.from_pandas(pmidx)
with self.assertRaisesRegex(
IndexError, "Too many levels: Index has only 2 levels, -3 is not a valid level number"
):
psmidx.droplevel(-3)
if __name__ == "__main__": if __name__ == "__main__":
from pyspark.pandas.tests.indexes.test_base import * # noqa: F401 from pyspark.pandas.tests.indexes.test_base import * # noqa: F401

View file

@ -118,7 +118,7 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq(pidx.remove_categories(None), psidx.remove_categories(None)) self.assert_eq(pidx.remove_categories(None), psidx.remove_categories(None))
self.assert_eq(pidx.remove_categories([None]), psidx.remove_categories([None])) self.assert_eq(pidx.remove_categories([None]), psidx.remove_categories([None]))
self.assertRaises(ValueError, lambda: pidx.remove_categories(4, inplace=True)) self.assertRaises(ValueError, lambda: psidx.remove_categories(4, inplace=True))
self.assertRaises(ValueError, lambda: psidx.remove_categories(4)) self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None])) self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))
@ -145,7 +145,7 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
psidx.reorder_categories([3, 2, 1], ordered=True), psidx.reorder_categories([3, 2, 1], ordered=True),
) )
self.assertRaises(ValueError, lambda: pidx.reorder_categories([1, 2, 3], inplace=True)) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 3], inplace=True))
self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2])) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2]))
self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 4])) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 4]))
self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 2])) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 2]))
@ -311,6 +311,10 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
TypeError, TypeError,
lambda: psidx.rename_categories("x"), lambda: psidx.rename_categories("x"),
) )
self.assertRaises(
ValueError,
lambda: psidx.rename_categories({"b": "B", "c": "C"}, inplace=True),
)
def test_set_categories(self): def test_set_categories(self):
pidx = pd.CategoricalIndex(["a", "b", "c", "d"]) pidx = pd.CategoricalIndex(["a", "b", "c", "d"])

View file

@ -2012,6 +2012,13 @@ class OpsOnDiffFramesDisabledTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
psser.rpow(psser_other) psser.rpow(psser_other)
def test_equals(self):
psidx1 = ps.Index([1, 2, 3, 4])
psidx2 = ps.Index([1, 2, 3, 4])
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
psidx1.equals(psidx2)
def test_combine_first(self): def test_combine_first(self):
pdf1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) pdf1 = pd.DataFrame({"A": [None, 0], "B": [4, None]})
psdf1 = ps.from_pandas(pdf1) psdf1 = ps.from_pandas(pdf1)

View file

@ -566,6 +566,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psser, pser) self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf) self.assert_eq(psdf, pdf)
with self.assertRaisesRegex(
ValueError, "Must specify a fillna 'value' or 'method' parameter."
):
psser.fillna()
def test_dropna(self): def test_dropna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]}) pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf) psdf = ps.from_pandas(pdf)
@ -1267,6 +1272,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pser.cumsum().astype(int), psser.cumsum()) self.assert_eq(pser.cumsum().astype(int), psser.cumsum())
self.assert_eq(pser.cumsum(skipna=False).astype(int), psser.cumsum(skipna=False)) self.assert_eq(pser.cumsum(skipna=False).astype(int), psser.cumsum(skipna=False))
with self.assertRaisesRegex(TypeError, r"Could not convert object \(string\) to numeric"):
ps.Series(["a", "b", "c", "d"]).cumsum()
def test_cumprod(self): def test_cumprod(self):
pser = pd.Series([1.0, None, 1.0, 4.0, 9.0]) pser = pd.Series([1.0, None, 1.0, 4.0, 9.0])
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -1305,6 +1313,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pser.cumprod(), psser.cumprod()) self.assert_eq(pser.cumprod(), psser.cumprod())
self.assert_eq(pser.cumprod(skipna=False).astype(int), psser.cumprod(skipna=False)) self.assert_eq(pser.cumprod(skipna=False).astype(int), psser.cumprod(skipna=False))
with self.assertRaisesRegex(TypeError, r"Could not convert object \(string\) to numeric"):
ps.Series(["a", "b", "c", "d"]).cumprod()
def test_median(self): def test_median(self):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a") ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")
@ -1324,6 +1335,17 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(ValueError, msg): with self.assertRaisesRegex(ValueError, msg):
psser.rank(method="nothing") psser.rank(method="nothing")
msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
with self.assertRaisesRegex(ValueError, msg):
psser.rank(method="nothing")
midx = pd.MultiIndex.from_tuples([("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")])
pser.index = midx
psser = ps.from_pandas(pser)
msg = "rank do not support MultiIndex now"
with self.assertRaisesRegex(NotImplementedError, msg):
psser.rank(method="min")
def test_round(self): def test_round(self):
pser = pd.Series([0.028208, 0.038683, 0.877076], name="x") pser = pd.Series([0.028208, 0.038683, 0.877076], name="x")
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -1345,6 +1367,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a") ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a")
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])
with self.assertRaisesRegex(
ValueError, "percentiles should all be in the interval \\[0, 1\\]"
):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=1.1)
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ps.Series(["a", "b", "c"]).quantile() ps.Series(["a", "b", "c"]).quantile()
@ -1669,6 +1695,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(KeyError, msg): with self.assertRaisesRegex(KeyError, msg):
psser.pop(("lama", "speed", "x")) psser.pop(("lama", "speed", "x"))
msg = "'key' should be string or tuple that contains strings"
with self.assertRaisesRegex(TypeError, msg):
psser.pop(["lama", "speed"])
pser = pd.Series(["a", "b", "c", "a"], dtype="category") pser = pd.Series(["a", "b", "c", "a"], dtype="category")
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -2427,6 +2457,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq((psdf["b"] * 10).dot(psdf), (pdf["b"] * 10).dot(pdf)) self.assert_eq((psdf["b"] * 10).dot(psdf), (pdf["b"] * 10).dot(pdf))
self.assert_eq((psdf["b"] * 10).dot(psdf + 1), (pdf["b"] * 10).dot(pdf + 1)) self.assert_eq((psdf["b"] * 10).dot(psdf + 1), (pdf["b"] * 10).dot(pdf + 1))
psdf_other = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["x", "y", "z"])
with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
psdf["b"].dot(psdf_other)
def test_tail(self): def test_tail(self):
pser = pd.Series(range(1000), name="Koalas") pser = pd.Series(range(1000), name="Koalas")
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -2961,6 +2995,19 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
psser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(),
) )
def test_apply(self):
psser = self.psser
def udf(col) -> ps.Series[int]:
return col + 10
with self.assertRaisesRegex(
ValueError,
r"Expected the return type of this function to be of scalar type, "
r"but found type SeriesType\[LongType\]",
):
psser.apply(udf)
def test_combine_first(self): def test_combine_first(self):
pdf = pd.DataFrame( pdf = pd.DataFrame(
{ {