[SPARK-36506][PYTHON] Improve test coverage for series.py and indexes/*.py

### What changes were proposed in this pull request?

This PR proposes improving test coverage for pandas-on-Spark Series & Index code base, which is written in `series.py` and `indexes/*.py` separately.

This PR did the following to improve coverage:
- Add unittest for untested code
- Fix unittest which is not tested properly
- Remove unused code

**NOTE**: This PR does not include only test-only updates; for example, it also adds new deprecation warnings for `__xor__`, `__and__`, and `__or__`.

### Why are the changes needed?

To make the project healthier by improving coverage.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Unittest.

Closes #33844 from itholic/SPARK-36506.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
itholic 2021-09-23 14:23:52 +09:00 committed by Hyukjin Kwon
parent 4ea54e8672
commit 5268904742
6 changed files with 122 additions and 7 deletions

View file

@ -2061,8 +2061,6 @@ class Index(IndexOpsMixin):
) )
if is_other_list_of_tuples: if is_other_list_of_tuples:
other = MultiIndex.from_tuples(other) # type: ignore other = MultiIndex.from_tuples(other) # type: ignore
elif isinstance(other, Series):
other = Index(other)
else: else:
raise TypeError("other must be a MultiIndex or a list of tuples") raise TypeError("other must be a MultiIndex or a list of tuples")
@ -2601,7 +2599,31 @@ class Index(IndexOpsMixin):
def __iter__(self) -> Iterator: def __iter__(self) -> Iterator:
return MissingPandasLikeIndex.__iter__(self) return MissingPandasLikeIndex.__iter__(self)
def __and__(self, other: "Index") -> "Index":
warnings.warn(
"Index.__and__ operating as a set operation is deprecated, "
"in the future this will be a logical operation matching Series.__and__. "
"Use index.intersection(other) instead",
FutureWarning,
)
return self.intersection(other)
def __or__(self, other: "Index") -> "Index":
warnings.warn(
"Index.__or__ operating as a set operation is deprecated, "
"in the future this will be a logical operation matching Series.__or__. "
"Use index.union(other) instead",
FutureWarning,
)
return self.union(other)
def __xor__(self, other: "Index") -> "Index": def __xor__(self, other: "Index") -> "Index":
warnings.warn(
"Index.__xor__ operating as a set operation is deprecated, "
"in the future this will be a logical operation matching Series.__xor__. "
"Use index.symmetric_difference(other) instead",
FutureWarning,
)
return self.symmetric_difference(other) return self.symmetric_difference(other)
def __rxor__(self, other: Any) -> "Index": def __rxor__(self, other: Any) -> "Index":

View file

@ -1963,7 +1963,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
if not isinstance(value, (float, int, str, bool)): if not isinstance(value, (float, int, str, bool)):
raise TypeError("Unsupported type %s" % type(value).__name__) raise TypeError("Unsupported type %s" % type(value).__name__)
if limit is not None: if limit is not None:
raise ValueError("limit parameter for value is not support now") raise NotImplementedError("limit parameter for value is not support now")
scol = F.when(cond, value).otherwise(scol) scol = F.when(cond, value).otherwise(scol)
else: else:
if method in ["ffill", "pad"]: if method in ["ffill", "pad"]:
@ -3597,7 +3597,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
raise ValueError(msg) raise ValueError(msg)
if self._internal.index_level > 1: if self._internal.index_level > 1:
raise ValueError("rank do not support index now") raise NotImplementedError("rank do not support MultiIndex now")
if ascending: if ascending:
asc_func = lambda scol: scol.asc() asc_func = lambda scol: scol.asc()
@ -6258,7 +6258,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
""" """
axis = validate_axis(axis) axis = validate_axis(axis)
if axis == 1: if axis == 1:
raise ValueError("Series does not support columns axis.") raise NotImplementedError("Series does not support columns axis.")
scol = sfun(self) scol = sfun(self)

View file

@ -405,6 +405,14 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
(psidx1 + 1).symmetric_difference(psidx2).sort_values(), (psidx1 + 1).symmetric_difference(psidx2).sort_values(),
(pidx1 + 1).symmetric_difference(pidx2).sort_values(), (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
) )
self.assert_eq(
(psidx1 ^ psidx2).sort_values(),
(pidx1 ^ pidx2).sort_values(),
)
self.assert_eq(
psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),
)
pmidx1 = pd.MultiIndex( pmidx1 = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]], [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
@ -1370,6 +1378,11 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values()) self.assert_eq(psmidx.unique().sort_values(), pmidx.unique().sort_values())
with self.assertRaisesRegex(
IndexError, "Too many levels: Index has only 1 level, -2 is not a valid level number"
):
psidx.unique(level=-2)
def test_asof(self): def test_asof(self):
# Increasing values # Increasing values
pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"]) pidx = pd.Index(["2013-12-31", "2014-01-02", "2014-01-03"])
@ -1895,6 +1908,8 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
psmidx.intersection(4) psmidx.intersection(4)
with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"): with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
psmidx.intersection(ps.Series([3, 4, 5, 6])) psmidx.intersection(ps.Series([3, 4, 5, 6]))
with self.assertRaisesRegex(TypeError, "other must be a MultiIndex or a list of tuples"):
psmidx.intersection([("c", "z"), ["d", "w"]])
with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"):
psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})) psidx.intersection(ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"): with self.assertRaisesRegex(ValueError, "Index data must be 1-dimensional"):
@ -2186,6 +2201,10 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
with self.assertRaisesRegex(IndexError, err_msg): with self.assertRaisesRegex(IndexError, err_msg):
psmidx.insert(4, ("b", "y")) psmidx.insert(4, ("b", "y"))
err_msg = "index -4 is out of bounds for axis 0 with size 3"
with self.assertRaisesRegex(IndexError, err_msg):
psmidx.insert(-4, ("b", "y"))
def test_astype(self): def test_astype(self):
pidx = pd.Index([10, 20, 15, 30, 45], name="x") pidx = pd.Index([10, 20, 15, 30, 45], name="x")
psidx = ps.Index(pidx) psidx = ps.Index(pidx)
@ -2369,6 +2388,22 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}), lambda: psidx.map({1: 1, 2: 2.0, 3: "three"}),
) )
def test_to_numpy(self):
pidx = pd.Index([1, 2, 3, 4])
psidx = ps.from_pandas(pidx)
self.assert_eq(pidx.to_numpy(copy=True), psidx.to_numpy(copy=True))
def test_drop_level(self):
tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
pmidx = pd.MultiIndex.from_tuples(tuples)
psmidx = ps.from_pandas(pmidx)
with self.assertRaisesRegex(
IndexError, "Too many levels: Index has only 2 levels, -3 is not a valid level number"
):
psmidx.droplevel(-3)
if __name__ == "__main__": if __name__ == "__main__":
from pyspark.pandas.tests.indexes.test_base import * # noqa: F401 from pyspark.pandas.tests.indexes.test_base import * # noqa: F401

View file

@ -118,7 +118,7 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq(pidx.remove_categories(None), psidx.remove_categories(None)) self.assert_eq(pidx.remove_categories(None), psidx.remove_categories(None))
self.assert_eq(pidx.remove_categories([None]), psidx.remove_categories([None])) self.assert_eq(pidx.remove_categories([None]), psidx.remove_categories([None]))
self.assertRaises(ValueError, lambda: pidx.remove_categories(4, inplace=True)) self.assertRaises(ValueError, lambda: psidx.remove_categories(4, inplace=True))
self.assertRaises(ValueError, lambda: psidx.remove_categories(4)) self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None])) self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))
@ -145,7 +145,7 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
psidx.reorder_categories([3, 2, 1], ordered=True), psidx.reorder_categories([3, 2, 1], ordered=True),
) )
self.assertRaises(ValueError, lambda: pidx.reorder_categories([1, 2, 3], inplace=True)) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 3], inplace=True))
self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2])) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2]))
self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 4])) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 4]))
self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 2])) self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 2]))
@ -311,6 +311,10 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
TypeError, TypeError,
lambda: psidx.rename_categories("x"), lambda: psidx.rename_categories("x"),
) )
self.assertRaises(
ValueError,
lambda: psidx.rename_categories({"b": "B", "c": "C"}, inplace=True),
)
def test_set_categories(self): def test_set_categories(self):
pidx = pd.CategoricalIndex(["a", "b", "c", "d"]) pidx = pd.CategoricalIndex(["a", "b", "c", "d"])

View file

@ -2012,6 +2012,13 @@ class OpsOnDiffFramesDisabledTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
psser.rpow(psser_other) psser.rpow(psser_other)
def test_equals(self):
psidx1 = ps.Index([1, 2, 3, 4])
psidx2 = ps.Index([1, 2, 3, 4])
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
psidx1.equals(psidx2)
def test_combine_first(self): def test_combine_first(self):
pdf1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) pdf1 = pd.DataFrame({"A": [None, 0], "B": [4, None]})
psdf1 = ps.from_pandas(pdf1) psdf1 = ps.from_pandas(pdf1)

View file

@ -566,6 +566,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psser, pser) self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf) self.assert_eq(psdf, pdf)
with self.assertRaisesRegex(
ValueError, "Must specify a fillna 'value' or 'method' parameter."
):
psser.fillna()
def test_dropna(self): def test_dropna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]}) pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf) psdf = ps.from_pandas(pdf)
@ -1267,6 +1272,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pser.cumsum().astype(int), psser.cumsum()) self.assert_eq(pser.cumsum().astype(int), psser.cumsum())
self.assert_eq(pser.cumsum(skipna=False).astype(int), psser.cumsum(skipna=False)) self.assert_eq(pser.cumsum(skipna=False).astype(int), psser.cumsum(skipna=False))
with self.assertRaisesRegex(TypeError, r"Could not convert object \(string\) to numeric"):
ps.Series(["a", "b", "c", "d"]).cumsum()
def test_cumprod(self): def test_cumprod(self):
pser = pd.Series([1.0, None, 1.0, 4.0, 9.0]) pser = pd.Series([1.0, None, 1.0, 4.0, 9.0])
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -1305,6 +1313,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pser.cumprod(), psser.cumprod()) self.assert_eq(pser.cumprod(), psser.cumprod())
self.assert_eq(pser.cumprod(skipna=False).astype(int), psser.cumprod(skipna=False)) self.assert_eq(pser.cumprod(skipna=False).astype(int), psser.cumprod(skipna=False))
with self.assertRaisesRegex(TypeError, r"Could not convert object \(string\) to numeric"):
ps.Series(["a", "b", "c", "d"]).cumprod()
def test_median(self): def test_median(self):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a") ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")
@ -1324,6 +1335,17 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(ValueError, msg): with self.assertRaisesRegex(ValueError, msg):
psser.rank(method="nothing") psser.rank(method="nothing")
msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
with self.assertRaisesRegex(ValueError, msg):
psser.rank(method="nothing")
midx = pd.MultiIndex.from_tuples([("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")])
pser.index = midx
psser = ps.from_pandas(pser)
msg = "rank do not support MultiIndex now"
with self.assertRaisesRegex(NotImplementedError, msg):
psser.rank(method="min")
def test_round(self): def test_round(self):
pser = pd.Series([0.028208, 0.038683, 0.877076], name="x") pser = pd.Series([0.028208, 0.038683, 0.877076], name="x")
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -1345,6 +1367,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a") ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a")
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])
with self.assertRaisesRegex(
ValueError, "percentiles should all be in the interval \\[0, 1\\]"
):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=1.1)
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ps.Series(["a", "b", "c"]).quantile() ps.Series(["a", "b", "c"]).quantile()
@ -1669,6 +1695,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(KeyError, msg): with self.assertRaisesRegex(KeyError, msg):
psser.pop(("lama", "speed", "x")) psser.pop(("lama", "speed", "x"))
msg = "'key' should be string or tuple that contains strings"
with self.assertRaisesRegex(TypeError, msg):
psser.pop(["lama", "speed"])
pser = pd.Series(["a", "b", "c", "a"], dtype="category") pser = pd.Series(["a", "b", "c", "a"], dtype="category")
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -2427,6 +2457,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq((psdf["b"] * 10).dot(psdf), (pdf["b"] * 10).dot(pdf)) self.assert_eq((psdf["b"] * 10).dot(psdf), (pdf["b"] * 10).dot(pdf))
self.assert_eq((psdf["b"] * 10).dot(psdf + 1), (pdf["b"] * 10).dot(pdf + 1)) self.assert_eq((psdf["b"] * 10).dot(psdf + 1), (pdf["b"] * 10).dot(pdf + 1))
psdf_other = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["x", "y", "z"])
with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
psdf["b"].dot(psdf_other)
def test_tail(self): def test_tail(self):
pser = pd.Series(range(1000), name="Koalas") pser = pd.Series(range(1000), name="Koalas")
psser = ps.from_pandas(pser) psser = ps.from_pandas(pser)
@ -2961,6 +2995,19 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
psser.at_time("0:20").sort_index(), psser.at_time("0:20").sort_index(),
) )
def test_apply(self):
psser = self.psser
def udf(col) -> ps.Series[int]:
return col + 10
with self.assertRaisesRegex(
ValueError,
r"Expected the return type of this function to be of scalar type, "
r"but found type SeriesType\[LongType\]",
):
psser.apply(udf)
def test_combine_first(self): def test_combine_first(self):
pdf = pd.DataFrame( pdf = pd.DataFrame(
{ {