[SPARK-36531][SPARK-36515][PYTHON] Improve test coverage for data_type_ops/* and groupby

### What changes were proposed in this pull request?

This PR proposes improving test coverage for the pandas-on-Spark data type ops and GroupBy code base, which lives in `data_type_ops/*.py` and `groupby.py`, respectively.

This PR does the following to improve coverage:
- Add unittests for untested code
- Fix unittests that did not actually test the intended behavior
- Remove unused code

**NOTE**: This PR is not a test-only change; for example, it also fixes `astype` for binary ops.

Given the following pandas-on-Spark Series:
```python
>>> psser
0    [49]
1    [50]
2    [51]
dtype: object
```
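
Byte 49 is ASCII `1`, so a Series like this can be built from binary values (an assumed minimal construction, shown for illustration only):

```python
>>> import pyspark.pandas as ps
>>> psser = ps.Series([b"1", b"2", b"3"])  # a BinaryType column under the hood
```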

before:
```python
>>> psser.astype(bool)
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: cannot resolve 'CAST(`0` AS BOOLEAN)' due to data type mismatch: cannot cast binary to boolean;
...
```

after:
```python
>>> psser.astype(bool)
0    True
1    True
2    True
dtype: bool
```

### Why are the changes needed?

To make the project healthier by improving test coverage.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Unittest.

Closes #33850 from itholic/SPARK-36531.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 71dbd03fbe)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>

`python/pyspark/pandas/data_type_ops/binary_ops.py`:

```diff
@@ -23,7 +23,6 @@ from pyspark.pandas.base import column_op, IndexOpsMixin
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
-    _as_bool_type,
     _as_categorical_type,
     _as_other_type,
     _as_string_type,
@@ -100,7 +99,9 @@ class BinaryOps(DataTypeOps):
         if isinstance(dtype, CategoricalDtype):
             return _as_categorical_type(index_ops, dtype, spark_type)
         elif isinstance(spark_type, BooleanType):
-            return _as_bool_type(index_ops, dtype)
+            # Cannot cast binary to boolean in Spark.
+            # We should cast binary to str first, and cast it to boolean.
+            return index_ops.astype(str).astype(bool)
         elif isinstance(spark_type, StringType):
             return _as_string_type(index_ops, dtype)
         else:
```

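The same two-step idea can be sketched in plain pandas (an analogy for illustration, not the internal code path): non-empty strings are truthy, so routing binary values through `str` produces the bool result that a direct binary-to-boolean cast cannot.

```python
import pandas as pd

pser = pd.Series([b"1", b"2", b"3"])  # object dtype holding bytes
print(pser.astype(str).astype(bool))  # non-empty strings are truthy
# 0    True
# 1    True
# 2    True
# dtype: bool
```
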
`python/pyspark/pandas/data_type_ops/datetime_ops.py`:

```diff
@@ -30,7 +30,6 @@ from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.base import IndexOpsMixin
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
-    _as_bool_type,
     _as_categorical_type,
     _as_other_type,
     _as_string_type,
@@ -132,7 +131,7 @@ class DatetimeOps(DataTypeOps):
         if isinstance(dtype, CategoricalDtype):
             return _as_categorical_type(index_ops, dtype, spark_type)
         elif isinstance(spark_type, BooleanType):
-            return _as_bool_type(index_ops, dtype)
+            raise TypeError("cannot astype a datetimelike from [datetime64[ns]] to [bool]")
         elif isinstance(spark_type, StringType):
             return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
         else:
```

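Raising here matches pandas' own behavior, and the message is taken from pandas. A quick standalone check (behavior as of pandas 1.x):

```python
import pandas as pd

pser = pd.Series(pd.to_datetime(["2021-01-01", "2021-01-02"]))
try:
    pser.astype(bool)
except TypeError as e:
    print(e)  # cannot astype a datetimelike from [datetime64[ns]] to [bool]
```
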
`python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py`:

```diff
@@ -162,7 +162,8 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils):
     def test_astype(self):
         pser = self.pser
         psser = self.psser
-        self.assert_eq(pd.Series(["1", "2", "3"]), psser.astype(str))
+        self.assert_eq(psser.astype(str), psser.astype(str))
+        self.assert_eq(pser.astype(bool), psser.astype(bool))
         self.assert_eq(pser.astype("category"), psser.astype("category"))
         cat_type = CategoricalDtype(categories=[b"2", b"3", b"1"])
         self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
```

`python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py`:

```diff
@@ -198,6 +198,20 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         else:
             self.assert_eq(psser.astype(cat_type), pser)
 
+        # Empty
+        pser = pd.Series([], dtype="category")
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.astype(int), psser.astype(int))
+        self.assert_eq(pser.astype(float), psser.astype(float))
+        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
+        self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
+        self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
+        self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
+        self.assert_eq(pser.astype(str), psser.astype(str))
+        self.assert_eq(pser.astype(bool), psser.astype(bool))
+        self.assert_eq(pser.astype("category"), psser.astype("category"))
+        self.assert_eq(pser.astype("category"), psser.astype("category"))
+
     def test_neg(self):
         self.assertRaises(TypeError, lambda: -self.psser)
@@ -270,6 +284,13 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         with option_context("compute.ops_on_diff_frames", True):
             self.assert_eq(pser1 == pser2, (psser1 == psser2).sort_index())
 
+            psser3 = ps.Series(pd.Categorical(list("xyzx")))
+            self.assertRaisesRegex(
+                TypeError,
+                "Categoricals can only be compared if 'categories' are the same.",
+                lambda: psser1 == psser3,
+            )
+
     def test_ne(self):
         pdf, psdf = self.pdf, self.psdf
```

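The new assertion mirrors the error pandas itself raises when comparing categoricals whose categories differ (a standalone sketch):

```python
import pandas as pd

s1 = pd.Series(pd.Categorical(list("abca")))
s2 = pd.Series(pd.Categorical(list("xyzx")))
try:
    s1 == s2  # different categories
except TypeError as e:
    print(e)  # Categoricals can only be compared if 'categories' are the same.
```
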
`python/pyspark/pandas/tests/data_type_ops/test_date_ops.py`:

```diff
@@ -212,8 +212,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
     def test_lt(self):
         pdf, psdf = self.date_pdf, self.date_psdf
-        self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"])
-        self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"])
+        self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"])
+        self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"])
 
     def test_le(self):
         pdf, psdf = self.date_pdf, self.date_psdf
```

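The corrected assertions now exercise `<` rather than `==`. Element-wise less-than on date columns behaves the same way in plain pandas (a standalone sketch; the suite's `date_pdf` fixture is analogous):

```python
import datetime
import pandas as pd

# Hypothetical two-column date frame, mirroring the "this"/"that" fixture.
pdf = pd.DataFrame({
    "this": [datetime.date(2021, 1, 1), datetime.date(2021, 1, 3)],
    "that": [datetime.date(2021, 1, 2), datetime.date(2021, 1, 2)],
})
print(pdf["this"] < pdf["that"])  # element-wise less-than on dates
# 0     True
# 1    False
# dtype: bool
```
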
`python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py`:

```diff
@@ -195,6 +195,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         cat_type = CategoricalDtype(categories=["a", "b", "c"])
         self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
 
+        self.assertRaises(TypeError, lambda: psser.astype(bool))
+
     def test_neg(self):
         self.assertRaises(TypeError, lambda: -self.psser)
```

`python/pyspark/pandas/tests/data_type_ops/test_num_ops.py`:

```diff
@@ -29,6 +29,7 @@ from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
 from pyspark.pandas.typedef.typehints import (
     extension_dtypes_available,
     extension_float_dtypes_available,
+    extension_object_dtypes_available,
 )
 from pyspark.sql.types import DecimalType, IntegralType
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
@@ -350,6 +351,10 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         self.assert_eq(
             self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category")
         )
+        if extension_object_dtypes_available and extension_float_dtypes_available:
+            pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
+            psser = ps.from_pandas(pser)
+            self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
 
     def test_neg(self):
         pdf, psdf = self.pdf, self.psdf
```

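The guarded branch only runs when pandas ships the nullable extension dtypes. A standalone sketch of the conversion being exercised (assumes pandas >= 1.2, which provides `Float64Dtype`; values are kept to 0/1 so the cast to nullable boolean is unambiguous):

```python
import pandas as pd

pser = pd.Series([0.0, 1.0, 1.0], dtype=pd.Float64Dtype())  # nullable float
print(pser.astype(pd.BooleanDtype()))  # nullable boolean result
# 0    False
# 1     True
# 2     True
# dtype: boolean
```
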
`python/pyspark/pandas/tests/test_groupby.py`:

```diff
@@ -30,7 +30,7 @@ from pyspark.pandas.missing.groupby import (
     MissingPandasLikeDataFrameGroupBy,
     MissingPandasLikeSeriesGroupBy,
 )
-from pyspark.pandas.groupby import is_multi_agg_with_relabel
+from pyspark.pandas.groupby import is_multi_agg_with_relabel, SeriesGroupBy
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
@@ -2135,6 +2135,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             psdf.a.rename().groupby(psdf.b.rename()).transform(lambda x: x + x.min()).sort_index(),
             pdf.a.rename().groupby(pdf.b.rename()).transform(lambda x: x + x.min()).sort_index(),
         )
+        with self.assertRaisesRegex(TypeError, "str object is not callable"):
+            psdf.groupby("a").transform("sum")
+
+        def udf(col) -> int:
+            return col + 10
+
+        with self.assertRaisesRegex(
+            TypeError,
+            "Expected the return type of this function to be of Series type, "
+            "but found type ScalarType\\[LongType\\]",
+        ):
+            psdf.groupby("a").transform(udf)
 
         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
@@ -2854,6 +2866,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             check_exact=False,
         )
 
+    def test_getitem(self):
+        psdf = ps.DataFrame(
+            {
+                "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3,
+                "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3,
+                "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3,
+            },
+            index=np.random.rand(10 * 3),
+        )
+        self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy))
+
 if __name__ == "__main__":
     from pyspark.pandas.tests.test_groupby import *  # noqa: F401
```
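
For reference, the new `test_getitem` checks that column selection on a DataFrame GroupBy yields a `SeriesGroupBy`. A minimal standalone sketch of the same assertion:

```python
import pyspark.pandas as ps
from pyspark.pandas.groupby import SeriesGroupBy

psdf = ps.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
grouped = psdf.groupby("a")["b"]  # column selection on a GroupBy
print(isinstance(grouped, SeriesGroupBy))  # True
```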