[SPARK-36531][SPARK-36515][PYTHON] Improve test coverage for data_type_ops/* and groupby
### What changes were proposed in this pull request?
This PR proposes improving test coverage for pandas-on-Spark data types & GroupBy code base, which is written in `data_type_ops/*.py` and `groupby.py` separately.
This PR did the following to improve coverage:
- Add unittest for untested code
- Fix unittest which is not tested properly
- Remove unused code
**NOTE**: This PR does not only include test-only updates; for example, it also includes fixing `astype` for binary ops.
Given the following pandas-on-Spark Series:
```python
>>> psser
0 [49]
1 [50]
2 [51]
dtype: object
```
before:
```python
>>> psser.astype(bool)
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: cannot resolve 'CAST(`0` AS BOOLEAN)' due to data type mismatch: cannot cast binary to boolean;
...
```
after:
```python
>>> psser.astype(bool)
0 True
1 True
2 True
dtype: bool
```
### Why are the changes needed?
To make the project healthier by improving coverage.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unittest.
Closes #33850 from itholic/SPARK-36531.
Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 71dbd03fbe)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
parent
4a92b0e278
commit
3d50760a3e
|
@ -23,7 +23,6 @@ from pyspark.pandas.base import column_op, IndexOpsMixin
|
|||
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
|
||||
from pyspark.pandas.data_type_ops.base import (
|
||||
DataTypeOps,
|
||||
_as_bool_type,
|
||||
_as_categorical_type,
|
||||
_as_other_type,
|
||||
_as_string_type,
|
||||
|
@ -100,7 +99,9 @@ class BinaryOps(DataTypeOps):
|
|||
if isinstance(dtype, CategoricalDtype):
|
||||
return _as_categorical_type(index_ops, dtype, spark_type)
|
||||
elif isinstance(spark_type, BooleanType):
|
||||
return _as_bool_type(index_ops, dtype)
|
||||
# Cannot cast binary to boolean in Spark.
|
||||
# We should cast binary to str first, and cast it to boolean
|
||||
return index_ops.astype(str).astype(bool)
|
||||
elif isinstance(spark_type, StringType):
|
||||
return _as_string_type(index_ops, dtype)
|
||||
else:
|
||||
|
|
|
@ -30,7 +30,6 @@ from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
|
|||
from pyspark.pandas.base import IndexOpsMixin
|
||||
from pyspark.pandas.data_type_ops.base import (
|
||||
DataTypeOps,
|
||||
_as_bool_type,
|
||||
_as_categorical_type,
|
||||
_as_other_type,
|
||||
_as_string_type,
|
||||
|
@ -132,7 +131,7 @@ class DatetimeOps(DataTypeOps):
|
|||
if isinstance(dtype, CategoricalDtype):
|
||||
return _as_categorical_type(index_ops, dtype, spark_type)
|
||||
elif isinstance(spark_type, BooleanType):
|
||||
return _as_bool_type(index_ops, dtype)
|
||||
raise TypeError("cannot astype a datetimelike from [datetime64[ns]] to [bool]")
|
||||
elif isinstance(spark_type, StringType):
|
||||
return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
|
||||
else:
|
||||
|
|
|
@ -162,7 +162,8 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
def test_astype(self):
|
||||
pser = self.pser
|
||||
psser = self.psser
|
||||
self.assert_eq(pd.Series(["1", "2", "3"]), psser.astype(str))
|
||||
self.assert_eq(psser.astype(str), psser.astype(str))
|
||||
self.assert_eq(pser.astype(bool), psser.astype(bool))
|
||||
self.assert_eq(pser.astype("category"), psser.astype("category"))
|
||||
cat_type = CategoricalDtype(categories=[b"2", b"3", b"1"])
|
||||
self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
|
||||
|
|
|
@ -198,6 +198,20 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
else:
|
||||
self.assert_eq(psser.astype(cat_type), pser)
|
||||
|
||||
# Empty
|
||||
pser = pd.Series([], dtype="category")
|
||||
psser = ps.from_pandas(pser)
|
||||
self.assert_eq(pser.astype(int), psser.astype(int))
|
||||
self.assert_eq(pser.astype(float), psser.astype(float))
|
||||
self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
|
||||
self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
|
||||
self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
|
||||
self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
|
||||
self.assert_eq(pser.astype(str), psser.astype(str))
|
||||
self.assert_eq(pser.astype(bool), psser.astype(bool))
|
||||
self.assert_eq(pser.astype("category"), psser.astype("category"))
|
||||
self.assert_eq(pser.astype("category"), psser.astype("category"))
|
||||
|
||||
def test_neg(self):
|
||||
self.assertRaises(TypeError, lambda: -self.psser)
|
||||
|
||||
|
@ -270,6 +284,13 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
with option_context("compute.ops_on_diff_frames", True):
|
||||
self.assert_eq(pser1 == pser2, (psser1 == psser2).sort_index())
|
||||
|
||||
psser3 = ps.Series(pd.Categorical(list("xyzx")))
|
||||
self.assertRaisesRegex(
|
||||
TypeError,
|
||||
"Categoricals can only be compared if 'categories' are the same.",
|
||||
lambda: psser1 == psser3,
|
||||
)
|
||||
|
||||
def test_ne(self):
|
||||
pdf, psdf = self.pdf, self.psdf
|
||||
|
||||
|
|
|
@ -212,8 +212,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
|
||||
def test_lt(self):
|
||||
pdf, psdf = self.date_pdf, self.date_psdf
|
||||
self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"])
|
||||
self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"])
|
||||
self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"])
|
||||
self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"])
|
||||
|
||||
def test_le(self):
|
||||
pdf, psdf = self.date_pdf, self.date_psdf
|
||||
|
|
|
@ -195,6 +195,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
cat_type = CategoricalDtype(categories=["a", "b", "c"])
|
||||
self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
|
||||
|
||||
self.assertRaises(TypeError, lambda: psser.astype(bool))
|
||||
|
||||
def test_neg(self):
|
||||
self.assertRaises(TypeError, lambda: -self.psser)
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
|
|||
from pyspark.pandas.typedef.typehints import (
|
||||
extension_dtypes_available,
|
||||
extension_float_dtypes_available,
|
||||
extension_object_dtypes_available,
|
||||
)
|
||||
from pyspark.sql.types import DecimalType, IntegralType
|
||||
from pyspark.testing.pandasutils import PandasOnSparkTestCase
|
||||
|
@ -350,6 +351,10 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
self.assert_eq(
|
||||
self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category")
|
||||
)
|
||||
if extension_object_dtypes_available and extension_float_dtypes_available:
|
||||
pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
|
||||
psser = ps.from_pandas(pser)
|
||||
self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
|
||||
|
||||
def test_neg(self):
|
||||
pdf, psdf = self.pdf, self.psdf
|
||||
|
|
|
@ -30,7 +30,7 @@ from pyspark.pandas.missing.groupby import (
|
|||
MissingPandasLikeDataFrameGroupBy,
|
||||
MissingPandasLikeSeriesGroupBy,
|
||||
)
|
||||
from pyspark.pandas.groupby import is_multi_agg_with_relabel
|
||||
from pyspark.pandas.groupby import is_multi_agg_with_relabel, SeriesGroupBy
|
||||
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
|
||||
|
||||
|
||||
|
@ -2135,6 +2135,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
|
|||
psdf.a.rename().groupby(psdf.b.rename()).transform(lambda x: x + x.min()).sort_index(),
|
||||
pdf.a.rename().groupby(pdf.b.rename()).transform(lambda x: x + x.min()).sort_index(),
|
||||
)
|
||||
with self.assertRaisesRegex(TypeError, "str object is not callable"):
|
||||
psdf.groupby("a").transform("sum")
|
||||
|
||||
def udf(col) -> int:
|
||||
return col + 10
|
||||
|
||||
with self.assertRaisesRegex(
|
||||
TypeError,
|
||||
"Expected the return type of this function to be of Series type, "
|
||||
"but found type ScalarType\\[LongType\\]",
|
||||
):
|
||||
psdf.groupby("a").transform(udf)
|
||||
|
||||
# multi-index columns
|
||||
columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
|
||||
|
@ -2854,6 +2866,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
|
|||
check_exact=False,
|
||||
)
|
||||
|
||||
def test_getitem(self):
|
||||
psdf = ps.DataFrame(
|
||||
{
|
||||
"a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3,
|
||||
"b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3,
|
||||
"c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3,
|
||||
},
|
||||
index=np.random.rand(10 * 3),
|
||||
)
|
||||
|
||||
self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pyspark.pandas.tests.test_groupby import * # noqa: F401
|
||||
|
|
Loading…
Reference in a new issue