[SPARK-36531][SPARK-36515][PYTHON] Improve test coverage for data_type_ops/* and groupby

### What changes were proposed in this pull request?

This PR proposes improving test coverage for the pandas-on-Spark data type ops and GroupBy code base, which lives in `data_type_ops/*.py` and `groupby.py`, respectively.

This PR does the following to improve coverage:
- Add unittests for untested code
- Fix unittests that did not actually test the intended behavior
- Remove unused code

**NOTE**: This PR is not a test-only change; for example, it also fixes `astype` for binary ops.

Given the following pandas-on-Spark Series:
```python
>>> psser
0    [49]
1    [50]
2    [51]
dtype: object
```
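
Byte 49 is ASCII `1`, so a Series like this can be built from binary values (an assumed minimal construction, shown for illustration only):

```python
>>> import pyspark.pandas as ps
>>> psser = ps.Series([b"1", b"2", b"3"])  # a BinaryType column under the hood
```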

before:
```python
>>> psser.astype(bool)
Traceback (most recent call last):
...
pyspark.sql.utils.AnalysisException: cannot resolve 'CAST(`0` AS BOOLEAN)' due to data type mismatch: cannot cast binary to boolean;
...
```

after:
```python
>>> psser.astype(bool)
0    True
1    True
2    True
dtype: bool
```

### Why are the changes needed?

To make the project healthier by improving test coverage.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Unittest.

Closes #33850 from itholic/SPARK-36531.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 71dbd03fbe)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>

`python/pyspark/pandas/data_type_ops/binary_ops.py`:

```diff
@@ -23,7 +23,6 @@ from pyspark.pandas.base import column_op, IndexOpsMixin
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
-    _as_bool_type,
     _as_categorical_type,
     _as_other_type,
     _as_string_type,
@@ -100,7 +99,9 @@ class BinaryOps(DataTypeOps):
         if isinstance(dtype, CategoricalDtype):
             return _as_categorical_type(index_ops, dtype, spark_type)
         elif isinstance(spark_type, BooleanType):
-            return _as_bool_type(index_ops, dtype)
+            # Cannot cast binary to boolean in Spark.
+            # We should cast binary to str first, and cast it to boolean.
+            return index_ops.astype(str).astype(bool)
         elif isinstance(spark_type, StringType):
             return _as_string_type(index_ops, dtype)
         else:
```

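The same two-step idea can be sketched in plain pandas (an analogy for illustration, not the internal code path): non-empty strings are truthy, so routing binary values through `str` produces the bool result that a direct binary-to-boolean cast cannot.

```python
import pandas as pd

pser = pd.Series([b"1", b"2", b"3"])  # object dtype holding bytes
print(pser.astype(str).astype(bool))  # non-empty strings are truthy
# 0    True
# 1    True
# 2    True
# dtype: bool
```
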
`python/pyspark/pandas/data_type_ops/datetime_ops.py`:

```diff
@@ -30,7 +30,6 @@ from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.base import IndexOpsMixin
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
-    _as_bool_type,
     _as_categorical_type,
     _as_other_type,
     _as_string_type,
@@ -132,7 +131,7 @@ class DatetimeOps(DataTypeOps):
         if isinstance(dtype, CategoricalDtype):
             return _as_categorical_type(index_ops, dtype, spark_type)
         elif isinstance(spark_type, BooleanType):
-            return _as_bool_type(index_ops, dtype)
+            raise TypeError("cannot astype a datetimelike from [datetime64[ns]] to [bool]")
         elif isinstance(spark_type, StringType):
             return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
         else:
```

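Raising here matches pandas' own behavior, and the message is taken from pandas. A quick standalone check (behavior as of pandas 1.x):

```python
import pandas as pd

pser = pd.Series(pd.to_datetime(["2021-01-01", "2021-01-02"]))
try:
    pser.astype(bool)
except TypeError as e:
    print(e)  # cannot astype a datetimelike from [datetime64[ns]] to [bool]
```
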
`python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py`:

```diff
@@ -162,7 +162,8 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils):
     def test_astype(self):
         pser = self.pser
         psser = self.psser
-        self.assert_eq(pd.Series(["1", "2", "3"]), psser.astype(str))
+        self.assert_eq(psser.astype(str), psser.astype(str))
+        self.assert_eq(pser.astype(bool), psser.astype(bool))
         self.assert_eq(pser.astype("category"), psser.astype("category"))
         cat_type = CategoricalDtype(categories=[b"2", b"3", b"1"])
         self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
```

`python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py`:

```diff
@@ -198,6 +198,20 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         else:
             self.assert_eq(psser.astype(cat_type), pser)
 
+        # Empty
+        pser = pd.Series([], dtype="category")
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.astype(int), psser.astype(int))
+        self.assert_eq(pser.astype(float), psser.astype(float))
+        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
+        self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
+        self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
+        self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
+        self.assert_eq(pser.astype(str), psser.astype(str))
+        self.assert_eq(pser.astype(bool), psser.astype(bool))
+        self.assert_eq(pser.astype("category"), psser.astype("category"))
+        self.assert_eq(pser.astype("category"), psser.astype("category"))
+
     def test_neg(self):
         self.assertRaises(TypeError, lambda: -self.psser)
@@ -270,6 +284,13 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         with option_context("compute.ops_on_diff_frames", True):
             self.assert_eq(pser1 == pser2, (psser1 == psser2).sort_index())
 
+            psser3 = ps.Series(pd.Categorical(list("xyzx")))
+            self.assertRaisesRegex(
+                TypeError,
+                "Categoricals can only be compared if 'categories' are the same.",
+                lambda: psser1 == psser3,
+            )
+
     def test_ne(self):
         pdf, psdf = self.pdf, self.psdf
```

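The new assertion mirrors the error pandas itself raises when comparing categoricals whose categories differ (a standalone sketch):

```python
import pandas as pd

s1 = pd.Series(pd.Categorical(list("abca")))
s2 = pd.Series(pd.Categorical(list("xyzx")))
try:
    s1 == s2  # different categories
except TypeError as e:
    print(e)  # Categoricals can only be compared if 'categories' are the same.
```
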
`python/pyspark/pandas/tests/data_type_ops/test_date_ops.py`:

```diff
@@ -212,8 +212,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
     def test_lt(self):
         pdf, psdf = self.date_pdf, self.date_psdf
-        self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"])
-        self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"])
+        self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"])
+        self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"])
 
     def test_le(self):
         pdf, psdf = self.date_pdf, self.date_psdf
```

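The corrected assertions now exercise `<` rather than `==`. Element-wise less-than on date columns behaves the same way in plain pandas (a standalone sketch; the suite's `date_pdf` fixture is analogous):

```python
import datetime
import pandas as pd

# Hypothetical two-column date frame, mirroring the "this"/"that" fixture.
pdf = pd.DataFrame({
    "this": [datetime.date(2021, 1, 1), datetime.date(2021, 1, 3)],
    "that": [datetime.date(2021, 1, 2), datetime.date(2021, 1, 2)],
})
print(pdf["this"] < pdf["that"])  # element-wise less-than on dates
# 0     True
# 1    False
# dtype: bool
```
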
`python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py`:

```diff
@@ -195,6 +195,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         cat_type = CategoricalDtype(categories=["a", "b", "c"])
         self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
 
+        self.assertRaises(TypeError, lambda: psser.astype(bool))
+
     def test_neg(self):
         self.assertRaises(TypeError, lambda: -self.psser)
```

`python/pyspark/pandas/tests/data_type_ops/test_num_ops.py`:

```diff
@@ -29,6 +29,7 @@ from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
 from pyspark.pandas.typedef.typehints import (
     extension_dtypes_available,
     extension_float_dtypes_available,
+    extension_object_dtypes_available,
 )
 from pyspark.sql.types import DecimalType, IntegralType
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
@@ -350,6 +351,10 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         self.assert_eq(
             self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category")
         )
+        if extension_object_dtypes_available and extension_float_dtypes_available:
+            pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
+            psser = ps.from_pandas(pser)
+            self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
 
     def test_neg(self):
         pdf, psdf = self.pdf, self.psdf
```

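The guarded branch only runs when pandas ships the nullable extension dtypes. A standalone sketch of the conversion being exercised (assumes pandas >= 1.2, which provides `Float64Dtype`; values are kept to 0/1 so the cast to nullable boolean is unambiguous):

```python
import pandas as pd

pser = pd.Series([0.0, 1.0, 1.0], dtype=pd.Float64Dtype())  # nullable float
print(pser.astype(pd.BooleanDtype()))  # nullable boolean result
# 0    False
# 1     True
# 2     True
# dtype: boolean
```
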
`python/pyspark/pandas/tests/test_groupby.py`:

```diff
@@ -30,7 +30,7 @@ from pyspark.pandas.missing.groupby import (
     MissingPandasLikeDataFrameGroupBy,
     MissingPandasLikeSeriesGroupBy,
 )
-from pyspark.pandas.groupby import is_multi_agg_with_relabel
+from pyspark.pandas.groupby import is_multi_agg_with_relabel, SeriesGroupBy
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
@@ -2135,6 +2135,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             psdf.a.rename().groupby(psdf.b.rename()).transform(lambda x: x + x.min()).sort_index(),
             pdf.a.rename().groupby(pdf.b.rename()).transform(lambda x: x + x.min()).sort_index(),
         )
+        with self.assertRaisesRegex(TypeError, "str object is not callable"):
+            psdf.groupby("a").transform("sum")
+
+        def udf(col) -> int:
+            return col + 10
+
+        with self.assertRaisesRegex(
+            TypeError,
+            "Expected the return type of this function to be of Series type, "
+            "but found type ScalarType\\[LongType\\]",
+        ):
+            psdf.groupby("a").transform(udf)
 
         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
@@ -2854,6 +2866,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
             check_exact=False,
         )
 
+    def test_getitem(self):
+        psdf = ps.DataFrame(
+            {
+                "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3,
+                "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3,
+                "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3,
+            },
+            index=np.random.rand(10 * 3),
+        )
+        self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy))
+
 if __name__ == "__main__":
     from pyspark.pandas.tests.test_groupby import *  # noqa: F401
```
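
For reference, the new `test_getitem` checks that column selection on a DataFrame GroupBy yields a `SeriesGroupBy`. A minimal standalone sketch of the same assertion:

```python
import pyspark.pandas as ps
from pyspark.pandas.groupby import SeriesGroupBy

psdf = ps.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
grouped = psdf.groupby("a")["b"]  # column selection on a GroupBy
print(isinstance(grouped, SeriesGroupBy))  # True
```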