From 3d50760a3edea9a60d87d1dc458ac84c2ce4928e Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Sep 2021 10:22:52 +0900 Subject: [PATCH] [SPARK-36531][SPARK-36515][PYTHON] Improve test coverage for data_type_ops/* and groupby ### What changes were proposed in this pull request? This PR proposes improving test coverage for pandas-on-Spark data types & GroupBy code base, which is written in `data_type_ops/*.py` and `groupby.py` separately. This PR did the following to improve coverage: - Add unittest for untested code - Fix unittests that were not testing properly - Remove unused code **NOTE**: This PR does not only include test-only updates; for example, it also includes fixing `astype` for binary ops. Given the pandas-on-Spark Series we have: ```python >>> psser 0 [49] 1 [50] 2 [51] dtype: object ``` before: ```python >>> psser.astype(bool) Traceback (most recent call last): ... pyspark.sql.utils.AnalysisException: cannot resolve 'CAST(`0` AS BOOLEAN)' due to data type mismatch: cannot cast binary to boolean; ... ``` after: ```python >>> psser.astype(bool) 0 True 1 True 2 True dtype: bool ``` ### Why are the changes needed? To make the project healthier by improving coverage. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unittest. Closes #33850 from itholic/SPARK-36531. 
Authored-by: itholic Signed-off-by: Hyukjin Kwon (cherry picked from commit 71dbd03fbe76f83b50275e6f47649c34a2cb9825) Signed-off-by: Hyukjin Kwon --- .../pandas/data_type_ops/binary_ops.py | 5 ++-- .../pandas/data_type_ops/datetime_ops.py | 3 +-- .../tests/data_type_ops/test_binary_ops.py | 3 ++- .../data_type_ops/test_categorical_ops.py | 21 +++++++++++++++ .../tests/data_type_ops/test_date_ops.py | 4 +-- .../tests/data_type_ops/test_datetime_ops.py | 2 ++ .../tests/data_type_ops/test_num_ops.py | 5 ++++ python/pyspark/pandas/tests/test_groupby.py | 26 ++++++++++++++++++- 8 files changed, 61 insertions(+), 8 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/binary_ops.py b/python/pyspark/pandas/data_type_ops/binary_ops.py index 8247adecd4..77fd4cce78 100644 --- a/python/pyspark/pandas/data_type_ops/binary_ops.py +++ b/python/pyspark/pandas/data_type_ops/binary_ops.py @@ -23,7 +23,6 @@ from pyspark.pandas.base import column_op, IndexOpsMixin from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex from pyspark.pandas.data_type_ops.base import ( DataTypeOps, - _as_bool_type, _as_categorical_type, _as_other_type, _as_string_type, @@ -100,7 +99,9 @@ class BinaryOps(DataTypeOps): if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) elif isinstance(spark_type, BooleanType): - return _as_bool_type(index_ops, dtype) + # Cannot cast binary to boolean in Spark. 
+ # We should cast binary to str first, and cast it to boolean + return index_ops.astype(str).astype(bool) elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype) else: diff --git a/python/pyspark/pandas/data_type_ops/datetime_ops.py b/python/pyspark/pandas/data_type_ops/datetime_ops.py index 63d817bc88..04b0ceddc3 100644 --- a/python/pyspark/pandas/data_type_ops/datetime_ops.py +++ b/python/pyspark/pandas/data_type_ops/datetime_ops.py @@ -30,7 +30,6 @@ from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex from pyspark.pandas.base import IndexOpsMixin from pyspark.pandas.data_type_ops.base import ( DataTypeOps, - _as_bool_type, _as_categorical_type, _as_other_type, _as_string_type, @@ -132,7 +131,7 @@ class DatetimeOps(DataTypeOps): if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) elif isinstance(spark_type, BooleanType): - return _as_bool_type(index_ops, dtype) + raise TypeError("cannot astype a datetimelike from [datetime64[ns]] to [bool]") elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype, null_str=str(pd.NaT)) else: diff --git a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py index a68459a750..5dc7f80968 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py @@ -162,7 +162,8 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): def test_astype(self): pser = self.pser psser = self.psser - self.assert_eq(pd.Series(["1", "2", "3"]), psser.astype(str)) + self.assert_eq(psser.astype(str), psser.astype(str)) + self.assert_eq(pser.astype(bool), psser.astype(bool)) self.assert_eq(pser.astype("category"), psser.astype("category")) cat_type = CategoricalDtype(categories=[b"2", b"3", b"1"]) self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) diff --git 
a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py index 5e79eb3682..0aa2e108d7 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py @@ -198,6 +198,20 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils): else: self.assert_eq(psser.astype(cat_type), pser) + # Empty + pser = pd.Series([], dtype="category") + psser = ps.from_pandas(pser) + self.assert_eq(pser.astype(int), psser.astype(int)) + self.assert_eq(pser.astype(float), psser.astype(float)) + self.assert_eq(pser.astype(np.float32), psser.astype(np.float32)) + self.assert_eq(pser.astype(np.int32), psser.astype(np.int32)) + self.assert_eq(pser.astype(np.int16), psser.astype(np.int16)) + self.assert_eq(pser.astype(np.int8), psser.astype(np.int8)) + self.assert_eq(pser.astype(str), psser.astype(str)) + self.assert_eq(pser.astype(bool), psser.astype(bool)) + self.assert_eq(pser.astype("category"), psser.astype("category")) + self.assert_eq(pser.astype("category"), psser.astype("category")) + def test_neg(self): self.assertRaises(TypeError, lambda: -self.psser) @@ -270,6 +284,13 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils): with option_context("compute.ops_on_diff_frames", True): self.assert_eq(pser1 == pser2, (psser1 == psser2).sort_index()) + psser3 = ps.Series(pd.Categorical(list("xyzx"))) + self.assertRaisesRegex( + TypeError, + "Categoricals can only be compared if 'categories' are the same.", + lambda: psser1 == psser3, + ) + def test_ne(self): pdf, psdf = self.pdf, self.psdf diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py index 0f1d76855e..8c196d2a71 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py @@ -212,8 +212,8 @@ class 
DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): def test_lt(self): pdf, psdf = self.date_pdf, self.date_psdf - self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"]) - self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"]) + self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"]) + self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"]) def test_le(self): pdf, psdf = self.date_pdf, self.date_psdf diff --git a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py index d3e59b3ae0..59e013de1a 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py @@ -195,6 +195,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): cat_type = CategoricalDtype(categories=["a", "b", "c"]) self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) + self.assertRaises(TypeError, lambda: psser.astype(bool)) + def test_neg(self): self.assertRaises(TypeError, lambda: -self.psser) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py index 29a21b975b..cd36b1a518 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py @@ -29,6 +29,7 @@ from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.typedef.typehints import ( extension_dtypes_available, extension_float_dtypes_available, + extension_object_dtypes_available, ) from pyspark.sql.types import DecimalType, IntegralType from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -350,6 +351,10 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assert_eq( self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category") ) + if extension_object_dtypes_available and 
extension_float_dtypes_available: + pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype())) def test_neg(self): pdf, psdf = self.pdf, self.psdf diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 1bc182d846..0640001d9d 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -30,7 +30,7 @@ from pyspark.pandas.missing.groupby import ( MissingPandasLikeDataFrameGroupBy, MissingPandasLikeSeriesGroupBy, ) -from pyspark.pandas.groupby import is_multi_agg_with_relabel +from pyspark.pandas.groupby import is_multi_agg_with_relabel, SeriesGroupBy from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils @@ -2135,6 +2135,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): psdf.a.rename().groupby(psdf.b.rename()).transform(lambda x: x + x.min()).sort_index(), pdf.a.rename().groupby(pdf.b.rename()).transform(lambda x: x + x.min()).sort_index(), ) + with self.assertRaisesRegex(TypeError, "str object is not callable"): + psdf.groupby("a").transform("sum") + + def udf(col) -> int: + return col + 10 + + with self.assertRaisesRegex( + TypeError, + "Expected the return type of this function to be of Series type, " + "but found type ScalarType\\[LongType\\]", + ): + psdf.groupby("a").transform(udf) # multi-index columns columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) @@ -2854,6 +2866,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): check_exact=False, ) + def test_getitem(self): + psdf = ps.DataFrame( + { + "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3, + "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3, + }, + index=np.random.rand(10 * 3), + ) + + self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy)) + if __name__ == "__main__": from 
pyspark.pandas.tests.test_groupby import * # noqa: F401