From 3d50760a3edea9a60d87d1dc458ac84c2ce4928e Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Sep 2021 10:22:52 +0900 Subject: [PATCH] [SPARK-36531][SPARK-36515][PYTHON] Improve test coverage for data_type_ops/* and groupby ### What changes were proposed in this pull request? This PR proposes improving test coverage for pandas-on-Spark data types & GroupBy code base, which is written in `data_type_ops/*.py` and `groupby.py` separately. This PR did the following to improve coverage: - Add unittest for untested code - Fix unittests that were not testing properly - Remove unused code **NOTE**: This PR does not only include test-only updates; for example, it also includes fixing `astype` for binary ops. Given the pandas-on-Spark Series we have: ```python >>> psser 0 [49] 1 [50] 2 [51] dtype: object ``` before: ```python >>> psser.astype(bool) Traceback (most recent call last): ... pyspark.sql.utils.AnalysisException: cannot resolve 'CAST(`0` AS BOOLEAN)' due to data type mismatch: cannot cast binary to boolean; ... ``` after: ```python >>> psser.astype(bool) 0 True 1 True 2 True dtype: bool ``` ### Why are the changes needed? To make the project healthier by improving coverage. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unittest. Closes #33850 from itholic/SPARK-36531. 
Authored-by: itholic Signed-off-by: Hyukjin Kwon (cherry picked from commit 71dbd03fbe76f83b50275e6f47649c34a2cb9825) Signed-off-by: Hyukjin Kwon --- .../pandas/data_type_ops/binary_ops.py | 5 ++-- .../pandas/data_type_ops/datetime_ops.py | 3 +-- .../tests/data_type_ops/test_binary_ops.py | 3 ++- .../data_type_ops/test_categorical_ops.py | 21 +++++++++++++++ .../tests/data_type_ops/test_date_ops.py | 4 +-- .../tests/data_type_ops/test_datetime_ops.py | 2 ++ .../tests/data_type_ops/test_num_ops.py | 5 ++++ python/pyspark/pandas/tests/test_groupby.py | 26 ++++++++++++++++++- 8 files changed, 61 insertions(+), 8 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/binary_ops.py b/python/pyspark/pandas/data_type_ops/binary_ops.py index 8247adecd4..77fd4cce78 100644 --- a/python/pyspark/pandas/data_type_ops/binary_ops.py +++ b/python/pyspark/pandas/data_type_ops/binary_ops.py @@ -23,7 +23,6 @@ from pyspark.pandas.base import column_op, IndexOpsMixin from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex from pyspark.pandas.data_type_ops.base import ( DataTypeOps, - _as_bool_type, _as_categorical_type, _as_other_type, _as_string_type, @@ -100,7 +99,9 @@ class BinaryOps(DataTypeOps): if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) elif isinstance(spark_type, BooleanType): - return _as_bool_type(index_ops, dtype) + # Cannot cast binary to boolean in Spark. 
+ # We should cast binary to str first, and cast it to boolean + return index_ops.astype(str).astype(bool) elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype) else: diff --git a/python/pyspark/pandas/data_type_ops/datetime_ops.py b/python/pyspark/pandas/data_type_ops/datetime_ops.py index 63d817bc88..04b0ceddc3 100644 --- a/python/pyspark/pandas/data_type_ops/datetime_ops.py +++ b/python/pyspark/pandas/data_type_ops/datetime_ops.py @@ -30,7 +30,6 @@ from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex from pyspark.pandas.base import IndexOpsMixin from pyspark.pandas.data_type_ops.base import ( DataTypeOps, - _as_bool_type, _as_categorical_type, _as_other_type, _as_string_type, @@ -132,7 +131,7 @@ class DatetimeOps(DataTypeOps): if isinstance(dtype, CategoricalDtype): return _as_categorical_type(index_ops, dtype, spark_type) elif isinstance(spark_type, BooleanType): - return _as_bool_type(index_ops, dtype) + raise TypeError("cannot astype a datetimelike from [datetime64[ns]] to [bool]") elif isinstance(spark_type, StringType): return _as_string_type(index_ops, dtype, null_str=str(pd.NaT)) else: diff --git a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py index a68459a750..5dc7f80968 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py @@ -162,7 +162,8 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): def test_astype(self): pser = self.pser psser = self.psser - self.assert_eq(pd.Series(["1", "2", "3"]), psser.astype(str)) + self.assert_eq(psser.astype(str), psser.astype(str)) + self.assert_eq(pser.astype(bool), psser.astype(bool)) self.assert_eq(pser.astype("category"), psser.astype("category")) cat_type = CategoricalDtype(categories=[b"2", b"3", b"1"]) self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) diff --git 
a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py index 5e79eb3682..0aa2e108d7 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py @@ -198,6 +198,20 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils): else: self.assert_eq(psser.astype(cat_type), pser) + # Empty + pser = pd.Series([], dtype="category") + psser = ps.from_pandas(pser) + self.assert_eq(pser.astype(int), psser.astype(int)) + self.assert_eq(pser.astype(float), psser.astype(float)) + self.assert_eq(pser.astype(np.float32), psser.astype(np.float32)) + self.assert_eq(pser.astype(np.int32), psser.astype(np.int32)) + self.assert_eq(pser.astype(np.int16), psser.astype(np.int16)) + self.assert_eq(pser.astype(np.int8), psser.astype(np.int8)) + self.assert_eq(pser.astype(str), psser.astype(str)) + self.assert_eq(pser.astype(bool), psser.astype(bool)) + self.assert_eq(pser.astype("category"), psser.astype("category")) + self.assert_eq(pser.astype("category"), psser.astype("category")) + def test_neg(self): self.assertRaises(TypeError, lambda: -self.psser) @@ -270,6 +284,13 @@ class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils): with option_context("compute.ops_on_diff_frames", True): self.assert_eq(pser1 == pser2, (psser1 == psser2).sort_index()) + psser3 = ps.Series(pd.Categorical(list("xyzx"))) + self.assertRaisesRegex( + TypeError, + "Categoricals can only be compared if 'categories' are the same.", + lambda: psser1 == psser3, + ) + def test_ne(self): pdf, psdf = self.pdf, self.psdf diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py index 0f1d76855e..8c196d2a71 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py @@ -212,8 +212,8 @@ class 
DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): def test_lt(self): pdf, psdf = self.date_pdf, self.date_psdf - self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"]) - self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"]) + self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"]) + self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"]) def test_le(self): pdf, psdf = self.date_pdf, self.date_psdf diff --git a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py index d3e59b3ae0..59e013de1a 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py @@ -195,6 +195,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): cat_type = CategoricalDtype(categories=["a", "b", "c"]) self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) + self.assertRaises(TypeError, lambda: psser.astype(bool)) + def test_neg(self): self.assertRaises(TypeError, lambda: -self.psser) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py index 29a21b975b..cd36b1a518 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py @@ -29,6 +29,7 @@ from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.typedef.typehints import ( extension_dtypes_available, extension_float_dtypes_available, + extension_object_dtypes_available, ) from pyspark.sql.types import DecimalType, IntegralType from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -350,6 +351,10 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assert_eq( self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category") ) + if extension_object_dtypes_available and 
extension_float_dtypes_available: + pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype())) def test_neg(self): pdf, psdf = self.pdf, self.psdf diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 1bc182d846..0640001d9d 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -30,7 +30,7 @@ from pyspark.pandas.missing.groupby import ( MissingPandasLikeDataFrameGroupBy, MissingPandasLikeSeriesGroupBy, ) -from pyspark.pandas.groupby import is_multi_agg_with_relabel +from pyspark.pandas.groupby import is_multi_agg_with_relabel, SeriesGroupBy from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils @@ -2135,6 +2135,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): psdf.a.rename().groupby(psdf.b.rename()).transform(lambda x: x + x.min()).sort_index(), pdf.a.rename().groupby(pdf.b.rename()).transform(lambda x: x + x.min()).sort_index(), ) + with self.assertRaisesRegex(TypeError, "str object is not callable"): + psdf.groupby("a").transform("sum") + + def udf(col) -> int: + return col + 10 + + with self.assertRaisesRegex( + TypeError, + "Expected the return type of this function to be of Series type, " + "but found type ScalarType\\[LongType\\]", + ): + psdf.groupby("a").transform(udf) # multi-index columns columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) @@ -2854,6 +2866,18 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): check_exact=False, ) + def test_getitem(self): + psdf = ps.DataFrame( + { + "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3, + "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3, + }, + index=np.random.rand(10 * 3), + ) + + self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy)) + if __name__ == "__main__": from 
pyspark.pandas.tests.test_groupby import * # noqa: F401