[SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__

### What changes were proposed in this pull request?

Implement `Series.__xor__` and `Series.__rxor__`.

### Why are the changes needed?

To follow pandas.

### Does this PR introduce _any_ user-facing change?

Yes, users can now use:

```python
psdf = ps.DataFrame([[11, 11], [1, 2]])
psdf[0] ^ psdf[1]
```

### How was this patch tested?

Unit tests.

Closes #33911 from dgd-contributor/SPARK-36653_Implement_Series._xor_.

Authored-by: dgd-contributor <dgd_contributor@viettel.com.vn>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
2021-09-13 15:09:22 -07:00 · 2021-09-13 15:09:22 -07:00 · f8657d1924
parent 999473b1a5
commit f8657d1924
8 changed files with 217 additions and 0 deletions
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@ -428,6 +428,12 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
    def __ror__(self, other: Any) -> SeriesOrIndex:
        return self._dtype_op.ror(self, other)

+    def __xor__(self, other: Any) -> SeriesOrIndex:
+        # ``self ^ other``: dispatched to the ``xor`` method of ``self._dtype_op``.
+        return self._dtype_op.xor(self, other)
+
+    def __rxor__(self, other: Any) -> SeriesOrIndex:
+        # Reflected form (``other ^ self``): dispatched to ``rxor`` of ``self._dtype_op``.
+        return self._dtype_op.rxor(self, other)
+
    def __len__(self) -> int:
        return len(self._psdf)

--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@ -195,6 +195,26 @@ def _sanitize_list_like(operand: Any) -> None:
        raise TypeError("The operation can not be applied to %s." % type(operand).__name__)


+def _is_valid_for_logical_operator(right: Any) -> bool:
+    """Return whether ``right`` is an acceptable operand for a logical operator.
+
+    Accepted operands are Python ``int``/``bool`` scalars and ``IndexOpsMixin``
+    instances whose Spark data type is ``BooleanType`` or ``IntegralType``.
+    """
+    from pyspark.pandas.base import IndexOpsMixin
+
+    if isinstance(right, (int, bool)):
+        return True
+    if not isinstance(right, IndexOpsMixin):
+        return False
+    return isinstance(right.spark.data_type, (BooleanType, IntegralType))
+
+
+def _is_boolean_type(right: Any) -> bool:
+    """Return whether ``right`` is a Python ``bool`` or a boolean-typed ``IndexOpsMixin``.
+
+    An ``IndexOpsMixin`` counts as boolean when its Spark data type is ``BooleanType``.
+    """
+    from pyspark.pandas.base import IndexOpsMixin
+
+    if isinstance(right, bool):
+        return True
+    if not isinstance(right, IndexOpsMixin):
+        return False
+    return isinstance(right.spark.data_type, BooleanType)
+
+
 class DataTypeOps(object, metaclass=ABCMeta):
    """The base class for binary operations of pandas-on-Spark objects (of different data types)."""

@ -319,6 +339,9 @@ class DataTypeOps(object, metaclass=ABCMeta):
    def __and__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        raise TypeError("Bitwise and can not be applied to %s." % self.pretty_name)

+    def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        # Base behaviour: XOR is rejected with a TypeError that names ``self.pretty_name``.
+        raise TypeError("Bitwise xor can not be applied to %s." % self.pretty_name)
+
    def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        raise TypeError("Bitwise or can not be applied to %s." % self.pretty_name)

@ -326,6 +349,10 @@ class DataTypeOps(object, metaclass=ABCMeta):
        _sanitize_list_like(right)
        return left.__and__(right)

+    def rxor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        # Reflected XOR is evaluated as ``left ^ right`` once ``right`` has passed
+        # ``_sanitize_list_like``.
+        _sanitize_list_like(right)
+        return left ^ right
+
    def ror(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        _sanitize_list_like(right)
        return left.__or__(right)
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@ -31,6 +31,8 @@ from pyspark.pandas.data_type_ops.base import (
    _as_categorical_type,
    _as_other_type,
    _sanitize_list_like,
+    _is_valid_for_logical_operator,
+    _is_boolean_type,
 )
 from pyspark.pandas.spark import functions as SF
 from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type
@ -248,6 +250,25 @@ class BooleanOps(DataTypeOps):

            return column_op(and_func)(left, right)

+    def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        """Compute ``left ^ right`` for this boolean ops class.
+
+        Both operands are cast to integer, XOR-ed bitwise, and cast back to boolean;
+        a null result is replaced with ``False``.
+
+        Raises
+        ------
+        TypeError
+            If ``right`` is rejected by ``_is_valid_for_logical_operator``.
+        """
+        _sanitize_list_like(right)
+        if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
+            # Swap the operands so the extension-dtype operand's ``^`` handles the operation.
+            return right ^ left
+        if not _is_valid_for_logical_operator(right):
+            raise TypeError("XOR can not be applied to given types.")
+
+        def xor_func(left: Column, right: Any) -> Column:
+            if not isinstance(right, Column):
+                # Scalars become Spark literals; NA-like scalars become a null literal.
+                right = SF.lit(None if pd.isna(right) else right)
+            xored = left.cast("integer").bitwiseXOR(right.cast("integer")).cast("boolean")
+            return F.when(xored.isNull(), False).otherwise(xored)
+
+        return column_op(xor_func)(left, right)
+
    def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        _sanitize_list_like(right)
        if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
@ -353,6 +374,23 @@ class BooleanExtensionOps(BooleanOps):

        return column_op(or_func)(left, right)

+    def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        """Compute ``left ^ right``; only operands accepted by ``_is_boolean_type`` are allowed.
+
+        Both operands are cast to integer, XOR-ed bitwise, and the result is cast back
+        to boolean without any null replacement.
+
+        Raises
+        ------
+        TypeError
+            If ``right`` is not accepted by ``_is_boolean_type``.
+        """
+        _sanitize_list_like(right)
+
+        if not _is_boolean_type(right):
+            raise TypeError("XOR can not be applied to given types.")
+
+        def xor_func(left: Column, right: Any) -> Column:
+            if not isinstance(right, Column):
+                # Scalars become Spark literals; NA-like scalars become a null literal.
+                right = SF.lit(None if pd.isna(right) else right)
+            as_int = left.cast("integer").bitwiseXOR(right.cast("integer"))
+            return as_int.cast("boolean")
+
+        return column_op(xor_func)(left, right)
+
    def restore(self, col: pd.Series) -> pd.Series:
        """Restore column when to_pandas."""
        return col.astype(self.dtype)
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@ -33,6 +33,8 @@ from pyspark.pandas.data_type_ops.base import (
    _as_other_type,
    _as_string_type,
    _sanitize_list_like,
+    _is_valid_for_logical_operator,
+    _is_boolean_type,
 )
 from pyspark.pandas.spark import functions as SF
 from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
@ -181,6 +183,30 @@ class IntegralOps(NumericOps):
    LongType, IntegerType, ByteType and ShortType.
    """

+    def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        """Compute ``left ^ right`` for this integral ops class.
+
+        When ``right`` is boolean (per ``_is_boolean_type``) it is cast to integer before
+        the bitwise XOR and the result is cast to boolean; otherwise the bitwise XOR of
+        the two operands is returned as is.
+
+        Raises
+        ------
+        TypeError
+            If ``right`` is rejected by ``_is_valid_for_logical_operator``.
+        """
+        _sanitize_list_like(right)
+
+        if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
+            # Swap the operands so the extension-dtype operand's ``^`` handles the operation.
+            return right ^ left
+        if not _is_valid_for_logical_operator(right):
+            raise TypeError("XOR can not be applied to given types.")
+
+        # Decided up front, while ``right`` is still the original operand.
+        right_is_boolean = _is_boolean_type(right)
+
+        def xor_func(left: Column, right: Any) -> Column:
+            if not isinstance(right, Column):
+                right = SF.lit(None if pd.isna(right) else right)
+            if right_is_boolean:
+                return left.bitwiseXOR(right.cast("integer")).cast("boolean")
+            return left.bitwiseXOR(right)
+
+        return column_op(xor_func)(left, right)
+
    @property
    def pretty_name(self) -> str:
        return "integrals"
@ -435,6 +461,10 @@ class IntegralExtensionOps(IntegralOps):
        Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype
    """

+    def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
+        # XOR is rejected unconditionally for this ops class, whatever ``right`` is.
+        _sanitize_list_like(right)
+        raise TypeError("XOR can not be applied to given types.")
+
    def restore(self, col: pd.Series) -> pd.Series:
        """Restore column when to_pandas."""
        return col.astype(self.dtype)
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@ -2604,6 +2604,9 @@ class Index(IndexOpsMixin):
    def __xor__(self, other: "Index") -> "Index":
        return self.symmetric_difference(other)

+    def __rxor__(self, other: Any) -> "Index":
+        # Reflected ``^`` is not provided for Index: always signal ``NotImplemented``.
+        return NotImplemented
+
    def __bool__(self) -> bool:
        raise ValueError(
            "The truth value of a {0} is ambiguous. "
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@ -24,6 +24,7 @@ import numpy as np
 from pandas.api.types import CategoricalDtype

 from pyspark import pandas as ps
+from pyspark.pandas import option_context
 from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
 from pyspark.pandas.typedef.typehints import (
    extension_float_dtypes_available,
@ -286,6 +287,32 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
        self.assert_eq(True | pser, True | psser)
        self.assert_eq(False | pser, False | psser)

+    def test_xor(self):
+        # Fetch each frame once so that both columns come from the same frame.
+        bool_pdf, bool_psdf = self.bool_pdf, self.bool_psdf
+        pser, psser = bool_pdf["this"], bool_psdf["this"]
+        other_pser, other_psser = bool_pdf["that"], bool_psdf["that"]
+
+        self.assert_eq(pser ^ other_pser, psser ^ other_psser)
+        for scalar in (True, False, 2, 99):
+            self.assert_eq(pser ^ scalar, psser ^ scalar)
+
+        with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
+            psser ^ "a"
+
+        # Boolean ^ integral, with operands taken from different frames.
+        with option_context("compute.ops_on_diff_frames", True):
+            pser, psser = self.pdf["bool"], self.psdf["bool"]
+            other_pser, other_psser = self.integral_pdf["this"], self.integral_psdf["this"]
+
+            self.assert_eq(pser ^ other_pser, psser ^ other_psser)
+
+    def test_rxor(self):
+        # Reflected XOR: the scalar is the left-hand operand.
+        pser, psser = self.pdf["bool"], self.psdf["bool"]
+        for scalar in (True, False, 1):
+            self.assert_eq(scalar ^ pser, scalar ^ psser)
+
    def test_isnull(self):
        self.assert_eq(self.pdf["bool"].isnull(), self.psdf["bool"].isnull())

@ -686,6 +713,26 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
        self.check_extension(True | pser, True | psser)
        self.check_extension(False | pser, False | psser)

+    def test_xor(self):
+        # Fetch each frame once so that both columns come from the same frame.
+        ext_pdf, ext_psdf = self.boolean_pdf, self.boolean_psdf
+        pser, other_pser = ext_pdf["this"], ext_pdf["that"]
+        psser, other_psser = ext_psdf["this"], ext_psdf["that"]
+
+        for scalar in (True, False):
+            self.check_extension(pser ^ scalar, psser ^ scalar)
+        self.check_extension(pser ^ pser, psser ^ psser)
+
+        # Series ^ Series, in both operand orders.
+        self.check_extension(pser ^ other_pser, psser ^ other_psser)
+        self.check_extension(other_pser ^ pser, other_psser ^ psser)
+        with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
+            psser ^ 2
+
+    def test_rxor(self):
+        pser, psser = self.boolean_pdf["this"], self.boolean_psdf["this"]
+        # Use ``^`` with the scalar on the left so the reflected XOR path is what is
+        # compared against pandas (``|`` would only exercise reflected OR).
+        self.check_extension(True ^ pser, True ^ psser)
+        self.check_extension(False ^ pser, False ^ psser)
+        with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
+            1 ^ psser
+
    def test_from_to_pandas(self):
        data = [True, True, False, None]
        pser = pd.Series(data, dtype="boolean")
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@ -309,6 +309,33 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
            self.assertRaises(TypeError, lambda: True | psser)
            self.assertRaises(TypeError, lambda: False | psser)

+    def test_xor(self):
+        pdf, psdf = self.integral_pdf, self.integral_psdf
+        pser, other_pser = pdf["this"], pdf["that"]
+        psser, other_psser = psdf["this"], psdf["that"]
+
+        self.assert_eq(pser ^ other_pser, psser ^ other_psser)
+        self.assert_eq(pser ^ 2, psser ^ 2)
+        self.assert_eq(pser ^ 3, psser ^ 3)
+        self.assert_eq(pser ^ False, psser ^ False)
+        self.assert_eq(pser ^ True, psser ^ True)
+
+        # One ``assertRaisesRegex`` block per operand: the block exits at the first
+        # exception, so a second statement in the same block would never run.
+        with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
+            psser ^ "a"
+        with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
+            psser ^ None
+
+        # Integral ^ boolean, with operands taken from different frames.
+        with option_context("compute.ops_on_diff_frames", True):
+            pser, other_pser = self.integral_pdf["this"], self.pdf["bool"]
+            psser, other_psser = self.integral_psdf["this"], self.psdf["bool"]
+
+            self.assert_eq(pser ^ other_pser, psser ^ other_psser)
+
+    def test_rxor(self):
+        # Reflected XOR: the scalar is the left-hand operand.
+        pser, psser = self.pdf["int"], self.psdf["int"]
+        for scalar in (True, False, 1):
+            self.assert_eq(scalar ^ pser, scalar ^ psser)
+
    def test_from_to_pandas(self):
        pdf, psdf = self.pdf, self.psdf
        for col in self.numeric_df_cols:
@ -527,6 +554,37 @@ class IntegralExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
            for pser, psser in self.intergral_extension_pser_psser_pairs:
                self.check_extension(pser >= pser, (psser >= psser).sort_index())

+    def test_xor(self):
+        # XOR must be rejected for every right-hand operand tried here.
+        for psser in self.intergral_extension_pssers:
+            for other in (1, psser, False):
+                with self.assertRaisesRegex(
+                    TypeError, "XOR can not be applied to given types."
+                ):
+                    psser ^ other
+
+    def test_rxor(self):
+        # Reflected XOR (scalar on the left) must be rejected as well.
+        for psser in self.intergral_extension_pssers:
+            for scalar in (1, False):
+                with self.assertRaisesRegex(
+                    TypeError, "XOR can not be applied to given types."
+                ):
+                    scalar ^ psser
+

@unittest.skipIf(
    not extension_float_dtypes_available, "pandas extension float dtypes are not available"
--- a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
+++ b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
@ -61,6 +61,14 @@ class TestCasesUtils(object):
    def numeric_df_cols(self):
        return self.numeric_pdf.columns

+    @property
+    def integral_pdf(self):
+        # Small all-integer pandas DataFrame with the columns "this" and "that".
+        return pd.DataFrame({"this": [1, 2, 3], "that": [2, 2, 1]})
+
+    @property
+    def integral_psdf(self):
+        # ``integral_pdf`` converted with ``ps.from_pandas``.
+        return ps.from_pandas(self.integral_pdf)
+
    # TODO(SPARK-36031): Merge self.numeric_w_nan_p(s)df into self.numeric_p(s)df
    @property
    def numeric_w_nan_pdf(self):