[SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__

### What changes were proposed in this pull request?
Implement `Series.__xor__` and `Series.__rxor__`, i.e. the `^` operator and its reflected form.

### Why are the changes needed?
To match pandas, which supports the `^` operator on Series.

### Does this PR introduce _any_ user-facing change?
Yes, users can now apply the `^` operator to Series, for example:
``` python
psdf = ps.DataFrame([[11, 11], [1, 2]])
psdf[0] ^ psdf[1]
```

### How was this patch tested?
Unit tests were added.

Closes #33911 from dgd-contributor/SPARK-36653_Implement_Series._xor_.

Authored-by: dgd-contributor <dgd_contributor@viettel.com.vn>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
This commit is contained in:
dgd-contributor 2021-09-13 15:09:22 -07:00 committed by Takuya UESHIN
parent 999473b1a5
commit f8657d1924
8 changed files with 217 additions and 0 deletions

View file

@ -428,6 +428,12 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
def __ror__(self, other: Any) -> SeriesOrIndex:
    """Handle the reflected ``|`` operator by delegating to ``self._dtype_op.ror``."""
    dtype_ops = self._dtype_op
    return dtype_ops.ror(self, other)
def __xor__(self, other: Any) -> SeriesOrIndex:
    """Handle ``self ^ other`` by delegating to ``self._dtype_op.xor``."""
    dtype_ops = self._dtype_op
    return dtype_ops.xor(self, other)
def __rxor__(self, other: Any) -> SeriesOrIndex:
    """Handle the reflected ``other ^ self`` by delegating to ``self._dtype_op.rxor``."""
    dtype_ops = self._dtype_op
    return dtype_ops.rxor(self, other)
def __len__(self) -> int:
    """Return the length of the underlying ``self._psdf`` object."""
    size = len(self._psdf)
    return size

View file

@ -195,6 +195,26 @@ def _sanitize_list_like(operand: Any) -> None:
raise TypeError("The operation can not be applied to %s." % type(operand).__name__)
def _is_valid_for_logical_operator(right: Any) -> bool:
    """Tell whether ``right`` is an operand accepted by the logical operators.

    Accepted operands are Python ``int``/``bool`` scalars and ``IndexOpsMixin``
    objects whose Spark data type is ``BooleanType`` or ``IntegralType``.
    """
    # Function-scope import; presumably avoids a circular import -- TODO confirm.
    from pyspark.pandas.base import IndexOpsMixin

    if isinstance(right, (int, bool)):
        return True
    if not isinstance(right, IndexOpsMixin):
        return False
    return isinstance(right.spark.data_type, (BooleanType, IntegralType))
def _is_boolean_type(right: Any) -> bool:
    """Tell whether ``right`` is a Python ``bool`` or a boolean-typed ``IndexOpsMixin``.

    An ``IndexOpsMixin`` qualifies when its Spark data type is ``BooleanType``.
    """
    # Function-scope import; presumably avoids a circular import -- TODO confirm.
    from pyspark.pandas.base import IndexOpsMixin

    if isinstance(right, bool):
        return True
    return isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BooleanType)
class DataTypeOps(object, metaclass=ABCMeta):
"""The base class for binary operations of pandas-on-Spark objects (of different data types)."""
@ -319,6 +339,9 @@ class DataTypeOps(object, metaclass=ABCMeta):
def __and__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Reject bitwise and: this implementation always raises ``TypeError``.

    The error message names the type via ``self.pretty_name``.
    """
    message = "Bitwise and can not be applied to %s." % self.pretty_name
    raise TypeError(message)
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Reject bitwise xor: this implementation always raises ``TypeError``.

    The error message names the type via ``self.pretty_name``.
    """
    message = "Bitwise xor can not be applied to %s." % self.pretty_name
    raise TypeError(message)
def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Reject bitwise or: this implementation always raises ``TypeError``.

    The error message names the type via ``self.pretty_name``.
    """
    message = "Bitwise or can not be applied to %s." % self.pretty_name
    raise TypeError(message)
@ -326,6 +349,10 @@ class DataTypeOps(object, metaclass=ABCMeta):
_sanitize_list_like(right)
return left.__and__(right)
def rxor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Evaluate the reflected xor by applying ``^`` with ``left`` on the left-hand side.

    ``right`` is first passed through ``_sanitize_list_like``.
    """
    _sanitize_list_like(right)
    result = left ^ right
    return result
def ror(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Evaluate the reflected or by calling ``left.__or__(right)``.

    ``right`` is first passed through ``_sanitize_list_like``.
    """
    _sanitize_list_like(right)
    result = left.__or__(right)
    return result

View file

@ -31,6 +31,8 @@ from pyspark.pandas.data_type_ops.base import (
_as_categorical_type,
_as_other_type,
_sanitize_list_like,
_is_valid_for_logical_operator,
_is_boolean_type,
)
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type
@ -248,6 +250,25 @@ class BooleanOps(DataTypeOps):
return column_op(and_func)(left, right)
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR ``left`` with ``right`` and return a boolean result.

    Both sides are cast to integer, XOR-ed bitwise, and cast back to boolean;
    a null result is replaced with ``False``.

    Raises:
        TypeError: if ``right`` is not accepted by
            ``_is_valid_for_logical_operator`` (and is not an extension-dtype
            ``IndexOpsMixin``, which is handled by swapping the operands).
    """
    _sanitize_list_like(right)
    if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
        # Swap the operands so that `right`'s own `__xor__` handles the operation.
        return right ^ left
    if not _is_valid_for_logical_operator(right):
        raise TypeError("XOR can not be applied to given types.")

    def xor_func(left: Column, right: Any) -> Column:
        # Wrap a plain Python operand as a literal column (null literal for NA).
        if not isinstance(right, Column):
            right = SF.lit(None) if pd.isna(right) else SF.lit(right)
        xored = left.cast("integer").bitwiseXOR(right.cast("integer"))
        scol = xored.cast("boolean")
        # Null results are reported as False.
        return F.when(scol.isNull(), False).otherwise(scol)

    return column_op(xor_func)(left, right)
def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
@ -353,6 +374,23 @@ class BooleanExtensionOps(BooleanOps):
return column_op(or_func)(left, right)
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR ``left`` with a boolean operand ``right``.

    Both sides are cast to integer, XOR-ed bitwise, and cast back to boolean.
    Unlike the version that wraps the result in ``F.when``, nulls are left as-is.

    Raises:
        TypeError: if ``right`` does not satisfy ``_is_boolean_type``.
    """
    _sanitize_list_like(right)
    if not _is_boolean_type(right):
        raise TypeError("XOR can not be applied to given types.")

    def xor_func(left: Column, right: Any) -> Column:
        # Wrap a plain Python operand as a literal column (null literal for NA).
        if not isinstance(right, Column):
            right = SF.lit(None) if pd.isna(right) else SF.lit(right)
        xored = left.cast("integer").bitwiseXOR(right.cast("integer"))
        return xored.cast("boolean")

    return column_op(xor_func)(left, right)
def restore(self, col: pd.Series) -> pd.Series:
    """Cast ``col`` to ``self.dtype``; used to restore the column on ``to_pandas``."""
    restored = col.astype(self.dtype)
    return restored

View file

@ -33,6 +33,8 @@ from pyspark.pandas.data_type_ops.base import (
_as_other_type,
_as_string_type,
_sanitize_list_like,
_is_valid_for_logical_operator,
_is_boolean_type,
)
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
@ -181,6 +183,30 @@ class IntegralOps(NumericOps):
LongType, IntegerType, ByteType and ShortType.
"""
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR an integral ``left`` with ``right``.

    When ``right`` is boolean (per ``_is_boolean_type``) it is cast to integer
    and the XOR result is cast to boolean; otherwise the plain bitwise XOR of
    the two columns is returned.

    Raises:
        TypeError: if ``right`` is not accepted by
            ``_is_valid_for_logical_operator`` (and is not an extension-dtype
            ``IndexOpsMixin``, which is handled by swapping the operands).
    """
    _sanitize_list_like(right)
    if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
        # Swap the operands so that `right`'s own `__xor__` handles the operation.
        return right ^ left
    if not _is_valid_for_logical_operator(right):
        raise TypeError("XOR can not be applied to given types.")

    # Decided once here; the nested function below reads it as a closure variable.
    right_is_boolean = _is_boolean_type(right)

    def xor_func(left: Column, right: Any) -> Column:
        # Wrap a plain Python operand as a literal column (null literal for NA).
        if not isinstance(right, Column):
            right = SF.lit(None) if pd.isna(right) else SF.lit(right)
        if right_is_boolean:
            return left.bitwiseXOR(right.cast("integer")).cast("boolean")
        return left.bitwiseXOR(right)

    return column_op(xor_func)(left, right)
@property
def pretty_name(self) -> str:
    """Return the display name for the types handled by these ops: ``"integrals"``."""
    name = "integrals"
    return name
@ -435,6 +461,10 @@ class IntegralExtensionOps(IntegralOps):
Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype
"""
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Reject XOR for these ops: always raises ``TypeError``.

    ``right`` is passed through ``_sanitize_list_like`` first, so any error
    that helper raises takes precedence.
    """
    _sanitize_list_like(right)
    message = "XOR can not be applied to given types."
    raise TypeError(message)
def restore(self, col: pd.Series) -> pd.Series:
    """Cast ``col`` to ``self.dtype``; used to restore the column on ``to_pandas``."""
    restored = col.astype(self.dtype)
    return restored

View file

@ -2604,6 +2604,9 @@ class Index(IndexOpsMixin):
def __xor__(self, other: "Index") -> "Index":
    """Handle ``self ^ other`` by returning ``self.symmetric_difference(other)``."""
    difference = self.symmetric_difference(other)
    return difference
def __rxor__(self, other: Any) -> "Index":
    """Decline the reflected ``^`` operator.

    Returning ``NotImplemented`` tells Python that this operand does not
    handle ``other ^ self``.
    """
    unsupported = NotImplemented
    return unsupported
def __bool__(self) -> bool:
raise ValueError(
"The truth value of a {0} is ambiguous. "

View file

@ -24,6 +24,7 @@ import numpy as np
from pandas.api.types import CategoricalDtype
from pyspark import pandas as ps
from pyspark.pandas import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.pandas.typedef.typehints import (
extension_float_dtypes_available,
@ -286,6 +287,32 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assert_eq(True | pser, True | psser)
self.assert_eq(False | pser, False | psser)
def test_xor(self):
    """Compare ``^`` on boolean Series with pandas, and check a rejected operand."""
    # Fetch each frame once so both columns come from the same DataFrame object.
    pdf, psdf = self.bool_pdf, self.bool_psdf
    pser, other_pser = pdf["this"], pdf["that"]
    psser, other_psser = psdf["this"], psdf["that"]

    self.assert_eq(pser ^ other_pser, psser ^ other_psser)
    for scalar in (True, False, 2, 99):
        self.assert_eq(pser ^ scalar, psser ^ scalar)

    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ "a"

    # Boolean Series against an integral Series taken from a different frame.
    with option_context("compute.ops_on_diff_frames", True):
        pser, other_pser = self.pdf["bool"], self.integral_pdf["this"]
        psser, other_psser = self.psdf["bool"], self.integral_psdf["this"]
        self.assert_eq(pser ^ other_pser, psser ^ other_psser)
def test_rxor(self):
    """Compare reflected ``^`` (scalar on the left) on a boolean Series with pandas."""
    pser, psser = self.pdf["bool"], self.psdf["bool"]
    for scalar in (True, False, 1):
        self.assert_eq(scalar ^ pser, scalar ^ psser)
def test_isnull(self):
self.assert_eq(self.pdf["bool"].isnull(), self.psdf["bool"].isnull())
@ -686,6 +713,26 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.check_extension(True | pser, True | psser)
self.check_extension(False | pser, False | psser)
def test_xor(self):
    """Compare ``^`` on boolean extension Series with pandas, and check a rejected operand."""
    # Fetch each frame once so both columns come from the same DataFrame object.
    pdf, psdf = self.boolean_pdf, self.boolean_psdf
    pser, psser = pdf["this"], psdf["this"]
    other_pser, other_psser = pdf["that"], psdf["that"]

    for scalar in (True, False):
        self.check_extension(pser ^ scalar, psser ^ scalar)

    self.check_extension(pser ^ pser, psser ^ psser)
    self.check_extension(pser ^ other_pser, psser ^ other_psser)
    self.check_extension(other_pser ^ pser, other_psser ^ psser)

    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ 2
def test_rxor(self):
    """Compare reflected ``^`` on a boolean extension Series with pandas.

    Boolean scalars on the left must match pandas; an integer on the left
    must raise ``TypeError``.
    """
    pser, psser = self.boolean_pdf["this"], self.boolean_psdf["this"]
    # Use `^` here: `|` would exercise the reflected OR path and leave
    # reflected XOR with boolean scalars untested.
    self.check_extension(True ^ pser, True ^ psser)
    self.check_extension(False ^ pser, False ^ psser)
    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        1 ^ psser
def test_from_to_pandas(self):
data = [True, True, False, None]
pser = pd.Series(data, dtype="boolean")

View file

@ -309,6 +309,33 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: True | psser)
self.assertRaises(TypeError, lambda: False | psser)
def test_xor(self):
    """Compare ``^`` on integral Series with pandas, and check rejected operands."""
    # Fetch each frame once so both columns come from the same DataFrame object.
    pdf, psdf = self.integral_pdf, self.integral_psdf
    pser, other_pser = pdf["this"], pdf["that"]
    psser, other_psser = psdf["this"], psdf["that"]

    self.assert_eq(pser ^ other_pser, psser ^ other_psser)
    self.assert_eq(pser ^ 2, psser ^ 2)
    self.assert_eq(pser ^ 3, psser ^ 3)
    self.assert_eq(pser ^ False, psser ^ False)
    self.assert_eq(pser ^ True, psser ^ True)

    # One assertRaisesRegex block per operand: the block exits at the first
    # raise, so a second statement in the same block would never run.
    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ "a"
    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ None

    # Integral Series against a boolean Series taken from a different frame.
    with option_context("compute.ops_on_diff_frames", True):
        pser, other_pser = self.integral_pdf["this"], self.pdf["bool"]
        psser, other_psser = self.integral_psdf["this"], self.psdf["bool"]
        self.assert_eq(pser ^ other_pser, psser ^ other_psser)
def test_rxor(self):
    """Compare reflected ``^`` (scalar on the left) on an int Series with pandas."""
    pser, psser = self.pdf["int"], self.psdf["int"]
    for scalar in (True, False, 1):
        self.assert_eq(scalar ^ pser, scalar ^ psser)
def test_from_to_pandas(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
@ -527,6 +554,37 @@ class IntegralExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
for pser, psser in self.intergral_extension_pser_psser_pairs:
self.check_extension(pser >= pser, (psser >= psser).sort_index())
def test_xor(self):
    """Check that ``^`` on integral extension Series raises ``TypeError``.

    Each Series is tried against an int, itself, and a bool.
    """
    expected_message = "XOR can not be applied to given types."
    for psser in self.intergral_extension_pssers:
        for other in (1, psser, False):
            with self.assertRaisesRegex(TypeError, expected_message):
                psser ^ other
def test_rxor(self):
    """Check that reflected ``^`` on integral extension Series raises ``TypeError``.

    Each Series is tried with an int and a bool on the left-hand side.
    """
    expected_message = "XOR can not be applied to given types."
    for psser in self.intergral_extension_pssers:
        for other in (1, False):
            with self.assertRaisesRegex(TypeError, expected_message):
                other ^ psser
@unittest.skipIf(
not extension_float_dtypes_available, "pandas extension float dtypes are not available"

View file

@ -61,6 +61,14 @@ class TestCasesUtils(object):
def numeric_df_cols(self):
return self.numeric_pdf.columns
@property
def integral_pdf(self):
    """Build a fresh pandas DataFrame with two small integer columns, ``this`` and ``that``."""
    columns = {"this": [1, 2, 3], "that": [2, 2, 1]}
    return pd.DataFrame(columns)
@property
def integral_psdf(self):
    """Return ``self.integral_pdf`` converted with ``ps.from_pandas``.

    Each access performs a new conversion.
    """
    pandas_frame = self.integral_pdf
    return ps.from_pandas(pandas_frame)
# TODO(SPARK-36031): Merge self.numeric_w_nan_p(s)df into self.numeric_p(s)df
@property
def numeric_w_nan_pdf(self):