[SPARK-36653][PYTHON] Implement Series.__xor__ and Series.__rxor__
### What changes were proposed in this pull request? Implement `Series.__xor__` and `Series.__rxor__`. ### Why are the changes needed? To follow pandas behavior. ### Does this PR introduce _any_ user-facing change? Yes, users can now apply the `^` operator to Series: ``` python psdf = ps.DataFrame([[11, 11], [1, 2]]) psdf[0] ^ psdf[1] ``` ### How was this patch tested? Unit tests. Closes #33911 from dgd-contributor/SPARK-36653_Implement_Series._xor_. Authored-by: dgd-contributor <dgd_contributor@viettel.com.vn> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
This commit is contained in:
parent
999473b1a5
commit
f8657d1924
|
@ -428,6 +428,12 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
|
|||
def __ror__(self, other: Any) -> SeriesOrIndex:
    """Reflected ``|``: forward ``other | self`` to the ``ror`` of this object's dtype ops."""
    dtype_ops = self._dtype_op
    return dtype_ops.ror(self, other)
|
||||
|
||||
def __xor__(self, other: Any) -> SeriesOrIndex:
    """``self ^ other``: forward the operation to the ``xor`` of this object's dtype ops."""
    dtype_ops = self._dtype_op
    return dtype_ops.xor(self, other)
|
||||
|
||||
def __rxor__(self, other: Any) -> SeriesOrIndex:
    """Reflected ``^``: forward ``other ^ self`` to the ``rxor`` of this object's dtype ops."""
    dtype_ops = self._dtype_op
    return dtype_ops.rxor(self, other)
|
||||
|
||||
def __len__(self) -> int:
    """Return the length of the backing ``_psdf`` object."""
    backing_frame = self._psdf
    return len(backing_frame)
|
||||
|
||||
|
|
|
@ -195,6 +195,26 @@ def _sanitize_list_like(operand: Any) -> None:
|
|||
raise TypeError("The operation can not be applied to %s." % type(operand).__name__)
|
||||
|
||||
|
||||
def _is_valid_for_logical_operator(right: Any) -> bool:
    """Return whether ``right`` is an acceptable right-hand operand for a logical operator.

    Accepted operands are Python ``int``/``bool`` scalars and ``IndexOpsMixin`` objects
    whose Spark data type is ``BooleanType`` or an ``IntegralType``.
    """
    # Imported at call time rather than module level -- presumably to avoid a circular import.
    from pyspark.pandas.base import IndexOpsMixin

    if isinstance(right, (int, bool)):
        return True
    if not isinstance(right, IndexOpsMixin):
        return False
    spark_type = right.spark.data_type
    return isinstance(spark_type, (BooleanType, IntegralType))
|
||||
|
||||
|
||||
def _is_boolean_type(right: Any) -> bool:
    """Return whether ``right`` is a ``bool`` scalar or an ``IndexOpsMixin`` of ``BooleanType``."""
    # Imported at call time rather than module level -- presumably to avoid a circular import.
    from pyspark.pandas.base import IndexOpsMixin

    if isinstance(right, bool):
        return True
    if not isinstance(right, IndexOpsMixin):
        return False
    return isinstance(right.spark.data_type, BooleanType)
|
||||
|
||||
|
||||
class DataTypeOps(object, metaclass=ABCMeta):
|
||||
"""The base class for binary operations of pandas-on-Spark objects (of different data types)."""
|
||||
|
||||
|
@ -319,6 +339,9 @@ class DataTypeOps(object, metaclass=ABCMeta):
|
|||
def __and__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Default ``&``: always raise ``TypeError`` naming this ops object's ``pretty_name``."""
    message = "Bitwise and can not be applied to %s." % self.pretty_name
    raise TypeError(message)
|
||||
|
||||
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Default ``^``: always raise ``TypeError`` naming this ops object's ``pretty_name``."""
    message = "Bitwise xor can not be applied to %s." % self.pretty_name
    raise TypeError(message)
|
||||
|
||||
def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Default ``|``: always raise ``TypeError`` naming this ops object's ``pretty_name``."""
    message = "Bitwise or can not be applied to %s." % self.pretty_name
    raise TypeError(message)
|
||||
|
||||
|
@ -326,6 +349,10 @@ class DataTypeOps(object, metaclass=ABCMeta):
|
|||
_sanitize_list_like(right)
|
||||
return left.__and__(right)
|
||||
|
||||
def rxor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Reflected XOR: validate ``right`` and evaluate it as the forward ``left ^ right``."""
    # Rejects list-like operands before the operator is applied.
    _sanitize_list_like(right)
    result = left ^ right
    return result
|
||||
|
||||
def ror(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """Reflected OR: validate ``right`` and evaluate it through ``left.__or__(right)``."""
    # Rejects list-like operands before the operator is applied.
    _sanitize_list_like(right)
    result = left.__or__(right)
    return result
|
||||
|
|
|
@ -31,6 +31,8 @@ from pyspark.pandas.data_type_ops.base import (
|
|||
_as_categorical_type,
|
||||
_as_other_type,
|
||||
_sanitize_list_like,
|
||||
_is_valid_for_logical_operator,
|
||||
_is_boolean_type,
|
||||
)
|
||||
from pyspark.pandas.spark import functions as SF
|
||||
from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type
|
||||
|
@ -248,6 +250,25 @@ class BooleanOps(DataTypeOps):
|
|||
|
||||
return column_op(and_func)(left, right)
|
||||
|
||||
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR a boolean operand with ``right``.

    An extension-dtype ``right`` handles the operation itself (``right ^ left``).
    Otherwise ``right`` must satisfy ``_is_valid_for_logical_operator``; both sides are
    cast to integer, XOR-ed bitwise, cast back to boolean, and nulls are turned into False.

    Raises
    ------
    TypeError
        If ``right`` is not a supported operand.
    """
    _sanitize_list_like(right)

    if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
        return right ^ left
    if not _is_valid_for_logical_operator(right):
        raise TypeError("XOR can not be applied to given types.")

    def xor_func(left: Column, right: Any) -> Column:
        # Scalars have to be wrapped into literal columns before they can be cast.
        if not isinstance(right, Column):
            right = SF.lit(None) if pd.isna(right) else SF.lit(right)
        xored = left.cast("integer").bitwiseXOR(right.cast("integer")).cast("boolean")
        # A null result is reported as False for the non-extension boolean dtype.
        return F.when(xored.isNull(), False).otherwise(xored)

    return column_op(xor_func)(left, right)
|
||||
|
||||
def __or__(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
|
||||
_sanitize_list_like(right)
|
||||
if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
|
||||
|
@ -353,6 +374,23 @@ class BooleanExtensionOps(BooleanOps):
|
|||
|
||||
return column_op(or_func)(left, right)
|
||||
|
||||
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR an extension-boolean operand with a boolean ``right``.

    Only operands accepted by ``_is_boolean_type`` are supported. Both sides are cast to
    integer, XOR-ed bitwise and cast back to boolean; no null replacement is applied here.

    Raises
    ------
    TypeError
        If ``right`` is not a boolean operand.
    """
    _sanitize_list_like(right)

    if not _is_boolean_type(right):
        raise TypeError("XOR can not be applied to given types.")

    def xor_func(left: Column, right: Any) -> Column:
        # Scalars have to be wrapped into literal columns before they can be cast.
        if not isinstance(right, Column):
            right = SF.lit(None) if pd.isna(right) else SF.lit(right)
        left_as_int = left.cast("integer")
        right_as_int = right.cast("integer")
        return left_as_int.bitwiseXOR(right_as_int).cast("boolean")

    return column_op(xor_func)(left, right)
|
||||
|
||||
def restore(self, col: pd.Series) -> pd.Series:
    """Cast ``col`` back to this ops object's dtype when converting to pandas."""
    target_dtype = self.dtype
    return col.astype(target_dtype)
|
||||
|
|
|
@ -33,6 +33,8 @@ from pyspark.pandas.data_type_ops.base import (
|
|||
_as_other_type,
|
||||
_as_string_type,
|
||||
_sanitize_list_like,
|
||||
_is_valid_for_logical_operator,
|
||||
_is_boolean_type,
|
||||
)
|
||||
from pyspark.pandas.spark import functions as SF
|
||||
from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
|
||||
|
@ -181,6 +183,30 @@ class IntegralOps(NumericOps):
|
|||
LongType, IntegerType, ByteType and ShortType.
|
||||
"""
|
||||
|
||||
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR an integral operand with ``right``.

    An extension-dtype ``right`` handles the operation itself (``right ^ left``).
    Otherwise ``right`` must satisfy ``_is_valid_for_logical_operator``. With a boolean
    ``right`` the result is cast to boolean; with an integral ``right`` the plain bitwise
    XOR is returned.

    Raises
    ------
    TypeError
        If ``right`` is not a supported operand.
    """
    _sanitize_list_like(right)

    if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, extension_dtypes):
        return right ^ left
    if not _is_valid_for_logical_operator(right):
        raise TypeError("XOR can not be applied to given types.")

    # Decided once, on the original operand, before it is turned into a Spark column.
    right_is_boolean = _is_boolean_type(right)

    def xor_func(left: Column, right: Any) -> Column:
        # Scalars have to be wrapped into literal columns first.
        if not isinstance(right, Column):
            right = SF.lit(None) if pd.isna(right) else SF.lit(right)
        if right_is_boolean:
            return left.bitwiseXOR(right.cast("integer")).cast("boolean")
        return left.bitwiseXOR(right)

    return column_op(xor_func)(left, right)
|
||||
|
||||
@property
def pretty_name(self) -> str:
    """Human-readable name of this family of data types."""
    name = "integrals"
    return name
|
||||
|
@ -435,6 +461,10 @@ class IntegralExtensionOps(IntegralOps):
|
|||
Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype
|
||||
"""
|
||||
|
||||
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    """XOR is not supported here: validate ``right``, then always raise ``TypeError``."""
    # Runs first, so a list-like operand gets the sanitizer's error instead of the one below.
    _sanitize_list_like(right)
    message = "XOR can not be applied to given types."
    raise TypeError(message)
|
||||
|
||||
def restore(self, col: pd.Series) -> pd.Series:
    """Cast ``col`` back to this ops object's dtype when converting to pandas."""
    target_dtype = self.dtype
    return col.astype(target_dtype)
|
||||
|
|
|
@ -2604,6 +2604,9 @@ class Index(IndexOpsMixin):
|
|||
def __xor__(self, other: "Index") -> "Index":
    """``self ^ other`` on an Index is its symmetric difference with ``other``."""
    difference = self.symmetric_difference(other)
    return difference
|
||||
|
||||
def __rxor__(self, other: Any) -> "Index":
    """Reflected ``^`` is not implemented for Index; signal that with ``NotImplemented``."""
    # Returning NotImplemented (not raising) lets Python produce its usual TypeError.
    return NotImplemented
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
raise ValueError(
|
||||
"The truth value of a {0} is ambiguous. "
|
||||
|
|
|
@ -24,6 +24,7 @@ import numpy as np
|
|||
from pandas.api.types import CategoricalDtype
|
||||
|
||||
from pyspark import pandas as ps
|
||||
from pyspark.pandas import option_context
|
||||
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
|
||||
from pyspark.pandas.typedef.typehints import (
|
||||
extension_float_dtypes_available,
|
||||
|
@ -286,6 +287,32 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
self.assert_eq(True | pser, True | psser)
|
||||
self.assert_eq(False | pser, False | psser)
|
||||
|
||||
def test_xor(self):
    """``^`` on boolean Series must match pandas for Series, bool and int operands."""
    pdf, psdf = self.bool_pdf, self.bool_psdf
    pser, psser = pdf["this"], psdf["this"]
    other_pser, other_psser = pdf["that"], psdf["that"]

    self.assert_eq(pser ^ other_pser, psser ^ other_psser)
    for scalar in (True, False, 2, 99):
        self.assert_eq(pser ^ scalar, psser ^ scalar)

    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ "a"

    # Boolean ^ integral, where the two operands come from different frames.
    with option_context("compute.ops_on_diff_frames", True):
        bool_pser, bool_psser = self.pdf["bool"], self.psdf["bool"]
        int_pser, int_psser = self.integral_pdf["this"], self.integral_psdf["this"]

        self.assert_eq(bool_pser ^ int_pser, bool_psser ^ int_psser)
|
||||
|
||||
def test_rxor(self):
    """Reflected ``^`` with a scalar on the left must match pandas for boolean Series."""
    pser = self.pdf["bool"]
    psser = self.psdf["bool"]
    for scalar in (True, False, 1):
        self.assert_eq(scalar ^ pser, scalar ^ psser)
|
||||
|
||||
def test_isnull(self):
    """``isnull`` on the boolean column must match pandas."""
    expected = self.pdf["bool"].isnull()
    actual = self.psdf["bool"].isnull()
    self.assert_eq(expected, actual)
|
||||
|
||||
|
@ -686,6 +713,26 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
self.check_extension(True | pser, True | psser)
|
||||
self.check_extension(False | pser, False | psser)
|
||||
|
||||
def test_xor(self):
    """``^`` on extension-boolean Series matches pandas and rejects integer operands."""
    pdf, psdf = self.boolean_pdf, self.boolean_psdf
    pser, other_pser = pdf["this"], pdf["that"]
    psser, other_psser = psdf["this"], psdf["that"]

    for scalar in (True, False):
        self.check_extension(pser ^ scalar, psser ^ scalar)
    self.check_extension(pser ^ pser, psser ^ psser)

    # Both operand orders of the Series-to-Series case.
    self.check_extension(pser ^ other_pser, psser ^ other_psser)
    self.check_extension(other_pser ^ pser, other_psser ^ psser)

    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ 2
|
||||
|
||||
def test_rxor(self):
    """Reflected ``^`` on extension-boolean Series matches pandas and rejects integers."""
    pser, psser = self.boolean_pdf["this"], self.boolean_psdf["this"]
    # These checks previously used ``|`` (copied from the OR test), so the reflected
    # XOR path was never exercised; they must use ``^``.
    self.check_extension(True ^ pser, True ^ psser)
    self.check_extension(False ^ pser, False ^ psser)
    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        1 ^ psser
|
||||
|
||||
def test_from_to_pandas(self):
|
||||
data = [True, True, False, None]
|
||||
pser = pd.Series(data, dtype="boolean")
|
||||
|
|
|
@ -309,6 +309,33 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
self.assertRaises(TypeError, lambda: True | psser)
|
||||
self.assertRaises(TypeError, lambda: False | psser)
|
||||
|
||||
def test_xor(self):
    """``^`` on integral Series matches pandas and rejects unsupported operands."""
    pdf, psdf = self.integral_pdf, self.integral_psdf
    pser, other_pser = pdf["this"], pdf["that"]
    psser, other_psser = psdf["this"], psdf["that"]

    self.assert_eq(pser ^ other_pser, psser ^ other_psser)
    self.assert_eq(pser ^ 2, psser ^ 2)
    self.assert_eq(pser ^ 3, psser ^ 3)
    self.assert_eq(pser ^ False, psser ^ False)
    self.assert_eq(pser ^ True, psser ^ True)

    # Each invalid operand gets its own context manager: inside a single ``with``
    # block the first raise ends the block, so a second statement would never run.
    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ "a"
    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ None

    # Integral ^ boolean, where the two operands come from different frames.
    with option_context("compute.ops_on_diff_frames", True):
        pser, other_pser = self.integral_pdf["this"], self.pdf["bool"]
        psser, other_psser = self.integral_psdf["this"], self.psdf["bool"]

        self.assert_eq(pser ^ other_pser, psser ^ other_psser)
|
||||
|
||||
def test_rxor(self):
    """Reflected ``^`` with a scalar on the left must match pandas for the int column."""
    pser = self.pdf["int"]
    psser = self.psdf["int"]
    for scalar in (True, False, 1):
        self.assert_eq(scalar ^ pser, scalar ^ psser)
|
||||
|
||||
def test_from_to_pandas(self):
|
||||
pdf, psdf = self.pdf, self.psdf
|
||||
for col in self.numeric_df_cols:
|
||||
|
@ -527,6 +554,37 @@ class IntegralExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
|
|||
for pser, psser in self.intergral_extension_pser_psser_pairs:
|
||||
self.check_extension(pser >= pser, (psser >= psser).sort_index())
|
||||
|
||||
def test_xor(self):
    """``^`` on every integral extension Series raises ``TypeError`` for each operand."""
    expected_message = "XOR can not be applied to given types."
    for psser in self.intergral_extension_pssers:
        # Same operands, in the same order: an int, the Series itself, a bool.
        for operand in (1, psser, False):
            with self.assertRaisesRegex(TypeError, expected_message):
                psser ^ operand
|
||||
|
||||
def test_rxor(self):
    """Reflected ``^`` on every integral extension Series raises ``TypeError``."""
    expected_message = "XOR can not be applied to given types."
    for psser in self.intergral_extension_pssers:
        for scalar in (1, False):
            with self.assertRaisesRegex(TypeError, expected_message):
                scalar ^ psser
|
||||
|
||||
|
||||
@unittest.skipIf(
|
||||
not extension_float_dtypes_available, "pandas extension float dtypes are not available"
|
||||
|
|
|
@ -61,6 +61,14 @@ class TestCasesUtils(object):
|
|||
def numeric_df_cols(self):
|
||||
return self.numeric_pdf.columns
|
||||
|
||||
@property
def integral_pdf(self):
    """Small pandas DataFrame with two integer columns, ``this`` and ``that``."""
    columns = {"this": [1, 2, 3], "that": [2, 2, 1]}
    return pd.DataFrame(columns)
|
||||
|
||||
@property
def integral_psdf(self):
    """pandas-on-Spark counterpart of ``integral_pdf``, built with ``ps.from_pandas``."""
    pandas_frame = self.integral_pdf
    return ps.from_pandas(pandas_frame)
|
||||
|
||||
# TODO(SPARK-36031): Merge self.numeric_w_nan_p(s)df into self.numeric_p(s)df
|
||||
@property
|
||||
def numeric_w_nan_pdf(self):
|
||||
|
|
Loading…
Reference in a new issue