Revert "[SPARK-35338][PYTHON] Separate arithmetic operations into data type based structures"

This reverts commit d1b24d8aba.
This commit is contained in:
Takuya UESHIN 2021-05-19 16:49:47 -07:00
parent 586caae3cc
commit d44e6c7f10
23 changed files with 259 additions and 1881 deletions

View file

@ -611,12 +611,6 @@ pyspark_pandas = Module(
"pyspark.pandas.spark.utils",
"pyspark.pandas.typedef.typehints",
# unittests
"pyspark.pandas.tests.data_type_ops.test_boolean_ops",
"pyspark.pandas.tests.data_type_ops.test_categorical_ops",
"pyspark.pandas.tests.data_type_ops.test_date_ops",
"pyspark.pandas.tests.data_type_ops.test_datetime_ops",
"pyspark.pandas.tests.data_type_ops.test_num_ops",
"pyspark.pandas.tests.data_type_ops.test_string_ops",
"pyspark.pandas.tests.indexes.test_base",
"pyspark.pandas.tests.indexes.test_category",
"pyspark.pandas.tests.indexes.test_datetime",

View file

@ -19,6 +19,7 @@
Base and utility classes for pandas-on-Spark objects.
"""
from abc import ABCMeta, abstractmethod
import datetime
from functools import wraps, partial
from itertools import chain
from typing import Any, Callable, Optional, Tuple, Union, cast, TYPE_CHECKING
@ -34,6 +35,7 @@ from pyspark.sql.types import (
DateType,
DoubleType,
FloatType,
IntegralType,
LongType,
NumericType,
StringType,
@ -48,9 +50,11 @@ from pyspark.pandas.internal import (
NATURAL_ORDER_COLUMN_NAME,
SPARK_DEFAULT_INDEX_NAME,
)
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.spark.accessors import SparkIndexOpsMethods
from pyspark.pandas.typedef import (
Dtype,
as_spark_type,
extension_dtypes,
pandas_on_spark_type,
spark_type_to_pandas_dtype,
@ -318,23 +322,100 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
spark_column.__doc__ = SparkIndexOpsMethods.column.__doc__
@property
def _dtype_op(self):
    """Build the ``DataTypeOps`` object matching this object's dtype and Spark data type."""
    # The import is done at call time rather than at module import time
    # (presumably to avoid a circular import with the data_type_ops package -- confirm).
    from pyspark.pandas.data_type_ops.base import DataTypeOps

    dtype_ops = DataTypeOps(self.dtype, self.spark.data_type)
    return dtype_ops
# arithmetic operators
# Unary minus: `__neg__` is bound to whatever `column_op` returns when given `Column.__neg__`.
__neg__ = column_op(Column.__neg__)
# Addition (`self + other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __add__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__add__(self, other)
# A non-string self may not be added to a string-typed operand or a str literal.
if not isinstance(self.spark.data_type, StringType) and (
(isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType))
or isinstance(other, str)
):
raise TypeError("string addition can only be applied to string series or literals.")
# Timestamp-typed self rejects addition regardless of the other operand.
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("addition can not be applied to date times.")
if isinstance(self.spark.data_type, StringType):
# Concatenate string columns
if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType):
return column_op(F.concat)(self, other)
# Handle df['col'] + 'literal'
elif isinstance(other, str):
return column_op(F.concat)(self, F.lit(other))
else:
raise TypeError("string addition can only be applied to string series or literals.")
else:
# Every remaining type combination is handed to Column.__add__ via column_op.
return column_op(Column.__add__)(self, other)
# Subtraction (`self - other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __sub__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__sub__(self, other)
# Any string involvement (self, a string-typed operand, or a str literal) is rejected.
if (
isinstance(self.spark.data_type, StringType)
or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType))
or isinstance(other, str)
):
raise TypeError("substraction can not be applied to string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
# Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
# behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
msg = (
"Note that there is a behavior difference of timestamp subtraction. "
"The timestamp subtraction returns an integer in seconds, "
"whereas pandas returns 'timedelta64[ns]'."
)
# Timestamp-typed operand: both sides are cast to "long" and subtracted.
if isinstance(other, IndexOpsMixin) and isinstance(
other.spark.data_type, TimestampType
):
warnings.warn(msg, UserWarning)
return self.astype("long") - other.astype("long")
# datetime.datetime literal: wrapped with F.lit and cast to the Spark type for "long".
elif isinstance(other, datetime.datetime):
warnings.warn(msg, UserWarning)
return self.astype("long") - F.lit(other).cast(as_spark_type("long"))
else:
raise TypeError("datetime subtraction can only be applied to datetime series.")
elif isinstance(self.spark.data_type, DateType):
# Note that date subtraction casts arguments to integer. This is to mimic pandas's
# behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction.
msg = (
"Note that there is a behavior difference of date subtraction. "
"The date subtraction returns an integer in days, "
"whereas pandas returns 'timedelta64[ns]'."
)
# Date-typed operand: F.datediff of the two columns, cast to "long".
if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, DateType):
warnings.warn(msg, UserWarning)
return column_op(F.datediff)(self, other).astype("long")
# Only plain datetime.date literals are accepted; datetime.datetime is excluded explicitly
# because it is a subclass of datetime.date.
elif isinstance(other, datetime.date) and not isinstance(other, datetime.datetime):
warnings.warn(msg, UserWarning)
return column_op(F.datediff)(self, F.lit(other)).astype("long")
else:
raise TypeError("date subtraction can only be applied to date series.")
# Every remaining type combination is handed to Column.__sub__ via column_op.
return column_op(Column.__sub__)(self, other)
# Multiplication (`self * other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __mul__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__mul__(self, other)
# A str literal operand is always rejected, even when self is a string column.
if isinstance(other, str):
raise TypeError("multiplication can not be applied to a string literal.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("multiplication can not be applied to date times.")
# integral self * string-typed operand: SF.repeat is called with the string operand first.
if (
isinstance(self.spark.data_type, IntegralType)
and isinstance(other, IndexOpsMixin)
and isinstance(other.spark.data_type, StringType)
):
return column_op(SF.repeat)(other, self)
if isinstance(self.spark.data_type, StringType):
# string self * (integral-typed operand or int literal): SF.repeat(self, other).
if (
isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, IntegralType)
) or isinstance(other, int):
return column_op(SF.repeat)(self, other)
else:
raise TypeError(
"a string series can only be multiplied to an int series or literal"
)
# Every remaining type combination is handed to Column.__mul__ via column_op.
return column_op(Column.__mul__)(self, other)
# True division (`self / other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
# The docstring is interrupted by a hunk header, so most of its text is not visible here.
def __truediv__(self, other) -> Union["Series", "Index"]:
"""
@ -353,22 +434,122 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
| -10 | null | -np.inf |
+-----------------------|---------|---------+
"""
return self._dtype_op.__truediv__(self, other)
# Any string involvement (self, a string-typed operand, or a str literal) is rejected.
if (
isinstance(self.spark.data_type, StringType)
or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType))
or isinstance(other, str)
):
raise TypeError("division can not be applied on string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("division can not be applied to date times.")
# Expression applied through numpy_column_op:
#   - right != 0 or right is null  -> left / right
#   - otherwise, left is +/-inf    -> left
#   - otherwise                    -> inf / left
def truediv(left, right):
return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise(
F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
F.lit(np.inf).__div__(left)
)
)
return numpy_column_op(truediv)(self, other)
# Modulo (`self % other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __mod__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__mod__(self, other)
# Any string involvement (self, a string-typed operand, or a str literal) is rejected.
if (
isinstance(self.spark.data_type, StringType)
or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType))
or isinstance(other, str)
):
raise TypeError("modulo can not be applied on string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("modulo can not be applied to date times.")
# Takes the remainder, adds the divisor, and takes the remainder again
# (presumably so the result follows the divisor's sign as in Python/pandas -- confirm).
def mod(left, right):
return ((left % right) + right) % right
return column_op(mod)(self, other)
# Reflected addition (`other + self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __radd__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__radd__(self, other)
# Handle 'literal' + df['col']
if not isinstance(self.spark.data_type, StringType) and isinstance(other, str):
raise TypeError("string addition can only be applied to string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("addition can not be applied to date times.")
if isinstance(self.spark.data_type, StringType):
# str literal + string self: the literal is placed first in the concatenation.
if isinstance(other, str):
return self._with_new_scol(
F.concat(F.lit(other), self.spark.column)
) # TODO: dtype?
else:
raise TypeError("string addition can only be applied to string series or literals.")
else:
# Every remaining type combination is handed to Column.__radd__ via column_op.
return column_op(Column.__radd__)(self, other)
# Reflected subtraction (`other - self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __rsub__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__rsub__(self, other)
# A string self or a str literal operand is rejected.
if isinstance(self.spark.data_type, StringType) or isinstance(other, str):
raise TypeError("substraction can not be applied to string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
# Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
# behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
msg = (
"Note that there is a behavior difference of timestamp subtraction. "
"The timestamp subtraction returns an integer in seconds, "
"whereas pandas returns 'timedelta64[ns]'."
)
# Computes (self - literal) on "long" values and negates it to obtain (literal - self).
if isinstance(other, datetime.datetime):
warnings.warn(msg, UserWarning)
return -(self.astype("long") - F.lit(other).cast(as_spark_type("long")))
else:
raise TypeError("datetime subtraction can only be applied to datetime series.")
elif isinstance(self.spark.data_type, DateType):
# Note that date subtraction casts arguments to integer. This is to mimic pandas's
# behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction.
msg = (
"Note that there is a behavior difference of date subtraction. "
"The date subtraction returns an integer in days, "
"whereas pandas returns 'timedelta64[ns]'."
)
# Only plain datetime.date literals are accepted; the F.datediff(self, literal) result is
# cast to "long" and negated.
if isinstance(other, datetime.date) and not isinstance(other, datetime.datetime):
warnings.warn(msg, UserWarning)
return -column_op(F.datediff)(self, F.lit(other)).astype("long")
else:
raise TypeError("date subtraction can only be applied to date series.")
# Every remaining type combination is handed to Column.__rsub__ via column_op.
return column_op(Column.__rsub__)(self, other)
# Reflected multiplication (`other * self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __rmul__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__rmul__(self, other)
if isinstance(other, str):
raise TypeError("multiplication can not be applied to a string literal.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("multiplication can not be applied to date times.")
if isinstance(self.spark.data_type, StringType):
# int literal * string self: SF.repeat(self, other).
if isinstance(other, int):
return column_op(SF.repeat)(self, other)
else:
raise TypeError(
"a string series can only be multiplied to an int series or literal"
)
# Every remaining type combination is handed to Column.__rmul__ via column_op.
return column_op(Column.__rmul__)(self, other)
# Reflected true division (`other / self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __rtruediv__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__rtruediv__(self, other)
if isinstance(self.spark.data_type, StringType) or isinstance(other, str):
raise TypeError("division can not be applied on string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("division can not be applied to date times.")
# Expression applied through numpy_column_op:
#   - left == 0  -> inf / right
#   - otherwise  -> right / left
def rtruediv(left, right):
return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise(
F.lit(right).__truediv__(left)
)
return numpy_column_op(rtruediv)(self, other)
# Floor division (`self // other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
# The docstring is interrupted by a hunk header, so most of its text is not visible here.
def __floordiv__(self, other) -> Union["Series", "Index"]:
"""
@ -387,19 +568,66 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
| -10 | null | -np.inf |
+-----------------------|---------|---------+
"""
return self._dtype_op.__floordiv__(self, other)
# Any string involvement (self, a string-typed operand, or a str literal) is rejected.
if (
isinstance(self.spark.data_type, StringType)
or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType))
or isinstance(other, str)
):
raise TypeError("division can not be applied on string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("division can not be applied to date times.")
# Expression applied through numpy_column_op:
#   - `right is np.nan` (a Python identity test, evaluated before F.lit wraps it) -> NaN
#   - right != 0 or right is null  -> floor(left / right)
#   - otherwise, left is +/-inf    -> left
#   - otherwise                    -> inf / left
def floordiv(left, right):
return F.when(F.lit(right is np.nan), np.nan).otherwise(
F.when(
F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right))
).otherwise(
F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
F.lit(np.inf).__div__(left)
)
)
)
return numpy_column_op(floordiv)(self, other)
# Reflected floor division (`other // self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __rfloordiv__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__rfloordiv__(self, other)
if isinstance(self.spark.data_type, StringType) or isinstance(other, str):
raise TypeError("division can not be applied on string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("division can not be applied to date times.")
# Expression applied through numpy_column_op:
#   - left == 0       -> inf / right
#   - left == np.nan  -> NaN
#   - otherwise       -> floor(right / left)
def rfloordiv(left, right):
return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
F.when(F.lit(left) == np.nan, np.nan).otherwise(F.floor(F.lit(right).__div__(left)))
)
return numpy_column_op(rfloordiv)(self, other)
# Reflected modulo (`other % self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __rmod__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__rmod__(self, other)
if isinstance(self.spark.data_type, StringType) or isinstance(other, str):
raise TypeError("modulo can not be applied on string series or literals.")
if isinstance(self.spark.data_type, TimestampType):
raise TypeError("modulo can not be applied to date times.")
# Same remainder / add-divisor / remainder shape as the forward modulo, with the operand
# roles swapped: here `left` (self) is the divisor.
def rmod(left, right):
return ((right % left) + left) % left
return column_op(rmod)(self, other)
# Exponentiation (`self ** other`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __pow__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__pow__(self, other)
# Where the base equals 1 the base itself is returned; otherwise Column.__pow__ is used
# (presumably so that 1 ** null/NaN stays 1 -- confirm).
def pow_func(left, right):
return F.when(left == 1, left).otherwise(Column.__pow__(left, right))
return column_op(pow_func)(self, other)
# Reflected exponentiation (`other ** self`).
# NOTE(review): this page appears to be a diff with its +/- markers stripped; the `_dtype_op`
# delegation below is presumably the removed line and the rest the restored body -- confirm.
def __rpow__(self, other) -> Union["Series", "Index"]:
return self._dtype_op.__rpow__(self, other)
# Where `right` (the base of the reflected operation) equals 1 it is returned as is;
# otherwise Column.__rpow__ is used.
def rpow_func(left, right):
return F.when(F.lit(right == 1), right).otherwise(Column.__rpow__(left, right))
return column_op(rpow_func)(self, other)
# Absolute value: `__abs__` is bound to whatever `column_op` returns when given `F.abs`.
__abs__ = column_op(F.abs)

View file

@ -1,16 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View file

@ -1,120 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABCMeta, abstractmethod
from typing import TYPE_CHECKING, Union
from pandas.api.types import CategoricalDtype
from pyspark.sql.types import (
BooleanType,
DataType,
DateType,
FractionalType,
IntegralType,
StringType,
TimestampType,
)
from pyspark.pandas.typedef import Dtype
if TYPE_CHECKING:
from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943)
from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943)
class DataTypeOps(object, metaclass=ABCMeta):
    """
    Base class for the binary operations of pandas-on-Spark objects, one subclass per data type.

    Constructing ``DataTypeOps(dtype, spark_type)`` yields an instance of the subclass that
    matches the given types. Every arithmetic hook defined here raises ``TypeError``; a
    subclass overrides only the operations its data type supports.
    """

    def __new__(cls, dtype: Dtype, spark_type: DataType):
        # The concrete classes are imported at call time, not at module import time.
        from pyspark.pandas.data_type_ops.boolean_ops import BooleanOps
        from pyspark.pandas.data_type_ops.categorical_ops import CategoricalOps
        from pyspark.pandas.data_type_ops.date_ops import DateOps
        from pyspark.pandas.data_type_ops.datetime_ops import DatetimeOps
        from pyspark.pandas.data_type_ops.num_ops import (
            IntegralOps,
            FractionalOps,
        )
        from pyspark.pandas.data_type_ops.string_ops import StringOps

        # Categoricals are recognised from the pandas dtype and take precedence over
        # whatever Spark type backs them.
        if isinstance(dtype, CategoricalDtype):
            return object.__new__(CategoricalOps)

        # Checked in order; the first matching Spark type decides the ops class.
        ops_by_spark_type = (
            (FractionalType, FractionalOps),
            (IntegralType, IntegralOps),
            (StringType, StringOps),
            (BooleanType, BooleanOps),
            (TimestampType, DatetimeOps),
            (DateType, DateOps),
        )
        for candidate_type, ops_class in ops_by_spark_type:
            if isinstance(spark_type, candidate_type):
                return object.__new__(ops_class)

        raise TypeError("Type %s was not understood." % dtype)

    def __init__(self, dtype: Dtype, spark_type: DataType):
        self.dtype = dtype
        self.spark_type = spark_type

    @property
    @abstractmethod
    def pretty_name(self) -> str:
        """Name of the data type as it appears in the error messages raised below."""
        raise NotImplementedError()

    # Default hooks: each one rejects the operation and names the data type in the message.

    def __add__(self, left, right) -> Union["Series", "Index"]:
        message = "Addition can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __sub__(self, left, right) -> Union["Series", "Index"]:
        message = "Subtraction can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __mul__(self, left, right) -> Union["Series", "Index"]:
        message = "Multiplication can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __truediv__(self, left, right) -> Union["Series", "Index"]:
        message = "True division can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __floordiv__(self, left, right) -> Union["Series", "Index"]:
        message = "Floor division can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __mod__(self, left, right) -> Union["Series", "Index"]:
        message = "Modulo can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __pow__(self, left, right) -> Union["Series", "Index"]:
        message = "Exponentiation can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __radd__(self, left, right) -> Union["Series", "Index"]:
        message = "Addition can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __rsub__(self, left, right) -> Union["Series", "Index"]:
        message = "Subtraction can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __rmul__(self, left, right) -> Union["Series", "Index"]:
        message = "Multiplication can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __rtruediv__(self, left, right) -> Union["Series", "Index"]:
        message = "True division can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __rfloordiv__(self, left, right) -> Union["Series", "Index"]:
        message = "Floor division can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __rmod__(self, left, right) -> Union["Series", "Index"]:
        message = "Modulo can not be applied to %s." % self.pretty_name
        raise TypeError(message)

    def __rpow__(self, left, right) -> Union["Series", "Index"]:
        message = "Exponentiation can not be applied to %s." % self.pretty_name
        raise TypeError(message)

View file

@ -1,28 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.pandas.data_type_ops.base import DataTypeOps
class BooleanOps(DataTypeOps):
    """
    Binary-operation handlers for pandas-on-Spark objects whose Spark type is BooleanType.

    Nothing but the display name is overridden here, so every arithmetic hook is the one
    inherited from ``DataTypeOps``.
    """

    @property
    def pretty_name(self) -> str:
        """Name used for this data type in error messages."""
        name = "booleans"
        return name

View file

@ -1,28 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.pandas.data_type_ops.base import DataTypeOps
class CategoricalOps(DataTypeOps):
    """
    Binary-operation handlers for pandas-on-Spark objects with a categorical dtype.

    Nothing but the display name is overridden here, so every arithmetic hook is the one
    inherited from ``DataTypeOps``.
    """

    @property
    def pretty_name(self) -> str:
        """Name used for this data type in error messages."""
        name = "categoricals"
        return name

View file

@ -1,71 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import warnings
from typing import TYPE_CHECKING, Union
from pyspark.sql import functions as F
from pyspark.sql.types import DateType
from pyspark.pandas.base import column_op, IndexOpsMixin
from pyspark.pandas.data_type_ops.base import DataTypeOps
if TYPE_CHECKING:
from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943)
from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943)
class DateOps(DataTypeOps):
    """
    Binary-operation handlers for pandas-on-Spark objects whose Spark type is DateType.

    Only subtraction and reflected subtraction are overridden in this class.
    """

    @property
    def pretty_name(self) -> str:
        """Name used for this data type in error messages."""
        return "dates"

    def __sub__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left - right`` as a "long" built from ``F.datediff``.

        ``right`` must be a date-typed pandas-on-Spark object or a ``datetime.date`` that is
        not a ``datetime.datetime``; anything else raises ``TypeError``. A ``UserWarning``
        about the difference from pandas is emitted before the result is built.
        """
        is_date_series = isinstance(right, IndexOpsMixin) and isinstance(
            right.spark.data_type, DateType
        )
        # datetime.datetime subclasses datetime.date, hence the explicit exclusion.
        is_date_literal = (
            not is_date_series
            and isinstance(right, datetime.date)
            and not isinstance(right, datetime.datetime)
        )
        if not (is_date_series or is_date_literal):
            raise TypeError("date subtraction can only be applied to date series.")

        # Note that date subtraction casts arguments to integer. This is to mimic pandas's
        # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction.
        warnings.warn(
            "Note that there is a behavior difference of date subtraction. "
            "The date subtraction returns an integer in days, "
            "whereas pandas returns 'timedelta64[ns]'.",
            UserWarning,
        )
        subtrahend = right if is_date_series else F.lit(right)
        day_difference = column_op(F.datediff)(left, subtrahend)
        return day_difference.astype("long")

    def __rsub__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``right - left`` for a ``datetime.date`` literal ``right``.

        The result is the negated ``F.datediff(left, right)`` cast to "long". Any ``right``
        that is not a plain ``datetime.date`` raises ``TypeError``.
        """
        if not isinstance(right, datetime.date) or isinstance(right, datetime.datetime):
            raise TypeError("date subtraction can only be applied to date series.")

        # Note that date subtraction casts arguments to integer. This is to mimic pandas's
        # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction.
        warnings.warn(
            "Note that there is a behavior difference of date subtraction. "
            "The date subtraction returns an integer in days, "
            "whereas pandas returns 'timedelta64[ns]'.",
            UserWarning,
        )
        day_difference = column_op(F.datediff)(left, F.lit(right)).astype("long")
        return -day_difference

View file

@ -1,72 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import warnings
from typing import TYPE_CHECKING, Union
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType
from pyspark.pandas.base import IndexOpsMixin
from pyspark.pandas.data_type_ops.base import DataTypeOps
from pyspark.pandas.typedef import as_spark_type
if TYPE_CHECKING:
from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943)
from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943)
class DatetimeOps(DataTypeOps):
    """
    Binary-operation handlers for pandas-on-Spark objects whose Spark type is TimestampType.

    Only subtraction and reflected subtraction are overridden in this class.
    """

    @property
    def pretty_name(self) -> str:
        """Name used for this data type in error messages."""
        return "datetimes"

    def __sub__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left - right`` on values cast to "long".

        ``right`` must be a timestamp-typed pandas-on-Spark object or a
        ``datetime.datetime``; anything else raises ``TypeError``. A ``UserWarning`` about
        the difference from pandas is emitted before the result is built.
        """
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
            right_is_series = True
        elif isinstance(right, datetime.datetime):
            right_is_series = False
        else:
            raise TypeError("datetime subtraction can only be applied to datetime series.")

        # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
        # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
        warnings.warn(
            "Note that there is a behavior difference of timestamp subtraction. "
            "The timestamp subtraction returns an integer in seconds, "
            "whereas pandas returns 'timedelta64[ns]'.",
            UserWarning,
        )
        minuend = left.astype("long")
        if right_is_series:
            return minuend - right.astype("long")
        return minuend - F.lit(right).cast(as_spark_type("long"))

    def __rsub__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``right - left`` for a ``datetime.datetime`` literal ``right``.

        The result is the negation of ``left - right`` computed on values cast to "long".
        Any ``right`` that is not a ``datetime.datetime`` raises ``TypeError``.
        """
        if not isinstance(right, datetime.datetime):
            raise TypeError("datetime subtraction can only be applied to datetime series.")

        # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
        # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
        warnings.warn(
            "Note that there is a behavior difference of timestamp subtraction. "
            "The timestamp subtraction returns an integer in seconds, "
            "whereas pandas returns 'timedelta64[ns]'.",
            UserWarning,
        )
        forward_difference = left.astype("long") - F.lit(right).cast(as_spark_type("long"))
        return -forward_difference

View file

@ -1,378 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numbers
from typing import TYPE_CHECKING, Union
import numpy as np
from pandas.api.types import CategoricalDtype
from pyspark.sql import Column, functions as F
from pyspark.sql.types import (
NumericType,
StringType,
TimestampType,
)
from pyspark.pandas.base import column_op, IndexOpsMixin, numpy_column_op
from pyspark.pandas.data_type_ops.base import DataTypeOps
from pyspark.pandas.spark import functions as SF
if TYPE_CHECKING:
from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943)
from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943)
class NumericOps(DataTypeOps):
    """
    Binary-operation handlers shared by numeric pandas-on-Spark objects.

    The forward operators (``+``, ``-``, ``%``, ``**``) first reject string operands, then
    reject pandas-on-Spark operands that are categorical or not backed by a Spark
    ``NumericType``. The reflected operators accept only ``numbers.Number`` literals.
    """

    @property
    def pretty_name(self) -> str:
        """Name used for this data type in error messages."""
        return "numerics"

    def __add__(self, left, right) -> Union["Series", "Index"]:
        """Return ``left + right`` through ``Column.__add__`` after validating ``right``."""
        right_is_ops = isinstance(right, IndexOpsMixin)
        if isinstance(right, str) or (
            right_is_ops and isinstance(right.spark.data_type, StringType)
        ):
            raise TypeError("string addition can only be applied to string series or literals.")
        if (
            right_is_ops
            and not isinstance(right, numbers.Number)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
        ):
            raise TypeError("addition can not be applied to given types.")

        add_columns = column_op(Column.__add__)
        return add_columns(left, right)

    def __sub__(self, left, right) -> Union["Series", "Index"]:
        """Return ``left - right`` through ``Column.__sub__`` after validating ``right``."""
        right_is_ops = isinstance(right, IndexOpsMixin)
        if isinstance(right, str) or (
            right_is_ops and isinstance(right.spark.data_type, StringType)
        ):
            raise TypeError("subtraction can not be applied to string series or literals.")
        if (
            right_is_ops
            and not isinstance(right, numbers.Number)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
        ):
            raise TypeError("subtraction can not be applied to given types.")

        subtract_columns = column_op(Column.__sub__)
        return subtract_columns(left, right)

    def __mod__(self, left, right) -> Union["Series", "Index"]:
        """Return ``left % right`` after validating ``right``."""
        right_is_ops = isinstance(right, IndexOpsMixin)
        if isinstance(right, str) or (
            right_is_ops and isinstance(right.spark.data_type, StringType)
        ):
            raise TypeError("modulo can not be applied on string series or literals.")
        if (
            right_is_ops
            and not isinstance(right, numbers.Number)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
        ):
            raise TypeError("modulo can not be applied to given types.")

        def mod(dividend, divisor):
            # Remainder, plus the divisor, then remainder again.
            shifted = (dividend % divisor) + divisor
            return shifted % divisor

        return column_op(mod)(left, right)

    def __pow__(self, left, right) -> Union["Series", "Index"]:
        """Return ``left ** right`` after validating ``right``."""
        right_is_ops = isinstance(right, IndexOpsMixin)
        if isinstance(right, str) or (
            right_is_ops and isinstance(right.spark.data_type, StringType)
        ):
            raise TypeError("exponentiation can not be applied on string series or literals.")
        if (
            right_is_ops
            and not isinstance(right, numbers.Number)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
        ):
            raise TypeError("exponentiation can not be applied to given types.")

        def pow_func(base, exponent):
            # A base equal to 1 is returned as is; everything else goes to Column.__pow__.
            powered = Column.__pow__(base, exponent)
            return F.when(base == 1, base).otherwise(powered)

        return column_op(pow_func)(left, right)

    def __radd__(self, left, right) -> Union["Series", "Index"]:
        """Return ``right + left`` for a numeric literal ``right``."""
        if isinstance(right, str):
            raise TypeError("string addition can only be applied to string series or literals.")
        if not isinstance(right, numbers.Number):
            raise TypeError("addition can not be applied to given types.")

        reflected_add = column_op(Column.__radd__)
        return reflected_add(left, right)

    def __rsub__(self, left, right) -> Union["Series", "Index"]:
        """Return ``right - left`` for a numeric literal ``right``."""
        if isinstance(right, str):
            raise TypeError("subtraction can not be applied to string series or literals.")
        if not isinstance(right, numbers.Number):
            raise TypeError("subtraction can not be applied to given types.")

        reflected_sub = column_op(Column.__rsub__)
        return reflected_sub(left, right)

    def __rmul__(self, left, right) -> Union["Series", "Index"]:
        """Return ``right * left`` for a numeric literal ``right``."""
        if isinstance(right, str):
            raise TypeError("multiplication can not be applied to a string literal.")
        if not isinstance(right, numbers.Number):
            raise TypeError("multiplication can not be applied to given types.")

        reflected_mul = column_op(Column.__rmul__)
        return reflected_mul(left, right)

    def __rpow__(self, left, right) -> Union["Series", "Index"]:
        """Return ``right ** left`` for a numeric literal ``right``."""
        if isinstance(right, str):
            raise TypeError("exponentiation can not be applied on string series or literals.")
        if not isinstance(right, numbers.Number):
            raise TypeError("exponentiation can not be applied to given types.")

        def rpow_func(exponent, base):
            # A base equal to 1 is returned as is; everything else goes to Column.__rpow__.
            powered = Column.__rpow__(exponent, base)
            return F.when(F.lit(base == 1), base).otherwise(powered)

        return column_op(rpow_func)(left, right)

    def __rmod__(self, left, right) -> Union["Series", "Index"]:
        """Return ``right % left`` for a numeric literal ``right``."""
        if isinstance(right, str):
            raise TypeError("modulo can not be applied on string series or literals.")
        if not isinstance(right, numbers.Number):
            raise TypeError("modulo can not be applied to given types.")

        def rmod(divisor, dividend):
            # Remainder, plus the divisor, then remainder again.
            shifted = (dividend % divisor) + divisor
            return shifted % divisor

        return column_op(rmod)(left, right)
class IntegralOps(NumericOps):
    """
    The class for binary operations of pandas-on-Spark objects with spark types:
    LongType, IntegerType, ByteType and ShortType.
    """

    @property
    def pretty_name(self) -> str:
        """Return the human-readable name of this group of data types."""
        return 'integrals'

    @staticmethod
    def _validate_divisor(right) -> None:
        """Raise ``TypeError`` when `right` can not be used as the divisor of ``/`` or ``//``."""
        if isinstance(right, str) or (
            isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
        ):
            raise TypeError("division can not be applied on string series or literals.")
        if (
            isinstance(right, IndexOpsMixin)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
            and not isinstance(right, numbers.Number)
        ):
            raise TypeError("division can not be applied to given types.")

    @staticmethod
    def _validate_dividend(right) -> None:
        """Raise ``TypeError`` unless `right` is a plain number usable in ``right / left``."""
        if isinstance(right, str):
            raise TypeError("division can not be applied on string series or literals.")
        if not isinstance(right, numbers.Number):
            raise TypeError("division can not be applied to given types.")

    def __mul__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left * right``.

        A string literal, a timestamp series, a categorical series and any other
        non-numeric series raise ``TypeError``. A string series is accepted: in that
        case the strings are repeated through ``SF.repeat`` with `left` as the count.
        """
        if isinstance(right, str):
            raise TypeError("multiplication can not be applied to a string literal.")
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
            raise TypeError("multiplication can not be applied to date times.")
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
            # Operands are swapped on purpose: the string series is what gets repeated.
            return column_op(SF.repeat)(right, left)
        if (
            isinstance(right, IndexOpsMixin)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
            and not isinstance(right, numbers.Number)
        ):
            raise TypeError("multiplication can not be applied to given types.")
        return column_op(Column.__mul__)(left, right)

    def __truediv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left / right``.

        Raises ``TypeError`` for string, categorical and other non-numeric operands.
        """
        self._validate_divisor(right)

        def truediv(left, right):
            # A non-zero or null divisor uses the plain Spark division; a zero divisor
            # is replaced by ``inf / left``.
            return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise(
                F.lit(np.inf).__div__(left)
            )

        return numpy_column_op(truediv)(left, right)

    def __floordiv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left // right``.

        Raises ``TypeError`` for string, categorical and other non-numeric operands.
        A NaN literal divisor always produces NaN.
        """
        self._validate_divisor(right)

        def floordiv(left, right):
            # Detect a NaN literal by value. An identity test against ``np.nan`` would miss
            # other NaN objects such as ``float("nan")`` or ``np.float64("nan")``.
            right_is_nan = isinstance(right, float) and bool(np.isnan(right))
            return F.when(F.lit(right_is_nan), np.nan).otherwise(
                F.when(
                    F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right))
                ).otherwise(
                    # Zero divisor: fall back to ``inf / left``.
                    F.lit(np.inf).__div__(left)
                )
            )

        return numpy_column_op(floordiv)(left, right)

    def __rtruediv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute the reflected division ``right / left``.

        `right` has to be a plain number, otherwise ``TypeError`` is raised.
        """
        self._validate_dividend(right)

        def rtruediv(left, right):
            # Where the column (the divisor here) is zero, use ``inf / right`` instead.
            return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise(
                F.lit(right).__truediv__(left)
            )

        return numpy_column_op(rtruediv)(left, right)

    def __rfloordiv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute the reflected floor division ``right // left``.

        `right` has to be a plain number, otherwise ``TypeError`` is raised.
        """
        self._validate_dividend(right)

        def rfloordiv(left, right):
            # Where the column (the divisor here) is zero, use ``inf / right`` instead.
            return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
                F.floor(F.lit(right).__div__(left))
            )

        return numpy_column_op(rfloordiv)(left, right)
class FractionalOps(NumericOps):
    """
    The class for binary operations of pandas-on-Spark objects with spark types:
    FloatType, DoubleType and DecimalType.
    """

    @property
    def pretty_name(self) -> str:
        """Return the human-readable name of this group of data types."""
        return 'fractions'

    @staticmethod
    def _validate_divisor(right) -> None:
        """Raise ``TypeError`` when `right` can not be used as the divisor of ``/`` or ``//``."""
        if isinstance(right, str) or (
            isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
        ):
            raise TypeError("division can not be applied on string series or literals.")
        if (
            isinstance(right, IndexOpsMixin)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
            and not isinstance(right, numbers.Number)
        ):
            raise TypeError("division can not be applied to given types.")

    @staticmethod
    def _validate_dividend(right) -> None:
        """Raise ``TypeError`` unless `right` is a plain number usable in ``right / left``."""
        if isinstance(right, str):
            raise TypeError("division can not be applied on string series or literals.")
        if not isinstance(right, numbers.Number):
            raise TypeError("division can not be applied to given types.")

    def __mul__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left * right``.

        A string literal, a timestamp series, a categorical series and any other
        non-numeric series raise ``TypeError``.
        """
        if isinstance(right, str):
            raise TypeError("multiplication can not be applied to a string literal.")
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
            raise TypeError("multiplication can not be applied to date times.")
        if (
            isinstance(right, IndexOpsMixin)
            and (
                isinstance(right.dtype, CategoricalDtype)
                or not isinstance(right.spark.data_type, NumericType)
            )
            and not isinstance(right, numbers.Number)
        ):
            raise TypeError("multiplication can not be applied to given types.")
        return column_op(Column.__mul__)(left, right)

    def __truediv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left / right``.

        Raises ``TypeError`` for string, categorical and other non-numeric operands.
        """
        self._validate_divisor(right)

        def truediv(left, right):
            # A non-zero or null divisor uses the plain Spark division. With a zero divisor,
            # an infinite dividend is kept as it is and anything else becomes ``inf / left``.
            return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise(
                F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
                    F.lit(np.inf).__div__(left)
                )
            )

        return numpy_column_op(truediv)(left, right)

    def __floordiv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute ``left // right``.

        Raises ``TypeError`` for string, categorical and other non-numeric operands.
        A NaN literal divisor always produces NaN.
        """
        self._validate_divisor(right)

        def floordiv(left, right):
            # Detect a NaN literal by value. An identity test against ``np.nan`` would miss
            # other NaN objects such as ``float("nan")`` or ``np.float64("nan")``.
            right_is_nan = isinstance(right, float) and bool(np.isnan(right))
            return F.when(F.lit(right_is_nan), np.nan).otherwise(
                F.when(
                    F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right))
                ).otherwise(
                    # Zero divisor: an infinite dividend is kept, anything else is ``inf / left``.
                    F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
                        F.lit(np.inf).__div__(left)
                    )
                )
            )

        return numpy_column_op(floordiv)(left, right)

    def __rtruediv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute the reflected division ``right / left``.

        `right` has to be a plain number, otherwise ``TypeError`` is raised.
        """
        self._validate_dividend(right)

        def rtruediv(left, right):
            # Where the column (the divisor here) is zero, use ``inf / right`` instead.
            return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise(
                F.lit(right).__truediv__(left)
            )

        return numpy_column_op(rtruediv)(left, right)

    def __rfloordiv__(self, left, right) -> Union["Series", "Index"]:
        """
        Compute the reflected floor division ``right // left``.

        `right` has to be a plain number, otherwise ``TypeError`` is raised.
        """
        self._validate_dividend(right)

        def rfloordiv(left, right):
            # Zero divisor -> ``inf / right``; NaN divisor -> NaN; otherwise the floored quotient.
            return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise(
                F.when(F.lit(left) == np.nan, np.nan).otherwise(F.floor(F.lit(right).__div__(left)))
            )

        return numpy_column_op(rfloordiv)(left, right)

View file

@ -1,104 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import TYPE_CHECKING, Union
from pandas.api.types import CategoricalDtype
from pyspark.sql import functions as F
from pyspark.sql.types import IntegralType, StringType
from pyspark.pandas.base import column_op, IndexOpsMixin
from pyspark.pandas.data_type_ops.base import DataTypeOps
from pyspark.pandas.spark import functions as SF
if TYPE_CHECKING:
from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943)
from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943)
class StringOps(DataTypeOps):
    """
    Binary operations for pandas-on-Spark objects backed by the Spark StringType.

    Only concatenation (``+``) and repetition (``*``) are supported; every other
    arithmetic operator raises ``TypeError``.
    """

    @property
    def pretty_name(self) -> str:
        """Return the human-readable name of this data type."""
        return 'strings'

    def __add__(self, left, right) -> Union["Series", "Index"]:
        """Concatenate `left` with a string series or a string literal."""
        if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
            other = right
        elif isinstance(right, str):
            # A plain Python string is turned into a Spark literal first.
            other = F.lit(right)
        else:
            raise TypeError("string addition can only be applied to string series or literals.")
        return column_op(F.concat)(left, other)

    def __sub__(self, left, right):
        """Always raise: subtraction is not defined for strings."""
        raise TypeError("subtraction can not be applied to string series or literals.")

    def __mul__(self, left, right) -> Union["Series", "Index"]:
        """Repeat the strings of `left` by an int literal or a non-categorical integral series."""
        if isinstance(right, str):
            raise TypeError("multiplication can not be applied to a string literal.")
        is_integral_series = (
            isinstance(right, IndexOpsMixin)
            and isinstance(right.spark.data_type, IntegralType)
            and not isinstance(right.dtype, CategoricalDtype)
        )
        if not (is_integral_series or isinstance(right, int)):
            raise TypeError("a string series can only be multiplied to an int series or literal")
        return column_op(SF.repeat)(left, right)

    def __truediv__(self, left, right):
        """Always raise: division is not defined for strings."""
        raise TypeError("division can not be applied on string series or literals.")

    def __floordiv__(self, left, right):
        """Always raise: floor division is not defined for strings."""
        raise TypeError("division can not be applied on string series or literals.")

    def __mod__(self, left, right):
        """Always raise: modulo is not defined for strings."""
        raise TypeError("modulo can not be applied on string series or literals.")

    def __pow__(self, left, right):
        """Always raise: exponentiation is not defined for strings."""
        raise TypeError("exponentiation can not be applied on string series or literals.")

    def __radd__(self, left, right) -> Union["Series", "Index"]:
        """Prepend the string literal `right` to every value of `left`."""
        if not isinstance(right, str):
            raise TypeError("string addition can only be applied to string series or literals.")
        prefixed = F.concat(F.lit(right), left.spark.column)
        return left._with_new_scol(prefixed)  # TODO: dtype?

    def __rsub__(self, left, right):
        """Always raise: subtraction is not defined for strings."""
        raise TypeError("subtraction can not be applied to string series or literals.")

    def __rmul__(self, left, right) -> Union["Series", "Index"]:
        """Repeat the strings of `left` by the int literal `right`."""
        if not isinstance(right, int):
            raise TypeError("a string series can only be multiplied to an int series or literal")
        return column_op(SF.repeat)(left, right)

    def __rtruediv__(self, left, right):
        """Always raise: division is not defined for strings."""
        raise TypeError("division can not be applied on string series or literals.")

    def __rfloordiv__(self, left, right):
        """Always raise: floor division is not defined for strings."""
        raise TypeError("division can not be applied on string series or literals.")

    def __rpow__(self, left, right):
        """Always raise: exponentiation is not defined for strings."""
        raise TypeError("exponentiation can not be applied on string series or literals.")

    def __rmod__(self, left, right):
        """Always raise: modulo is not defined for strings."""
        raise TypeError("modulo can not be applied on string series or literals.")

View file

@ -1,16 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View file

@ -1,150 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
    """Check that every arithmetic operator raises ``TypeError`` on a boolean series."""

    @property
    def pser(self):
        """The pandas boolean series under test."""
        return pd.Series([True, True, False])

    @property
    def kser(self):
        """The pandas-on-Spark counterpart of `pser`."""
        return ps.from_pandas(self.pser)

    def test_add(self):
        self.assertRaises(TypeError, lambda: self.kser + 1)
        self.assertRaises(TypeError, lambda: self.kser + 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser + kser)

    def test_sub(self):
        self.assertRaises(TypeError, lambda: self.kser - 1)
        self.assertRaises(TypeError, lambda: self.kser - 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser - kser)

    def test_mul(self):
        self.assertRaises(TypeError, lambda: self.kser * 1)
        self.assertRaises(TypeError, lambda: self.kser * 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser * kser)

    def test_truediv(self):
        self.assertRaises(TypeError, lambda: self.kser / 1)
        self.assertRaises(TypeError, lambda: self.kser / 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser / kser)

    def test_floordiv(self):
        self.assertRaises(TypeError, lambda: self.kser // 1)
        self.assertRaises(TypeError, lambda: self.kser // 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser // kser)

    def test_mod(self):
        self.assertRaises(TypeError, lambda: self.kser % 1)
        self.assertRaises(TypeError, lambda: self.kser % 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser % kser)

    def test_pow(self):
        self.assertRaises(TypeError, lambda: self.kser ** 1)
        self.assertRaises(TypeError, lambda: self.kser ** 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for kser in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser ** kser)

    def test_radd(self):
        self.assertRaises(TypeError, lambda: 1 + self.kser)
        self.assertRaises(TypeError, lambda: 0.1 + self.kser)
        self.assertRaises(TypeError, lambda: "x" + self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) + self.kser)

    def test_rsub(self):
        self.assertRaises(TypeError, lambda: 1 - self.kser)
        self.assertRaises(TypeError, lambda: 0.1 - self.kser)
        self.assertRaises(TypeError, lambda: "x" - self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) - self.kser)

    def test_rmul(self):
        self.assertRaises(TypeError, lambda: 1 * self.kser)
        self.assertRaises(TypeError, lambda: 0.1 * self.kser)
        self.assertRaises(TypeError, lambda: "x" * self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) * self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) * self.kser)

    def test_rtruediv(self):
        self.assertRaises(TypeError, lambda: 1 / self.kser)
        self.assertRaises(TypeError, lambda: 0.1 / self.kser)
        self.assertRaises(TypeError, lambda: "x" / self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) / self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) / self.kser)

    def test_rfloordiv(self):
        self.assertRaises(TypeError, lambda: 1 // self.kser)
        self.assertRaises(TypeError, lambda: 0.1 // self.kser)
        # Exercise ``//`` with a string operand; this line previously used ``+`` by mistake
        # and therefore only repeated the check already made in test_radd.
        self.assertRaises(TypeError, lambda: "x" // self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) // self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) // self.kser)

    def test_rpow(self):
        self.assertRaises(TypeError, lambda: 1 ** self.kser)
        self.assertRaises(TypeError, lambda: 0.1 ** self.kser)
        self.assertRaises(TypeError, lambda: "x" ** self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** self.kser)

    def test_rmod(self):
        self.assertRaises(TypeError, lambda: 1 % self.kser)
        self.assertRaises(TypeError, lambda: 0.1 % self.kser)
        self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % self.kser)
        self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) % self.kser)
if __name__ == "__main__":
    # Entry point used when this test module is executed directly as a script.
    import unittest
    from pyspark.pandas.tests.data_type_ops.test_boolean_ops import *  # noqa: F401

    try:
        # Use the XML-reporting runner when the optional xmlrunner package is installed.
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(
            output='target/test-reports',
            verbosity=2,
        )
    except ImportError:
        # No xmlrunner available: let unittest pick its default runner.
        testRunner = None

    unittest.main(verbosity=2, testRunner=testRunner)

View file

@ -1,128 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils):
    """Check that every arithmetic operator raises ``TypeError`` on a categorical series."""

    @property
    def pser(self):
        """The pandas categorical series under test."""
        return pd.Series([1, "x", "y"], dtype="category")

    @property
    def kser(self):
        """The pandas-on-Spark counterpart of `pser`."""
        return ps.from_pandas(self.pser)

    def test_add(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser + operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser + other)

    def test_sub(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser - operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser - other)

    def test_mul(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser * operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser * other)

    def test_truediv(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser / operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser / other)

    def test_floordiv(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser // operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser // other)

    def test_mod(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser % operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser % other)

    def test_pow(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser ** operand)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser ** other)

    def test_radd(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: operand + self.kser)

    def test_rsub(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: operand - self.kser)

    def test_rmul(self):
        for operand in ("x", 2):
            self.assertRaises(TypeError, lambda: operand * self.kser)

    def test_rtruediv(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: operand / self.kser)

    def test_rfloordiv(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: operand // self.kser)

    def test_rmod(self):
        # Only the numeric operand is checked for the reflected modulo.
        self.assertRaises(TypeError, lambda: 1 % self.kser)

    def test_rpow(self):
        for operand in ("x", 1):
            self.assertRaises(TypeError, lambda: operand ** self.kser)
if __name__ == "__main__":
    # Entry point used when this test module is executed directly as a script.
    import unittest
    from pyspark.pandas.tests.data_type_ops.test_categorical_ops import *  # noqa: F401

    try:
        # Use the XML-reporting runner when the optional xmlrunner package is installed.
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(
            output='target/test-reports',
            verbosity=2,
        )
    except ImportError:
        # No xmlrunner available: let unittest pick its default runner.
        testRunner = None

    unittest.main(verbosity=2, testRunner=testRunner)

View file

@ -1,158 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import pandas as pd
from pyspark.sql.types import DateType
from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
    """
    Arithmetic tests for a date series.

    Subtracting a date (literal or series) is compared against pandas; every other
    operator and operand combination is expected to raise ``TypeError``.
    """

    @property
    def pser(self):
        """The pandas series of ``datetime.date`` values under test."""
        dates = [datetime.date(1994, 1, 31), datetime.date(1994, 2, 1), datetime.date(1994, 2, 2)]
        return pd.Series(dates)

    @property
    def kser(self):
        """The pandas-on-Spark counterpart of `pser`."""
        return ps.from_pandas(self.pser)

    @property
    def some_date(self):
        """A fixed date literal used as the scalar operand."""
        return datetime.date(1994, 1, 1)

    def test_add(self):
        self.assertRaises(TypeError, lambda: self.kser + "x")
        self.assertRaises(TypeError, lambda: self.kser + 1)
        self.assertRaises(TypeError, lambda: self.kser + self.some_date)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser + other)

    def test_sub(self):
        self.assertRaises(TypeError, lambda: self.kser - "x")
        self.assertRaises(TypeError, lambda: self.kser - 1)

        # date series - date literal: compared with the day count computed by pandas.
        expected = (self.pser - self.some_date).dt.days
        actual = self.kser - self.some_date
        self.assert_eq(expected, actual)

        with option_context("compute.ops_on_diff_frames", True):
            for other_pser, other_kser in self.pser_kser_pairs:
                if not isinstance(other_kser.spark.data_type, DateType):
                    self.assertRaises(TypeError, lambda: self.kser - other_kser)
                    continue
                # date series - date series is the only supported series combination.
                expected = (self.pser - other_pser).dt.days
                actual = (self.kser - other_kser).sort_index()
                self.assert_eq(expected, actual)

    def test_mul(self):
        self.assertRaises(TypeError, lambda: self.kser * "x")
        self.assertRaises(TypeError, lambda: self.kser * 1)
        self.assertRaises(TypeError, lambda: self.kser * self.some_date)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser * other)

    def test_truediv(self):
        self.assertRaises(TypeError, lambda: self.kser / "x")
        self.assertRaises(TypeError, lambda: self.kser / 1)
        self.assertRaises(TypeError, lambda: self.kser / self.some_date)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser / other)

    def test_floordiv(self):
        self.assertRaises(TypeError, lambda: self.kser // "x")
        self.assertRaises(TypeError, lambda: self.kser // 1)
        self.assertRaises(TypeError, lambda: self.kser // self.some_date)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser // other)

    def test_mod(self):
        self.assertRaises(TypeError, lambda: self.kser % "x")
        self.assertRaises(TypeError, lambda: self.kser % 1)
        self.assertRaises(TypeError, lambda: self.kser % self.some_date)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser % other)

    def test_pow(self):
        self.assertRaises(TypeError, lambda: self.kser ** "x")
        self.assertRaises(TypeError, lambda: self.kser ** 1)
        self.assertRaises(TypeError, lambda: self.kser ** self.some_date)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser ** other)

    def test_radd(self):
        self.assertRaises(TypeError, lambda: "x" + self.kser)
        self.assertRaises(TypeError, lambda: 1 + self.kser)
        self.assertRaises(TypeError, lambda: self.some_date + self.kser)

    def test_rsub(self):
        self.assertRaises(TypeError, lambda: "x" - self.kser)
        self.assertRaises(TypeError, lambda: 1 - self.kser)

        # date literal - date series: compared with the day count computed by pandas.
        expected = (self.some_date - self.pser).dt.days
        actual = self.some_date - self.kser
        self.assert_eq(expected, actual)

    def test_rmul(self):
        self.assertRaises(TypeError, lambda: "x" * self.kser)
        self.assertRaises(TypeError, lambda: 1 * self.kser)
        self.assertRaises(TypeError, lambda: self.some_date * self.kser)

    def test_rtruediv(self):
        self.assertRaises(TypeError, lambda: "x" / self.kser)
        self.assertRaises(TypeError, lambda: 1 / self.kser)
        self.assertRaises(TypeError, lambda: self.some_date / self.kser)

    def test_rfloordiv(self):
        self.assertRaises(TypeError, lambda: "x" // self.kser)
        self.assertRaises(TypeError, lambda: 1 // self.kser)
        self.assertRaises(TypeError, lambda: self.some_date // self.kser)

    def test_rmod(self):
        self.assertRaises(TypeError, lambda: 1 % self.kser)
        self.assertRaises(TypeError, lambda: self.some_date % self.kser)

    def test_rpow(self):
        self.assertRaises(TypeError, lambda: "x" ** self.kser)
        self.assertRaises(TypeError, lambda: 1 ** self.kser)
        self.assertRaises(TypeError, lambda: self.some_date ** self.kser)
if __name__ == "__main__":
    # Entry point used when this test module is executed directly as a script.
    import unittest
    from pyspark.pandas.tests.data_type_ops.test_date_ops import *  # noqa: F401

    try:
        # Use the XML-reporting runner when the optional xmlrunner package is installed.
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(
            output='target/test-reports',
            verbosity=2,
        )
    except ImportError:
        # No xmlrunner available: let unittest pick its default runner.
        testRunner = None

    unittest.main(verbosity=2, testRunner=testRunner)

View file

@ -1,160 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
@property
def pser(self):
return pd.Series(pd.date_range("1994-1-31 10:30:15", periods=3, freq="M"))
@property
def kser(self):
return ps.from_pandas(self.pser)
@property
def some_datetime(self):
return datetime.datetime(1994, 1, 31, 10, 30, 00)
def test_add(self):
self.assertRaises(TypeError, lambda: self.kser + "x")
self.assertRaises(TypeError, lambda: self.kser + 1)
self.assertRaises(TypeError, lambda: self.kser + self.some_datetime)
with option_context("compute.ops_on_diff_frames", True):
for kser in self.ksers:
self.assertRaises(TypeError, lambda: self.kser + kser)
def test_sub(self):
self.assertRaises(TypeError, lambda: self.kser - "x")
self.assertRaises(TypeError, lambda: self.kser - 1)
self.assert_eq(
(self.pser - self.some_datetime).dt.total_seconds().astype("int"),
self.kser - self.some_datetime,
)
with option_context("compute.ops_on_diff_frames", True):
for pser, kser in self.pser_kser_pairs:
if pser.dtype == np.dtype("<M8[ns]"):
self.assert_eq(
(self.pser - pser).dt.total_seconds().astype("int"),
(self.kser - kser).sort_index(),
)
else:
self.assertRaises(TypeError, lambda: self.kser - kser)
def test_mul(self):
self.assertRaises(TypeError, lambda: self.kser * "x")
self.assertRaises(TypeError, lambda: self.kser * 1)
self.assertRaises(TypeError, lambda: self.kser * self.some_datetime)
with option_context("compute.ops_on_diff_frames", True):
for kser in self.ksers:
self.assertRaises(TypeError, lambda: self.kser * kser)
def test_truediv(self):
self.assertRaises(TypeError, lambda: self.kser / "x")
self.assertRaises(TypeError, lambda: self.kser / 1)
self.assertRaises(TypeError, lambda: self.kser / self.some_datetime)
with option_context("compute.ops_on_diff_frames", True):
for kser in self.ksers:
self.assertRaises(TypeError, lambda: self.kser / kser)
def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.kser // "x")
self.assertRaises(TypeError, lambda: self.kser // 1)
self.assertRaises(TypeError, lambda: self.kser // self.some_datetime)
with option_context("compute.ops_on_diff_frames", True):
for kser in self.ksers:
self.assertRaises(TypeError, lambda: self.kser // kser)
def test_mod(self):
self.assertRaises(TypeError, lambda: self.kser % "x")
self.assertRaises(TypeError, lambda: self.kser % 1)
self.assertRaises(TypeError, lambda: self.kser % self.some_datetime)
with option_context("compute.ops_on_diff_frames", True):
for kser in self.ksers:
self.assertRaises(TypeError, lambda: self.kser % kser)
def test_pow(self):
self.assertRaises(TypeError, lambda: self.kser ** "x")
self.assertRaises(TypeError, lambda: self.kser ** 1)
self.assertRaises(TypeError, lambda: self.kser ** self.some_datetime)
with option_context("compute.ops_on_diff_frames", True):
for kser in self.ksers:
self.assertRaises(TypeError, lambda: self.kser ** kser)
def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.kser)
self.assertRaises(TypeError, lambda: 1 + self.kser)
self.assertRaises(TypeError, lambda: self.some_datetime + self.kser)
def test_rsub(self):
self.assertRaises(TypeError, lambda: "x" - self.kser)
self.assertRaises(TypeError, lambda: 1 - self.kser)
self.assert_eq(
(self.some_datetime - self.pser).dt.total_seconds().astype("int"),
self.some_datetime - self.kser,
)
def test_rmul(self):
self.assertRaises(TypeError, lambda: "x" * self.kser)
self.assertRaises(TypeError, lambda: 1 * self.kser)
self.assertRaises(TypeError, lambda: self.some_datetime * self.kser)
def test_rtruediv(self):
self.assertRaises(TypeError, lambda: "x" / self.kser)
self.assertRaises(TypeError, lambda: 1 / self.kser)
self.assertRaises(TypeError, lambda: self.some_datetime / self.kser)
def test_rfloordiv(self):
    # Reflected floor division (self.kser as the divisor) must raise TypeError
    # for each kind of left operand exercised here.
    for literal in ("x", 1):
        self.assertRaises(TypeError, lambda: literal // self.kser)
    self.assertRaises(TypeError, lambda: self.some_datetime // self.kser)
def test_rmod(self):
    # Reflected modulo (self.kser on the right) must raise TypeError.
    # Only an int literal and self.some_datetime are exercised as left operands.
    int_literal = 1
    self.assertRaises(TypeError, lambda: int_literal % self.kser)
    self.assertRaises(TypeError, lambda: self.some_datetime % self.kser)
def test_rpow(self):
    # Reflected exponentiation (self.kser as the exponent) must raise
    # TypeError for each kind of base exercised here.
    for literal in ("x", 1):
        self.assertRaises(TypeError, lambda: literal ** self.kser)
    self.assertRaises(TypeError, lambda: self.some_datetime ** self.kser)
if __name__ == "__main__":
    import unittest
    from pyspark.pandas.tests.data_type_ops.test_datetime_ops import *  # noqa: F401

    # Default to unittest's own runner; switch to xmlrunner's XMLTestRunner
    # (output directed to target/test-reports) only when it can be set up.
    testRunner = None
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # xmlrunner is optional: keep the default runner.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)

View file

@ -1,195 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import numpy as np
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
    """Unit tests for arithmetic operations of numeric data types.

    Each test compares pandas results with pandas-on-Spark results for the
    numeric fixture pairs, and checks that combining a numeric series with an
    unsupported operand raises ``TypeError``.

    A few test cases are disabled because pandas-on-Spark returns float64 whereas pandas
    returns float32.
    The underlying reason is the respective Spark operations return DoubleType always.
    """

    # Keys of ``non_numeric_ksers`` whose series are checked as invalid operands.
    _NON_NUMERIC_KEYS = ("string", "datetime", "date", "categorical", "bool")

    def test_add(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(pser + pser, kser + kser)
            self.assert_eq(pser + 1, kser + 1)
            # self.assert_eq(pser + 0.1, kser + 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                for key in self._NON_NUMERIC_KEYS:
                    self.assertRaises(TypeError, lambda: kser + self.non_numeric_ksers[key])

    def test_sub(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(pser - pser, kser - kser)
            self.assert_eq(pser - 1, kser - 1)
            # self.assert_eq(pser - 0.1, kser - 0.1)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                for key in self._NON_NUMERIC_KEYS:
                    self.assertRaises(TypeError, lambda: kser - self.non_numeric_ksers[key])

    def test_mul(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(pser * pser, kser * kser)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                # Integer series times a string series is the one supported
                # mixed case; every other dtype must reject string operands.
                if kser.dtype in [int, np.int32]:
                    self.assert_eq(
                        (kser * self.non_numeric_ksers["string"]).sort_index(),
                        pser * self.non_numeric_psers["string"],
                    )
                else:
                    self.assertRaises(TypeError, lambda: kser * self.non_numeric_ksers["string"])
                for key in ("datetime", "date", "categorical", "bool"):
                    self.assertRaises(TypeError, lambda: kser * self.non_numeric_ksers[key])

    def test_truediv(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            if kser.dtype in [float, int, np.int32]:
                self.assert_eq(pser / pser, kser / kser)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                for key in self._NON_NUMERIC_KEYS:
                    self.assertRaises(TypeError, lambda: kser / self.non_numeric_ksers[key])

    def test_floordiv(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            if kser.dtype == float:
                self.assert_eq(pser // pser, kser // kser)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                for key in self._NON_NUMERIC_KEYS:
                    self.assertRaises(TypeError, lambda: kser // self.non_numeric_ksers[key])

    def test_mod(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(pser % pser, kser % kser)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                for key in self._NON_NUMERIC_KEYS:
                    self.assertRaises(TypeError, lambda: kser % self.non_numeric_ksers[key])

    def test_pow(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            if kser.dtype == float:
                self.assert_eq(pser ** pser, kser ** kser)

        with option_context("compute.ops_on_diff_frames", True):
            for pser, kser in self.numeric_pser_kser_pairs:
                for key in self._NON_NUMERIC_KEYS:
                    self.assertRaises(TypeError, lambda: kser ** self.non_numeric_ksers[key])

    def test_radd(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(1 + pser, 1 + kser)
            # self.assert_eq(0.1 + pser, 0.1 + kser)
            self.assertRaises(TypeError, lambda: "x" + kser)
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) + kser)

    def test_rsub(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(1 - pser, 1 - kser)
            # self.assert_eq(0.1 - pser, 0.1 - kser)
            self.assertRaises(TypeError, lambda: "x" - kser)
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) - kser)

    def test_rmul(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(1 * pser, 1 * kser)
            # self.assert_eq(0.1 * pser, 0.1 * kser)
            self.assertRaises(TypeError, lambda: "x" * kser)
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) * kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) * kser)

    def test_rtruediv(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            # self.assert_eq(5 / pser, 5 / kser)
            # self.assert_eq(0.1 / pser, 0.1 / kser)
            # Exercise reflected true division with a string literal; this
            # assertion previously used "+" and so only repeated test_radd.
            self.assertRaises(TypeError, lambda: "x" / kser)
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) / kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) / kser)

    def test_rfloordiv(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            # self.assert_eq(5 // pser, 5 // kser)
            # self.assert_eq(0.1 // pser, 0.1 // kser)
            self.assertRaises(TypeError, lambda: "x" // kser)
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) // kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) // kser)

    def test_rpow(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            # self.assert_eq(1 ** pser, 1 ** kser)
            # self.assert_eq(0.1 ** pser, 0.1 ** kser)
            self.assertRaises(TypeError, lambda: "x" ** kser)
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** kser)

    def test_rmod(self):
        for pser, kser in self.numeric_pser_kser_pairs:
            self.assert_eq(1 % pser, 1 % kser)
            # self.assert_eq(0.1 % pser, 0.1 % kser)
            # No string left operand here: str % value is string formatting.
            self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % kser)
            self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) % kser)
if __name__ == "__main__":
    import unittest

    # Re-import this module's own tests (the numeric ops suite). The previous
    # import pointed at test_string_ops, which pulled the string suite into
    # this script's namespace instead.
    from pyspark.pandas.tests.data_type_ops.test_num_ops import *  # noqa: F401

    try:
        # xmlrunner is optional; when importable, use its XMLTestRunner with
        # output directed to target/test-reports.
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # Fall back to unittest's default runner.
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)

View file

@ -1,140 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class StringOpsTest(PandasOnSparkTestCase, TestCasesUtils):
    """Unit tests for arithmetic operations on string series.

    ``pser`` is a small pandas string series and ``kser`` its pandas-on-Spark
    counterpart; supported operations are compared between the two, and
    unsupported ones are expected to raise ``TypeError``.
    """

    @property
    def pser(self):
        return pd.Series(["x", "y", "z"])

    @property
    def kser(self):
        return ps.from_pandas(self.pser)

    def test_add(self):
        # String concatenation with a str literal is supported; an int is not.
        self.assert_eq(self.pser + "x", self.kser + "x")
        self.assertRaises(TypeError, lambda: self.kser + 1)

        with option_context("compute.ops_on_diff_frames", True):
            expected = self.pser + self.non_numeric_psers["string"]
            actual = (self.kser + self.non_numeric_ksers["string"]).sort_index()
            self.assert_eq(expected, actual)

            for key in ("datetime", "date", "categorical", "bool"):
                self.assertRaises(TypeError, lambda: self.kser + self.non_numeric_ksers[key])

            for numeric in self.numeric_ksers:
                self.assertRaises(TypeError, lambda: self.kser + numeric)

    def test_sub(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser - literal)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser - other)

    def test_mul(self):
        self.assertRaises(TypeError, lambda: self.kser * "x")
        self.assert_eq(self.pser * 1, self.kser * 1)

        with option_context("compute.ops_on_diff_frames", True):
            for other_pser, other_kser in self.pser_kser_pairs:
                # Only int64/int32 series are valid multipliers.
                if other_kser.dtype not in [np.int64, np.int32]:
                    self.assertRaises(TypeError, lambda: self.kser * other_kser)
                    continue
                self.assert_eq(self.pser * other_pser, (self.kser * other_kser).sort_index())

    def test_truediv(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser / literal)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser / other)

    def test_floordiv(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser // literal)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser // other)

    def test_mod(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser % literal)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser % other)

    def test_pow(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: self.kser ** literal)

        with option_context("compute.ops_on_diff_frames", True):
            for other in self.ksers:
                self.assertRaises(TypeError, lambda: self.kser ** other)

    def test_radd(self):
        self.assert_eq("x" + self.pser, "x" + self.kser)
        self.assertRaises(TypeError, lambda: 1 + self.kser)

    def test_rsub(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: literal - self.kser)

    def test_rmul(self):
        self.assertRaises(TypeError, lambda: "x" * self.kser)
        self.assert_eq(1 * self.pser, 1 * self.kser)

    def test_rtruediv(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: literal / self.kser)

    def test_rfloordiv(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: literal // self.kser)

    def test_rmod(self):
        # Only an int literal is exercised as the left operand.
        int_literal = 1
        self.assertRaises(TypeError, lambda: int_literal % self.kser)

    def test_rpow(self):
        for literal in ("x", 1):
            self.assertRaises(TypeError, lambda: literal ** self.kser)
if __name__ == "__main__":
    import unittest

    # Re-import this module's own tests (the string ops suite). The previous
    # import pointed at test_num_ops, which pulled the numeric suite into
    # this script's namespace instead.
    from pyspark.pandas.tests.data_type_ops.test_string_ops import *  # noqa: F401

    try:
        # xmlrunner is optional; when importable, use its XMLTestRunner with
        # output directed to target/test-reports.
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # Fall back to unittest's default runner.
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)

View file

@ -1,75 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import decimal
import numpy as np
import pandas as pd
import pyspark.pandas as ps
class TestCasesUtils(object):
    """Shared pandas / pandas-on-Spark series fixtures for data-type arithmetic tests.

    Every property builds its objects afresh on each access.
    """

    @property
    def numeric_psers(self):
        """pandas series of [1, 2, 3] as float32, float, int, int32 and Decimal."""
        typed = [
            pd.Series([1, 2, 3], dtype=dtype) for dtype in (np.float32, float, int, np.int32)
        ]
        decimals = pd.Series([decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)])
        return typed + [decimals]

    @property
    def numeric_ksers(self):
        """pandas-on-Spark counterparts of ``numeric_psers``, in the same order."""
        return list(map(ps.from_pandas, self.numeric_psers))

    @property
    def numeric_pser_kser_pairs(self):
        """Lazy ``zip`` of (pandas, pandas-on-Spark) numeric series pairs."""
        pandas_side = self.numeric_psers
        spark_side = self.numeric_ksers
        return zip(pandas_side, spark_side)

    @property
    def non_numeric_psers(self):
        """Dict of non-numeric pandas series keyed by a short type label."""
        dates = [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]
        return {
            "string": pd.Series(["x", "y", "z"]),
            "datetime": pd.to_datetime(pd.Series([1, 2, 3])),
            "bool": pd.Series([True, True, False]),
            "date": pd.Series(dates),
            "categorical": pd.Series(["a", "b", "a"], dtype="category"),
        }

    @property
    def non_numeric_ksers(self):
        """pandas-on-Spark counterparts of ``non_numeric_psers``, same keys and order."""
        return {label: ps.from_pandas(pser) for label, pser in self.non_numeric_psers.items()}

    @property
    def ksers(self):
        """All pandas-on-Spark fixtures: numeric ones first, then non-numeric."""
        non_numeric = list(self.non_numeric_ksers.values())
        return self.numeric_ksers + non_numeric

    @property
    def psers(self):
        """All pandas fixtures: numeric ones first, then non-numeric."""
        non_numeric = list(self.non_numeric_psers.values())
        return self.numeric_psers + non_numeric

    @property
    def pser_kser_pairs(self):
        """Lazy ``zip`` of (pandas, pandas-on-Spark) pairs over all fixtures."""
        pandas_side = self.psers
        spark_side = self.ksers
        return zip(pandas_side, spark_side)

View file

@ -192,23 +192,21 @@ class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
for kidx, pidx in self.idx_pairs:
py_datetime = pidx.to_pydatetime()
for other in [1, 0.1, kidx, kidx.to_series().reset_index(drop=True), py_datetime]:
expected_err_msg = "Addition can not be applied to datetimes."
expected_err_msg = "addition can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx + other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other + kidx)
expected_err_msg = "Multiplication can not be applied to datetimes."
expected_err_msg = "multiplication can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx * other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other * kidx)
expected_err_msg = "True division can not be applied to datetimes."
expected_err_msg = "division can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx / other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other / kidx)
expected_err_msg = "Floor division can not be applied to datetimes."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx // other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other // kidx)
expected_err_msg = "Modulo can not be applied to datetimes."
expected_err_msg = "modulo can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx % other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % kidx)

View file

@ -2355,7 +2355,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Negative
kdf = ps.DataFrame({"a": ["x"], "b": [1]})
ks_err_msg = "subtraction can not be applied to string series or literals"
ks_err_msg = "substraction can not be applied to string series or literals"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: kdf["a"] - kdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: kdf["b"] - kdf["a"])
@ -2430,12 +2430,12 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: kdf["b"] * "literal")
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" * kdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: kdf["a"] * "literal")
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" * kdf["a"])
ks_err_msg = "a string series can only be multiplied to an int series or literal"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: kdf["a"] * kdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: kdf["a"] * 0.1)
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 0.1 * kdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" * kdf["a"])
def test_sample(self):
pdf = pd.DataFrame({"A": [0, 2, 4]})

View file

@ -84,23 +84,21 @@ class SeriesDateTimeTest(PandasOnSparkTestCase, SQLTestUtils):
datetime_index = ps.Index(self.pd_start_date)
for other in [1, 0.1, kser, datetime_index, py_datetime]:
expected_err_msg = "Addition can not be applied to datetimes."
expected_err_msg = "addition can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser + other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other + kser)
expected_err_msg = "Multiplication can not be applied to datetimes."
expected_err_msg = "multiplication can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser * other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other * kser)
expected_err_msg = "True division can not be applied to datetimes."
expected_err_msg = "division can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser / other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other / kser)
expected_err_msg = "Floor division can not be applied to datetimes."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser // other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other // kser)
expected_err_msg = "Modulo can not be applied to datetimes."
expected_err_msg = "modulo can not be applied to date times."
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser % other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % kser)

View file

@ -70,7 +70,7 @@ class PandasOnSparkTestCase(unittest.TestCase, SQLTestUtils):
def tearDownClass(cls):
# We don't stop Spark session to reuse across all tests.
# The Spark session will be started and stopped at PyTest session level.
# Please see pyspark/pandas/conftest.py.
# Please see databricks/koalas/conftest.py.
pass
def assertPandasEqual(self, left, right, check_exact=True):

View file

@ -221,7 +221,6 @@ try:
'pyspark.sbin',
'pyspark.jars',
'pyspark.pandas',
'pyspark.pandas.data_type_ops',
'pyspark.pandas.indexes',
'pyspark.pandas.missing',
'pyspark.pandas.plot',