diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index c35618e53a..ab65ccd7d3 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -611,12 +611,6 @@ pyspark_pandas = Module( "pyspark.pandas.spark.utils", "pyspark.pandas.typedef.typehints", # unittests - "pyspark.pandas.tests.data_type_ops.test_boolean_ops", - "pyspark.pandas.tests.data_type_ops.test_categorical_ops", - "pyspark.pandas.tests.data_type_ops.test_date_ops", - "pyspark.pandas.tests.data_type_ops.test_datetime_ops", - "pyspark.pandas.tests.data_type_ops.test_num_ops", - "pyspark.pandas.tests.data_type_ops.test_string_ops", "pyspark.pandas.tests.indexes.test_base", "pyspark.pandas.tests.indexes.test_category", "pyspark.pandas.tests.indexes.test_datetime", diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 6cecb7397f..1082052af2 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -19,6 +19,7 @@ Base and utility classes for pandas-on-Spark objects. """ from abc import ABCMeta, abstractmethod +import datetime from functools import wraps, partial from itertools import chain from typing import Any, Callable, Optional, Tuple, Union, cast, TYPE_CHECKING @@ -34,6 +35,7 @@ from pyspark.sql.types import ( DateType, DoubleType, FloatType, + IntegralType, LongType, NumericType, StringType, @@ -48,9 +50,11 @@ from pyspark.pandas.internal import ( NATURAL_ORDER_COLUMN_NAME, SPARK_DEFAULT_INDEX_NAME, ) +from pyspark.pandas.spark import functions as SF from pyspark.pandas.spark.accessors import SparkIndexOpsMethods from pyspark.pandas.typedef import ( Dtype, + as_spark_type, extension_dtypes, pandas_on_spark_type, spark_type_to_pandas_dtype, @@ -318,23 +322,100 @@ class IndexOpsMixin(object, metaclass=ABCMeta): spark_column.__doc__ = SparkIndexOpsMethods.column.__doc__ - @property - def _dtype_op(self): - from pyspark.pandas.data_type_ops.base import DataTypeOps - - return DataTypeOps(self.dtype, self.spark.data_type) - # arithmetic operators __neg__ = column_op(Column.__neg__) def __add__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__add__(self, other) + if not isinstance(self.spark.data_type, StringType) and ( + (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType)) + or isinstance(other, str) + ): + raise TypeError("string addition can only be applied to string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("addition can not be applied to date times.") + + if isinstance(self.spark.data_type, StringType): + # Concatenate string columns + if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType): + return column_op(F.concat)(self, other) + # Handle df['col'] + 'literal' + elif isinstance(other, str): + return column_op(F.concat)(self, F.lit(other)) + else: + raise TypeError("string addition can only be applied to string series or literals.") + else: + return column_op(Column.__add__)(self, other) def __sub__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__sub__(self, other) + if ( + isinstance(self.spark.data_type, StringType) + or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType)) + or isinstance(other, str) + ): + raise TypeError("substraction can not be applied to string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's + # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. + msg = ( + "Note that there is a behavior difference of timestamp subtraction. " + "The timestamp subtraction returns an integer in seconds, " + "whereas pandas returns 'timedelta64[ns]'." + ) + if isinstance(other, IndexOpsMixin) and isinstance( + other.spark.data_type, TimestampType + ): + warnings.warn(msg, UserWarning) + return self.astype("long") - other.astype("long") + elif isinstance(other, datetime.datetime): + warnings.warn(msg, UserWarning) + return self.astype("long") - F.lit(other).cast(as_spark_type("long")) + else: + raise TypeError("datetime subtraction can only be applied to datetime series.") + elif isinstance(self.spark.data_type, DateType): + # Note that date subtraction casts arguments to integer. This is to mimic pandas's + # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction. + msg = ( + "Note that there is a behavior difference of date subtraction. " + "The date subtraction returns an integer in days, " + "whereas pandas returns 'timedelta64[ns]'." + ) + if isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, DateType): + warnings.warn(msg, UserWarning) + return column_op(F.datediff)(self, other).astype("long") + elif isinstance(other, datetime.date) and not isinstance(other, datetime.datetime): + warnings.warn(msg, UserWarning) + return column_op(F.datediff)(self, F.lit(other)).astype("long") + else: + raise TypeError("date subtraction can only be applied to date series.") + return column_op(Column.__sub__)(self, other) def __mul__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__mul__(self, other) + if isinstance(other, str): + raise TypeError("multiplication can not be applied to a string literal.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("multiplication can not be applied to date times.") + + if ( + isinstance(self.spark.data_type, IntegralType) + and isinstance(other, IndexOpsMixin) + and isinstance(other.spark.data_type, StringType) + ): + return column_op(SF.repeat)(other, self) + + if isinstance(self.spark.data_type, StringType): + if ( + isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, IntegralType) + ) or isinstance(other, int): + return column_op(SF.repeat)(self, other) + else: + raise TypeError( + "a string series can only be multiplied to an int series or literal" + ) + + return column_op(Column.__mul__)(self, other) def __truediv__(self, other) -> Union["Series", "Index"]: """ @@ -353,22 +434,122 @@ class IndexOpsMixin(object, metaclass=ABCMeta): | -10 | null | -np.inf | +-----------------------|---------|---------+ """ - return self._dtype_op.__truediv__(self, other) + + if ( + isinstance(self.spark.data_type, StringType) + or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType)) + or isinstance(other, str) + ): + raise TypeError("division can not be applied on string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("division can not be applied to date times.") + + def truediv(left, right): + return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise( + F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise( + F.lit(np.inf).__div__(left) + ) + ) + + return numpy_column_op(truediv)(self, other) def __mod__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__mod__(self, other) + if ( + isinstance(self.spark.data_type, StringType) + or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType)) + or isinstance(other, str) + ): + raise TypeError("modulo can not be applied on string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("modulo can not be applied to date times.") + + def mod(left, right): + return ((left % right) + right) % right + + return column_op(mod)(self, other) def __radd__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__radd__(self, other) + # Handle 'literal' + df['col'] + if not isinstance(self.spark.data_type, StringType) and isinstance(other, str): + raise TypeError("string addition can only be applied to string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("addition can not be applied to date times.") + + if isinstance(self.spark.data_type, StringType): + if isinstance(other, str): + return self._with_new_scol( + F.concat(F.lit(other), self.spark.column) + ) # TODO: dtype? + else: + raise TypeError("string addition can only be applied to string series or literals.") + else: + return column_op(Column.__radd__)(self, other) def __rsub__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__rsub__(self, other) + if isinstance(self.spark.data_type, StringType) or isinstance(other, str): + raise TypeError("substraction can not be applied to string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's + # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. + msg = ( + "Note that there is a behavior difference of timestamp subtraction. " + "The timestamp subtraction returns an integer in seconds, " + "whereas pandas returns 'timedelta64[ns]'." + ) + if isinstance(other, datetime.datetime): + warnings.warn(msg, UserWarning) + return -(self.astype("long") - F.lit(other).cast(as_spark_type("long"))) + else: + raise TypeError("datetime subtraction can only be applied to datetime series.") + elif isinstance(self.spark.data_type, DateType): + # Note that date subtraction casts arguments to integer. This is to mimic pandas's + # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction. + msg = ( + "Note that there is a behavior difference of date subtraction. " + "The date subtraction returns an integer in days, " + "whereas pandas returns 'timedelta64[ns]'." + ) + if isinstance(other, datetime.date) and not isinstance(other, datetime.datetime): + warnings.warn(msg, UserWarning) + return -column_op(F.datediff)(self, F.lit(other)).astype("long") + else: + raise TypeError("date subtraction can only be applied to date series.") + return column_op(Column.__rsub__)(self, other) def __rmul__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__rmul__(self, other) + if isinstance(other, str): + raise TypeError("multiplication can not be applied to a string literal.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("multiplication can not be applied to date times.") + + if isinstance(self.spark.data_type, StringType): + if isinstance(other, int): + return column_op(SF.repeat)(self, other) + else: + raise TypeError( + "a string series can only be multiplied to an int series or literal" + ) + + return column_op(Column.__rmul__)(self, other) def __rtruediv__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__rtruediv__(self, other) + if isinstance(self.spark.data_type, StringType) or isinstance(other, str): + raise TypeError("division can not be applied on string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("division can not be applied to date times.") + + def rtruediv(left, right): + return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise( + F.lit(right).__truediv__(left) + ) + + return numpy_column_op(rtruediv)(self, other) def __floordiv__(self, other) -> Union["Series", "Index"]: """ @@ -387,19 +568,66 @@ class IndexOpsMixin(object, metaclass=ABCMeta): | -10 | null | -np.inf | +-----------------------|---------|---------+ """ - return self._dtype_op.__floordiv__(self, other) + if ( + isinstance(self.spark.data_type, StringType) + or (isinstance(other, IndexOpsMixin) and isinstance(other.spark.data_type, StringType)) + or isinstance(other, str) + ): + raise TypeError("division can not be applied on string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("division can not be applied to date times.") + + def floordiv(left, right): + return F.when(F.lit(right is np.nan), np.nan).otherwise( + F.when( + F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right)) + ).otherwise( + F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise( + F.lit(np.inf).__div__(left) + ) + ) + ) + + return numpy_column_op(floordiv)(self, other) def __rfloordiv__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__rfloordiv__(self, other) + if isinstance(self.spark.data_type, StringType) or isinstance(other, str): + raise TypeError("division can not be applied on string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("division can not be applied to date times.") + + def rfloordiv(left, right): + return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise( + F.when(F.lit(left) == np.nan, np.nan).otherwise(F.floor(F.lit(right).__div__(left))) + ) + + return numpy_column_op(rfloordiv)(self, other) def __rmod__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__rmod__(self, other) + if isinstance(self.spark.data_type, StringType) or isinstance(other, str): + raise TypeError("modulo can not be applied on string series or literals.") + + if isinstance(self.spark.data_type, TimestampType): + raise TypeError("modulo can not be applied to date times.") + + def rmod(left, right): + return ((right % left) + left) % left + + return column_op(rmod)(self, other) def __pow__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__pow__(self, other) + def pow_func(left, right): + return F.when(left == 1, left).otherwise(Column.__pow__(left, right)) + + return column_op(pow_func)(self, other) def __rpow__(self, other) -> Union["Series", "Index"]: - return self._dtype_op.__rpow__(self, other) + def rpow_func(left, right): + return F.when(F.lit(right == 1), right).otherwise(Column.__rpow__(left, right)) + + return column_op(rpow_func)(self, other) __abs__ = column_op(F.abs) diff --git a/python/pyspark/pandas/data_type_ops/__init__.py b/python/pyspark/pandas/data_type_ops/__init__.py deleted file mode 100644 index cce3acad34..0000000000 --- a/python/pyspark/pandas/data_type_ops/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py deleted file mode 100644 index 4f92a2e09e..0000000000 --- a/python/pyspark/pandas/data_type_ops/base.py +++ /dev/null @@ -1,120 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from abc import ABCMeta, abstractmethod -from typing import TYPE_CHECKING, Union - -from pandas.api.types import CategoricalDtype - -from pyspark.sql.types import ( - BooleanType, - DataType, - DateType, - FractionalType, - IntegralType, - StringType, - TimestampType, -) - -from pyspark.pandas.typedef import Dtype - -if TYPE_CHECKING: - from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943) - from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) - - -class DataTypeOps(object, metaclass=ABCMeta): - """The base class for binary operations of pandas-on-Spark objects (of different data types).""" - - def __new__(cls, dtype: Dtype, spark_type: DataType): - from pyspark.pandas.data_type_ops.boolean_ops import BooleanOps - from pyspark.pandas.data_type_ops.categorical_ops import CategoricalOps - from pyspark.pandas.data_type_ops.date_ops import DateOps - from pyspark.pandas.data_type_ops.datetime_ops import DatetimeOps - from pyspark.pandas.data_type_ops.num_ops import ( - IntegralOps, - FractionalOps, - ) - from pyspark.pandas.data_type_ops.string_ops import StringOps - - if isinstance(dtype, CategoricalDtype): - return object.__new__(CategoricalOps) - elif isinstance(spark_type, FractionalType): - return object.__new__(FractionalOps) - elif isinstance(spark_type, IntegralType): - return object.__new__(IntegralOps) - elif isinstance(spark_type, StringType): - return object.__new__(StringOps) - elif isinstance(spark_type, BooleanType): - return object.__new__(BooleanOps) - elif isinstance(spark_type, TimestampType): - return object.__new__(DatetimeOps) - elif isinstance(spark_type, DateType): - return object.__new__(DateOps) - else: - raise TypeError("Type %s was not understood." % dtype) - - def __init__(self, dtype: Dtype, spark_type: DataType): - self.dtype = dtype - self.spark_type = spark_type - - @property - @abstractmethod - def pretty_name(self) -> str: - raise NotImplementedError() - - def __add__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Addition can not be applied to %s." % self.pretty_name) - - def __sub__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Subtraction can not be applied to %s." % self.pretty_name) - - def __mul__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Multiplication can not be applied to %s." % self.pretty_name) - - def __truediv__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("True division can not be applied to %s." % self.pretty_name) - - def __floordiv__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Floor division can not be applied to %s." % self.pretty_name) - - def __mod__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Modulo can not be applied to %s." % self.pretty_name) - - def __pow__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Exponentiation can not be applied to %s." % self.pretty_name) - - def __radd__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Addition can not be applied to %s." % self.pretty_name) - - def __rsub__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Subtraction can not be applied to %s." % self.pretty_name) - - def __rmul__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Multiplication can not be applied to %s." % self.pretty_name) - - def __rtruediv__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("True division can not be applied to %s." % self.pretty_name) - - def __rfloordiv__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Floor division can not be applied to %s." % self.pretty_name) - - def __rmod__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Modulo can not be applied to %s." % self.pretty_name) - - def __rpow__(self, left, right) -> Union["Series", "Index"]: - raise TypeError("Exponentiation can not be applied to %s." % self.pretty_name) diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py deleted file mode 100644 index 4a72123a0a..0000000000 --- a/python/pyspark/pandas/data_type_ops/boolean_ops.py +++ /dev/null @@ -1,28 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark.pandas.data_type_ops.base import DataTypeOps - - -class BooleanOps(DataTypeOps): - """ - The class for binary operations of pandas-on-Spark objects with spark type: BooleanType. - """ - - @property - def pretty_name(self) -> str: - return 'booleans' diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py deleted file mode 100644 index 9c5786820e..0000000000 --- a/python/pyspark/pandas/data_type_ops/categorical_ops.py +++ /dev/null @@ -1,28 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark.pandas.data_type_ops.base import DataTypeOps - - -class CategoricalOps(DataTypeOps): - """ - The class for binary operations of pandas-on-Spark objects with categorical types. - """ - - @property - def pretty_name(self) -> str: - return 'categoricals' diff --git a/python/pyspark/pandas/data_type_ops/date_ops.py b/python/pyspark/pandas/data_type_ops/date_ops.py deleted file mode 100644 index 501280c1a4..0000000000 --- a/python/pyspark/pandas/data_type_ops/date_ops.py +++ /dev/null @@ -1,71 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import datetime -import warnings -from typing import TYPE_CHECKING, Union - -from pyspark.sql import functions as F -from pyspark.sql.types import DateType - -from pyspark.pandas.base import column_op, IndexOpsMixin -from pyspark.pandas.data_type_ops.base import DataTypeOps - -if TYPE_CHECKING: - from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943) - from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) - - -class DateOps(DataTypeOps): - """ - The class for binary operations of pandas-on-Spark objects with spark type: DateType. - """ - - @property - def pretty_name(self) -> str: - return 'dates' - - def __sub__(self, left, right) -> Union["Series", "Index"]: - # Note that date subtraction casts arguments to integer. This is to mimic pandas's - # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction. - msg = ( - "Note that there is a behavior difference of date subtraction. " - "The date subtraction returns an integer in days, " - "whereas pandas returns 'timedelta64[ns]'." - ) - if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, DateType): - warnings.warn(msg, UserWarning) - return column_op(F.datediff)(left, right).astype("long") - elif isinstance(right, datetime.date) and not isinstance(right, datetime.datetime): - warnings.warn(msg, UserWarning) - return column_op(F.datediff)(left, F.lit(right)).astype("long") - else: - raise TypeError("date subtraction can only be applied to date series.") - - def __rsub__(self, left, right) -> Union["Series", "Index"]: - # Note that date subtraction casts arguments to integer. This is to mimic pandas's - # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction. - msg = ( - "Note that there is a behavior difference of date subtraction. " - "The date subtraction returns an integer in days, " - "whereas pandas returns 'timedelta64[ns]'." - ) - if isinstance(right, datetime.date) and not isinstance(right, datetime.datetime): - warnings.warn(msg, UserWarning) - return -column_op(F.datediff)(left, F.lit(right)).astype("long") - else: - raise TypeError("date subtraction can only be applied to date series.") diff --git a/python/pyspark/pandas/data_type_ops/datetime_ops.py b/python/pyspark/pandas/data_type_ops/datetime_ops.py deleted file mode 100644 index 0a57d3f02f..0000000000 --- a/python/pyspark/pandas/data_type_ops/datetime_ops.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import datetime -import warnings -from typing import TYPE_CHECKING, Union - -from pyspark.sql import functions as F -from pyspark.sql.types import TimestampType - -from pyspark.pandas.base import IndexOpsMixin -from pyspark.pandas.data_type_ops.base import DataTypeOps -from pyspark.pandas.typedef import as_spark_type - -if TYPE_CHECKING: - from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943) - from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) - - -class DatetimeOps(DataTypeOps): - """ - The class for binary operations of pandas-on-Spark objects with spark type: TimestampType. - """ - - @property - def pretty_name(self) -> str: - return 'datetimes' - - def __sub__(self, left, right) -> Union["Series", "Index"]: - # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's - # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. - msg = ( - "Note that there is a behavior difference of timestamp subtraction. " - "The timestamp subtraction returns an integer in seconds, " - "whereas pandas returns 'timedelta64[ns]'." - ) - if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType): - warnings.warn(msg, UserWarning) - return left.astype("long") - right.astype("long") - elif isinstance(right, datetime.datetime): - warnings.warn(msg, UserWarning) - return left.astype("long") - F.lit(right).cast(as_spark_type("long")) - else: - raise TypeError("datetime subtraction can only be applied to datetime series.") - - def __rsub__(self, left, right) -> Union["Series", "Index"]: - # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's - # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction. - msg = ( - "Note that there is a behavior difference of timestamp subtraction. " - "The timestamp subtraction returns an integer in seconds, " - "whereas pandas returns 'timedelta64[ns]'." - ) - if isinstance(right, datetime.datetime): - warnings.warn(msg, UserWarning) - return -(left.astype("long") - F.lit(right).cast(as_spark_type("long"))) - else: - raise TypeError("datetime subtraction can only be applied to datetime series.") diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py deleted file mode 100644 index e6b6d96c00..0000000000 --- a/python/pyspark/pandas/data_type_ops/num_ops.py +++ /dev/null @@ -1,378 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numbers -from typing import TYPE_CHECKING, Union - -import numpy as np -from pandas.api.types import CategoricalDtype - -from pyspark.sql import Column, functions as F -from pyspark.sql.types import ( - NumericType, - StringType, - TimestampType, -) - -from pyspark.pandas.base import column_op, IndexOpsMixin, numpy_column_op -from pyspark.pandas.data_type_ops.base import DataTypeOps -from pyspark.pandas.spark import functions as SF - -if TYPE_CHECKING: - from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943) - from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) - - -class NumericOps(DataTypeOps): - """ - The class for binary operations of numeric pandas-on-Spark objects. - """ - - @property - def pretty_name(self) -> str: - return 'numerics' - - def __add__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("string addition can only be applied to string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("addition can not be applied to given types.") - - return column_op(Column.__add__)(left, right) - - def __sub__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("subtraction can not be applied to string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("subtraction can not be applied to given types.") - - return column_op(Column.__sub__)(left, right) - - def __mod__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("modulo can not be applied on string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("modulo can not be applied to given types.") - - def mod(left, right): - return ((left % right) + right) % right - - return column_op(mod)(left, right) - - def __pow__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("exponentiation can not be applied on string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("exponentiation can not be applied to given types.") - - def pow_func(left, right): - return F.when(left == 1, left).otherwise(Column.__pow__(left, right)) - - return column_op(pow_func)(left, right) - - def __radd__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("string addition can only be applied to string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("addition can not be applied to given types.") - - return column_op(Column.__radd__)(left, right) - - def __rsub__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("subtraction can not be applied to string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("subtraction can not be applied to given types.") - return column_op(Column.__rsub__)(left, right) - - def __rmul__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("multiplication can not be applied to a string literal.") - if not isinstance(right, numbers.Number): - raise TypeError("multiplication can not be applied to given types.") - return column_op(Column.__rmul__)(left, right) - - def __rpow__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("exponentiation can not be applied on string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("exponentiation can not be applied to given types.") - - def rpow_func(left, right): - return F.when(F.lit(right == 1), right).otherwise(Column.__rpow__(left, right)) - - return column_op(rpow_func)(left, right) - - def __rmod__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("modulo can not be applied on string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("modulo can not be applied to given types.") - - def rmod(left, right): - return ((right % left) + left) % left - - return column_op(rmod)(left, right) - - -class IntegralOps(NumericOps): - """ - The class for binary operations of pandas-on-Spark objects with spark types: - LongType, IntegerType, ByteType and ShortType. - """ - - @property - def pretty_name(self) -> str: - return 'integrals' - - def __mul__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("multiplication can not be applied to a string literal.") - - if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType): - raise TypeError("multiplication can not be applied to date times.") - - if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType): - return column_op(SF.repeat)(right, left) - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or not isinstance(right.spark.data_type, NumericType) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("multiplication can not be applied to given types.") - - return column_op(Column.__mul__)(left, right) - - def __truediv__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def truediv(left, right): - return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise( - F.lit(np.inf).__div__(left) - ) - - return numpy_column_op(truediv)(left, right) - - def __floordiv__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def floordiv(left, right): - return F.when(F.lit(right is np.nan), np.nan).otherwise( - F.when( - F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right)) - ).otherwise( - F.lit(np.inf).__div__(left) - ) - ) - - return numpy_column_op(floordiv)(left, right) - - def __rtruediv__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def rtruediv(left, right): - return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise( - F.lit(right).__truediv__(left) - ) - - return numpy_column_op(rtruediv)(left, right) - - def __rfloordiv__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def rfloordiv(left, right): - return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise( - F.floor(F.lit(right).__div__(left)) - ) - - return numpy_column_op(rfloordiv)(left, right) - - -class FractionalOps(NumericOps): - """ - The class for binary operations of pandas-on-Spark objects with spark types: - FloatType, DoubleType and DecimalType. - """ - - @property - def pretty_name(self) -> str: - return 'fractions' - - def __mul__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("multiplication can not be applied to a string literal.") - - if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType): - raise TypeError("multiplication can not be applied to date times.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or not isinstance(right.spark.data_type, NumericType) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("multiplication can not be applied to given types.") - - return column_op(Column.__mul__)(left, right) - - def __truediv__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def truediv(left, right): - return F.when(F.lit(right != 0) | F.lit(right).isNull(), left.__div__(right)).otherwise( - F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise( - F.lit(np.inf).__div__(left) - ) - ) - - return numpy_column_op(truediv)(left, right) - - def __floordiv__(self, left, right) -> Union["Series", "Index"]: - if ( - isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType) - ) or isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - - if ( - isinstance(right, IndexOpsMixin) - and ( - isinstance(right.dtype, CategoricalDtype) - or (not isinstance(right.spark.data_type, NumericType)) - ) - ) and not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def floordiv(left, right): - return F.when(F.lit(right is np.nan), np.nan).otherwise( - F.when( - F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right)) - ).otherwise( - F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise( - F.lit(np.inf).__div__(left) - ) - ) - ) - - return numpy_column_op(floordiv)(left, right) - - def __rtruediv__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def rtruediv(left, right): - return F.when(left == 0, F.lit(np.inf).__div__(right)).otherwise( - F.lit(right).__truediv__(left) - ) - - return numpy_column_op(rtruediv)(left, right) - - def __rfloordiv__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("division can not be applied on string series or literals.") - if not isinstance(right, numbers.Number): - raise TypeError("division can not be applied to given types.") - - def rfloordiv(left, right): - return F.when(F.lit(left == 0), F.lit(np.inf).__div__(right)).otherwise( - F.when(F.lit(left) == np.nan, np.nan).otherwise(F.floor(F.lit(right).__div__(left))) - ) - - return numpy_column_op(rfloordiv)(left, right) diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py deleted file mode 100644 index f97e504d10..0000000000 --- a/python/pyspark/pandas/data_type_ops/string_ops.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing import TYPE_CHECKING, Union - -from pandas.api.types import CategoricalDtype - -from pyspark.sql import functions as F -from pyspark.sql.types import IntegralType, StringType - -from pyspark.pandas.base import column_op, IndexOpsMixin -from pyspark.pandas.data_type_ops.base import DataTypeOps -from pyspark.pandas.spark import functions as SF - -if TYPE_CHECKING: - from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943) - from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) - - -class StringOps(DataTypeOps): - """ - The class for binary operations of pandas-on-Spark objects with spark type: StringType. - """ - - @property - def pretty_name(self) -> str: - return 'strings' - - def __add__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType): - return column_op(F.concat)(left, right) - elif isinstance(right, str): - return column_op(F.concat)(left, F.lit(right)) - else: - raise TypeError("string addition can only be applied to string series or literals.") - - def __sub__(self, left, right): - raise TypeError("subtraction can not be applied to string series or literals.") - - def __mul__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - raise TypeError("multiplication can not be applied to a string literal.") - - if ( - isinstance(right, IndexOpsMixin) - and isinstance(right.spark.data_type, IntegralType) - and not isinstance(right.dtype, CategoricalDtype) - ) or isinstance(right, int): - return column_op(SF.repeat)(left, right) - else: - raise TypeError("a string series can only be multiplied to an int series or literal") - - def __truediv__(self, left, right): - raise TypeError("division can not be applied on string series or literals.") - - def __floordiv__(self, left, right): - raise TypeError("division can not be applied on string series or literals.") - - def __mod__(self, left, right): - raise TypeError("modulo can not be applied on string series or literals.") - - def __pow__(self, left, right): - raise TypeError("exponentiation can not be applied on string series or literals.") - - def __radd__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, str): - return left._with_new_scol(F.concat(F.lit(right), left.spark.column)) # TODO: dtype? - else: - raise TypeError("string addition can only be applied to string series or literals.") - - def __rsub__(self, left, right): - raise TypeError("subtraction can not be applied to string series or literals.") - - def __rmul__(self, left, right) -> Union["Series", "Index"]: - if isinstance(right, int): - return column_op(SF.repeat)(left, right) - else: - raise TypeError("a string series can only be multiplied to an int series or literal") - - def __rtruediv__(self, left, right): - raise TypeError("division can not be applied on string series or literals.") - - def __rfloordiv__(self, left, right): - raise TypeError("division can not be applied on string series or literals.") - - def __rpow__(self, left, right): - raise TypeError("exponentiation can not be applied on string series or literals.") - - def __rmod__(self, left, right): - raise TypeError("modulo can not be applied on string series or literals.") diff --git a/python/pyspark/pandas/tests/data_type_ops/__init__.py b/python/pyspark/pandas/tests/data_type_ops/__init__.py deleted file mode 100644 index cce3acad34..0000000000 --- a/python/pyspark/pandas/tests/data_type_ops/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py deleted file mode 100644 index 8689ecbd27..0000000000 --- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +++ /dev/null @@ -1,150 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import datetime -import pandas as pd - -from pyspark import pandas as ps -from pyspark.pandas.config import option_context -from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils -from pyspark.testing.pandasutils import PandasOnSparkTestCase - - -class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils): - @property - def pser(self): - return pd.Series([True, True, False]) - - @property - def kser(self): - return ps.from_pandas(self.pser) - - def test_add(self): - self.assertRaises(TypeError, lambda: self.kser + 1) - self.assertRaises(TypeError, lambda: self.kser + 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser + kser) - - def test_sub(self): - self.assertRaises(TypeError, lambda: self.kser - 1) - self.assertRaises(TypeError, lambda: self.kser - 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser - kser) - - def test_mul(self): - self.assertRaises(TypeError, lambda: self.kser * 1) - self.assertRaises(TypeError, lambda: self.kser * 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser * kser) - - def test_truediv(self): - self.assertRaises(TypeError, lambda: self.kser / 1) - self.assertRaises(TypeError, lambda: self.kser / 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser / kser) - - def test_floordiv(self): - self.assertRaises(TypeError, lambda: self.kser // 1) - self.assertRaises(TypeError, lambda: self.kser // 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser // kser) - - def test_mod(self): - self.assertRaises(TypeError, lambda: self.kser % 1) - self.assertRaises(TypeError, lambda: self.kser % 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser % kser) - - def test_pow(self): - self.assertRaises(TypeError, lambda: self.kser ** 1) - self.assertRaises(TypeError, lambda: self.kser ** 0.1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser ** kser) - - def test_radd(self): - self.assertRaises(TypeError, lambda: 1 + self.kser) - self.assertRaises(TypeError, lambda: 0.1 + self.kser) - self.assertRaises(TypeError, lambda: "x" + self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) + self.kser) - - def test_rsub(self): - self.assertRaises(TypeError, lambda: 1 - self.kser) - self.assertRaises(TypeError, lambda: 0.1 - self.kser) - self.assertRaises(TypeError, lambda: "x" - self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) - self.kser) - - def test_rmul(self): - self.assertRaises(TypeError, lambda: 1 * self.kser) - self.assertRaises(TypeError, lambda: 0.1 * self.kser) - self.assertRaises(TypeError, lambda: "x" * self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) * self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) * self.kser) - - def test_rtruediv(self): - self.assertRaises(TypeError, lambda: 1 / self.kser) - self.assertRaises(TypeError, lambda: 0.1 / self.kser) - self.assertRaises(TypeError, lambda: "x" / self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) / self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) / self.kser) - - def test_rfloordiv(self): - self.assertRaises(TypeError, lambda: 1 // self.kser) - self.assertRaises(TypeError, lambda: 0.1 // self.kser) - self.assertRaises(TypeError, lambda: "x" + self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) // self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) // self.kser) - - def test_rpow(self): - self.assertRaises(TypeError, lambda: 1 ** self.kser) - self.assertRaises(TypeError, lambda: 0.1 ** self.kser) - self.assertRaises(TypeError, lambda: "x" ** self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** self.kser) - - def test_rmod(self): - self.assertRaises(TypeError, lambda: 1 % self.kser) - self.assertRaises(TypeError, lambda: 0.1 % self.kser) - self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % self.kser) - self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) % self.kser) - - -if __name__ == "__main__": - import unittest - from pyspark.pandas.tests.data_type_ops.test_boolean_ops import * # noqa: F401 - - try: - import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) - except ImportError: - testRunner = None - unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py deleted file mode 100644 index ea61c978ff..0000000000 --- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py +++ /dev/null @@ -1,128 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import pandas as pd - -from pyspark import pandas as ps -from pyspark.pandas.config import option_context -from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils -from pyspark.testing.pandasutils import PandasOnSparkTestCase - - -class CategoricalOpsTest(PandasOnSparkTestCase, TestCasesUtils): - @property - def pser(self): - return pd.Series([1, "x", "y"], dtype="category") - - @property - def kser(self): - return ps.from_pandas(self.pser) - - def test_add(self): - self.assertRaises(TypeError, lambda: self.kser + "x") - self.assertRaises(TypeError, lambda: self.kser + 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser + kser) - - def test_sub(self): - self.assertRaises(TypeError, lambda: self.kser - "x") - self.assertRaises(TypeError, lambda: self.kser - 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser - kser) - - def test_mul(self): - self.assertRaises(TypeError, lambda: self.kser * "x") - self.assertRaises(TypeError, lambda: self.kser * 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser * kser) - - def test_truediv(self): - self.assertRaises(TypeError, lambda: self.kser / "x") - self.assertRaises(TypeError, lambda: self.kser / 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser / kser) - - def test_floordiv(self): - self.assertRaises(TypeError, lambda: self.kser // "x") - self.assertRaises(TypeError, lambda: self.kser // 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser // kser) - - def test_mod(self): - self.assertRaises(TypeError, lambda: self.kser % "x") - self.assertRaises(TypeError, lambda: self.kser % 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser % kser) - - def test_pow(self): - self.assertRaises(TypeError, lambda: self.kser ** "x") - self.assertRaises(TypeError, lambda: self.kser ** 1) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser ** kser) - - def test_radd(self): - self.assertRaises(TypeError, lambda: "x" + self.kser) - self.assertRaises(TypeError, lambda: 1 + self.kser) - - def test_rsub(self): - self.assertRaises(TypeError, lambda: "x" - self.kser) - self.assertRaises(TypeError, lambda: 1 - self.kser) - - def test_rmul(self): - self.assertRaises(TypeError, lambda: "x" * self.kser) - self.assertRaises(TypeError, lambda: 2 * self.kser) - - def test_rtruediv(self): - self.assertRaises(TypeError, lambda: "x" / self.kser) - self.assertRaises(TypeError, lambda: 1 / self.kser) - - def test_rfloordiv(self): - self.assertRaises(TypeError, lambda: "x" // self.kser) - self.assertRaises(TypeError, lambda: 1 // self.kser) - - def test_rmod(self): - self.assertRaises(TypeError, lambda: 1 % self.kser) - - def test_rpow(self): - self.assertRaises(TypeError, lambda: "x" ** self.kser) - self.assertRaises(TypeError, lambda: 1 ** self.kser) - - -if __name__ == "__main__": - import unittest - from pyspark.pandas.tests.data_type_ops.test_categorical_ops import * # noqa: F401 - - try: - import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) - except ImportError: - testRunner = None - unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py deleted file mode 100644 index 8674355758..0000000000 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ /dev/null @@ -1,158 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import datetime - -import pandas as pd - -from pyspark.sql.types import DateType - -from pyspark import pandas as ps -from pyspark.pandas.config import option_context -from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils -from pyspark.testing.pandasutils import PandasOnSparkTestCase - - -class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): - @property - def pser(self): - return pd.Series( - [datetime.date(1994, 1, 31), datetime.date(1994, 2, 1), datetime.date(1994, 2, 2)] - ) - - @property - def kser(self): - return ps.from_pandas(self.pser) - - @property - def some_date(self): - return datetime.date(1994, 1, 1) - - def test_add(self): - self.assertRaises(TypeError, lambda: self.kser + "x") - self.assertRaises(TypeError, lambda: self.kser + 1) - self.assertRaises(TypeError, lambda: self.kser + self.some_date) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser + kser) - - def test_sub(self): - self.assertRaises(TypeError, lambda: self.kser - "x") - self.assertRaises(TypeError, lambda: self.kser - 1) - self.assert_eq( - (self.pser - self.some_date).dt.days, self.kser - self.some_date, - ) - with option_context("compute.ops_on_diff_frames", True): - for pser, kser in self.pser_kser_pairs: - if isinstance(kser.spark.data_type, DateType): - self.assert_eq((self.pser - pser).dt.days, (self.kser - kser).sort_index()) - else: - self.assertRaises(TypeError, lambda: self.kser - kser) - - def test_mul(self): - self.assertRaises(TypeError, lambda: self.kser * "x") - self.assertRaises(TypeError, lambda: self.kser * 1) - self.assertRaises(TypeError, lambda: self.kser * self.some_date) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser * kser) - - def test_truediv(self): - self.assertRaises(TypeError, lambda: self.kser / "x") - self.assertRaises(TypeError, lambda: self.kser / 1) - self.assertRaises(TypeError, lambda: self.kser / self.some_date) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser / kser) - - def test_floordiv(self): - self.assertRaises(TypeError, lambda: self.kser // "x") - self.assertRaises(TypeError, lambda: self.kser // 1) - self.assertRaises(TypeError, lambda: self.kser // self.some_date) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser // kser) - - def test_mod(self): - self.assertRaises(TypeError, lambda: self.kser % "x") - self.assertRaises(TypeError, lambda: self.kser % 1) - self.assertRaises(TypeError, lambda: self.kser % self.some_date) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser % kser) - - def test_pow(self): - self.assertRaises(TypeError, lambda: self.kser ** "x") - self.assertRaises(TypeError, lambda: self.kser ** 1) - self.assertRaises(TypeError, lambda: self.kser ** self.some_date) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser ** kser) - - def test_radd(self): - self.assertRaises(TypeError, lambda: "x" + self.kser) - self.assertRaises(TypeError, lambda: 1 + self.kser) - self.assertRaises(TypeError, lambda: self.some_date + self.kser) - - def test_rsub(self): - self.assertRaises(TypeError, lambda: "x" - self.kser) - self.assertRaises(TypeError, lambda: 1 - self.kser) - self.assert_eq( - (self.some_date - self.pser).dt.days, self.some_date - self.kser, - ) - - def test_rmul(self): - self.assertRaises(TypeError, lambda: "x" * self.kser) - self.assertRaises(TypeError, lambda: 1 * self.kser) - self.assertRaises(TypeError, lambda: self.some_date * self.kser) - - def test_rtruediv(self): - self.assertRaises(TypeError, lambda: "x" / self.kser) - self.assertRaises(TypeError, lambda: 1 / self.kser) - self.assertRaises(TypeError, lambda: self.some_date / self.kser) - - def test_rfloordiv(self): - self.assertRaises(TypeError, lambda: "x" // self.kser) - self.assertRaises(TypeError, lambda: 1 // self.kser) - self.assertRaises(TypeError, lambda: self.some_date // self.kser) - - def test_rmod(self): - self.assertRaises(TypeError, lambda: 1 % self.kser) - self.assertRaises(TypeError, lambda: self.some_date % self.kser) - - def test_rpow(self): - self.assertRaises(TypeError, lambda: "x" ** self.kser) - self.assertRaises(TypeError, lambda: 1 ** self.kser) - self.assertRaises(TypeError, lambda: self.some_date ** self.kser) - - -if __name__ == "__main__": - import unittest - from pyspark.pandas.tests.data_type_ops.test_date_ops import * # noqa: F401 - - try: - import xmlrunner # type: ignore[import] - testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) - except ImportError: - testRunner = None - unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py deleted file mode 100644 index c8076e4384..0000000000 --- a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +++ /dev/null @@ -1,160 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import datetime - -import numpy as np -import pandas as pd - -from pyspark import pandas as ps -from pyspark.pandas.config import option_context -from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils -from pyspark.testing.pandasutils import PandasOnSparkTestCase - - -class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): - @property - def pser(self): - return pd.Series(pd.date_range("1994-1-31 10:30:15", periods=3, freq="M")) - - @property - def kser(self): - return ps.from_pandas(self.pser) - - @property - def some_datetime(self): - return datetime.datetime(1994, 1, 31, 10, 30, 00) - - def test_add(self): - self.assertRaises(TypeError, lambda: self.kser + "x") - self.assertRaises(TypeError, lambda: self.kser + 1) - self.assertRaises(TypeError, lambda: self.kser + self.some_datetime) - - with option_context("compute.ops_on_diff_frames", True): - for kser in self.ksers: - self.assertRaises(TypeError, lambda: self.kser + kser) - - def test_sub(self): - self.assertRaises(TypeError, lambda: self.kser - "x") - self.assertRaises(TypeError, lambda: self.kser - 1) - self.assert_eq( - (self.pser - self.some_datetime).dt.total_seconds().astype("int"), - self.kser - self.some_datetime, - ) - with option_context("compute.ops_on_diff_frames", True): - for pser, kser in self.pser_kser_pairs: - if pser.dtype == np.dtype("