[SPARK-35340][PYTHON] Standardize TypeError messages for unsupported basic operations

### What changes were proposed in this pull request?
This PR proposes to standardize TypeError messages for unsupported basic operations by:
- Capitalizing the first letter
- Leveraging TypeError messages defined in `pyspark/pandas/data_type_ops/base.py`
- Taking advantage of the utility `is_valid_operand_for_numeric_arithmetic` to avoid duplicating TypeError messages

Related unit tests are adjusted accordingly.

### Why are the changes needed?
Inconsistent TypeError messages are shown for unsupported data-type-based basic operations.

For example, addition currently raises either of the following TypeError messages:
- addition can not be applied to given types.
- string addition can only be applied to string series or literals.

Standardizing TypeError messages would improve user experience and reduce maintenance costs.

### Does this PR introduce _any_ user-facing change?
No user-facing behavior change. Only TypeError messages are modified.

### How was this patch tested?

Unit tests.

Closes #33237 from xinrong-databricks/datatypeops_err.

Authored-by: Xinrong Meng <xinrong.meng@databricks.com>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
This commit is contained in:
Xinrong Meng 2021-07-08 12:27:48 -07:00 committed by Takuya UESHIN
parent ee945e99cc
commit 819c482498
7 changed files with 49 additions and 148 deletions

View file

@ -62,7 +62,7 @@ class DateOps(DataTypeOps):
warnings.warn(msg, UserWarning)
return column_op(F.datediff)(left, SF.lit(right)).astype("long")
else:
raise TypeError("date subtraction can only be applied to date series.")
raise TypeError("Date subtraction can only be applied to date series.")
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
# Note that date subtraction casts arguments to integer. This is to mimic pandas's
@ -76,7 +76,7 @@ class DateOps(DataTypeOps):
warnings.warn(msg, UserWarning)
return -column_op(F.datediff)(left, SF.lit(right)).astype("long")
else:
raise TypeError("date subtraction can only be applied to date series.")
raise TypeError("Date subtraction can only be applied to date series.")
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
from pyspark.pandas.base import column_op

View file

@ -67,7 +67,7 @@ class DatetimeOps(DataTypeOps):
),
)
else:
raise TypeError("datetime subtraction can only be applied to datetime series.")
raise TypeError("Datetime subtraction can only be applied to datetime series.")
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
# Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
@ -86,7 +86,7 @@ class DatetimeOps(DataTypeOps):
),
)
else:
raise TypeError("datetime subtraction can only be applied to datetime series.")
raise TypeError("Datetime subtraction can only be applied to datetime series.")
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
from pyspark.pandas.base import column_op

View file

@ -41,7 +41,6 @@ from pyspark.sql.column import Column
from pyspark.sql.types import (
BooleanType,
StringType,
TimestampType,
)
@ -53,39 +52,24 @@ class NumericOps(DataTypeOps):
return "numerics"
def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("string addition can only be applied to string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("addition can not be applied to given types.")
raise TypeError("Addition can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
return column_op(Column.__add__)(left, right)
def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("subtraction can not be applied to string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("subtraction can not be applied to given types.")
raise TypeError("Subtraction can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
return column_op(Column.__sub__)(left, right)
def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("modulo can not be applied on string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("modulo can not be applied to given types.")
raise TypeError("Modulo can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
@ -95,13 +79,8 @@ class NumericOps(DataTypeOps):
return column_op(mod)(left, right)
def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("exponentiation can not be applied on string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("exponentiation can not be applied to given types.")
raise TypeError("Exponentiation can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
@ -111,34 +90,26 @@ class NumericOps(DataTypeOps):
return column_op(pow_func)(left, right)
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("string addition can only be applied to string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("addition can not be applied to given types.")
raise TypeError("Addition can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right)
return column_op(Column.__radd__)(left, right)
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("subtraction can not be applied to string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("subtraction can not be applied to given types.")
raise TypeError("Subtraction can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right)
return column_op(Column.__rsub__)(left, right)
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("multiplication can not be applied to a string literal.")
if not isinstance(right, numbers.Number):
raise TypeError("multiplication can not be applied to given types.")
raise TypeError("Multiplication can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right)
return column_op(Column.__rmul__)(left, right)
def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("exponentiation can not be applied on string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("exponentiation can not be applied to given types.")
raise TypeError("Exponentiation can not be applied to given types.")
def rpow_func(left: Column, right: Any) -> Column:
return F.when(SF.lit(right == 1), right).otherwise(Column.__rpow__(left, right))
@ -147,10 +118,8 @@ class NumericOps(DataTypeOps):
return column_op(rpow_func)(left, right)
def rmod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("modulo can not be applied on string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("modulo can not be applied to given types.")
raise TypeError("Modulo can not be applied to given types.")
def rmod(left: Column, right: Any) -> Column:
return ((right % left) + left) % left
@ -204,30 +173,19 @@ class IntegralOps(NumericOps):
return "integrals"
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("multiplication can not be applied to a string literal.")
if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
raise TypeError("multiplication can not be applied to date times.")
if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
return column_op(SF.repeat)(right, left)
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("multiplication can not be applied to given types.")
raise TypeError("Multiplication can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
return column_op(Column.__mul__)(left, right)
def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("division can not be applied to given types.")
raise TypeError("True division can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
@ -239,13 +197,8 @@ class IntegralOps(NumericOps):
return numpy_column_op(truediv)(left, right)
def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("division can not be applied to given types.")
raise TypeError("Floor division can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
@ -259,10 +212,8 @@ class IntegralOps(NumericOps):
return numpy_column_op(floordiv)(left, right)
def rtruediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("division can not be applied to given types.")
raise TypeError("True division can not be applied to given types.")
def rtruediv(left: Column, right: Any) -> Column:
return F.when(left == 0, SF.lit(np.inf).__div__(right)).otherwise(
@ -273,10 +224,8 @@ class IntegralOps(NumericOps):
return numpy_column_op(rtruediv)(left, right)
def rfloordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("division can not be applied to given types.")
raise TypeError("Floor division can not be applied to given types.")
def rfloordiv(left: Column, right: Any) -> Column:
return F.when(SF.lit(left == 0), SF.lit(np.inf).__div__(right)).otherwise(
@ -310,27 +259,16 @@ class FractionalOps(NumericOps):
return "fractions"
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("multiplication can not be applied to a string literal.")
if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
raise TypeError("multiplication can not be applied to date times.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("multiplication can not be applied to given types.")
raise TypeError("Multiplication can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
return column_op(Column.__mul__)(left, right)
def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("division can not be applied to given types.")
raise TypeError("True division can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
@ -346,13 +284,8 @@ class FractionalOps(NumericOps):
return numpy_column_op(truediv)(left, right)
def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if (
isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType)
) or isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("division can not be applied to given types.")
raise TypeError("Floor division can not be applied to given types.")
right = transform_boolean_operand_to_numeric(right, left.spark.data_type)
@ -370,10 +303,8 @@ class FractionalOps(NumericOps):
return numpy_column_op(floordiv)(left, right)
def rtruediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("division can not be applied to given types.")
raise TypeError("True division can not be applied to given types.")
def rtruediv(left: Column, right: Any) -> Column:
return F.when(left == 0, SF.lit(np.inf).__div__(right)).otherwise(
@ -384,10 +315,8 @@ class FractionalOps(NumericOps):
return numpy_column_op(rtruediv)(left, right)
def rfloordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("division can not be applied on string series or literals.")
if not isinstance(right, numbers.Number):
raise TypeError("division can not be applied to given types.")
raise TypeError("Floor division can not be applied to given types.")
def rfloordiv(left: Column, right: Any) -> Column:
return F.when(SF.lit(left == 0), SF.lit(np.inf).__div__(right)).otherwise(

View file

@ -53,15 +53,9 @@ class StringOps(DataTypeOps):
elif isinstance(right, str):
return column_op(F.concat)(left, SF.lit(right))
else:
raise TypeError("string addition can only be applied to string series or literals.")
def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("subtraction can not be applied to string series or literals.")
raise TypeError("Addition can not be applied to given types.")
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
raise TypeError("multiplication can not be applied to a string literal.")
if (
isinstance(right, IndexOpsMixin)
and isinstance(right.spark.data_type, IntegralType)
@ -69,19 +63,7 @@ class StringOps(DataTypeOps):
) or isinstance(right, int):
return column_op(SF.repeat)(left, right)
else:
raise TypeError("a string series can only be multiplied to an int series or literal")
def truediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("division can not be applied on string series or literals.")
def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("division can not be applied on string series or literals.")
def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("modulo can not be applied on string series or literals.")
def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("exponentiation can not be applied on string series or literals.")
raise TypeError("Multiplication can not be applied to given types.")
def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, str):
@ -90,28 +72,13 @@ class StringOps(DataTypeOps):
left._with_new_scol(F.concat(SF.lit(right), left.spark.column)), # TODO: dtype?
)
else:
raise TypeError("string addition can only be applied to string series or literals.")
def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("subtraction can not be applied to string series or literals.")
raise TypeError("Addition can not be applied to given types.")
def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
if isinstance(right, int):
return column_op(SF.repeat)(left, right)
else:
raise TypeError("a string series can only be multiplied to an int series or literal")
def rtruediv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("division can not be applied on string series or literals.")
def rfloordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("division can not be applied on string series or literals.")
def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("exponentiation can not be applied on string series or literals.")
def rmod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
raise TypeError("modulo can not be applied on string series or literals.")
raise TypeError("Multiplication can not be applied to given types.")
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
from pyspark.pandas.base import column_op

View file

@ -212,7 +212,7 @@ class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: psidx % other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % psidx)
expected_err_msg = "datetime subtraction can only be applied to datetime series."
expected_err_msg = "Datetime subtraction can only be applied to datetime series."
for other in [1, 0.1]:
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: psidx - other)

View file

@ -2347,7 +2347,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psdf["c"] + psdf["d"], pdf["c"] + pdf["d"])
# Negative
ks_err_msg = "string addition can only be applied to string series or literals"
ks_err_msg = "Addition can not be applied to given types"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] + psdf["c"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["c"] + psdf["a"])
@ -2365,12 +2365,13 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Negative
psdf = ps.DataFrame({"a": ["x"], "b": [1]})
ks_err_msg = "subtraction can not be applied to string series or literals"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] - psdf["b"])
ks_err_msg = "Subtraction can not be applied to given types"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] - psdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] - "literal")
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" - psdf["b"])
ks_err_msg = "Subtraction can not be applied to strings"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] - psdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 - psdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] - 1)
@ -2386,23 +2387,27 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Negative
psdf = ps.DataFrame({"a": ["x"], "b": [1]})
ks_err_msg = "division can not be applied on string series or literals"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] / psdf["b"])
ks_err_msg = "True division can not be applied to given types"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] / psdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] / "literal")
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" / psdf["b"])
ks_err_msg = "True division can not be applied to strings"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] / psdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 / psdf["a"])
def test_binary_operator_floordiv(self):
psdf = ps.DataFrame({"a": ["x"], "b": [1]})
ks_err_msg = "division can not be applied on string series or literals"
ks_err_msg = "Floor division can not be applied to strings"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] // psdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 // psdf["a"])
ks_err_msg = "Floor division can not be applied to given types"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] // psdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] // "literal")
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" // psdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 // psdf["a"])
def test_binary_operator_mod(self):
# Positive
@ -2413,11 +2418,12 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Negative
psdf = ps.DataFrame({"a": ["x"], "b": [1]})
ks_err_msg = "modulo can not be applied on string series or literals"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] % psdf["b"])
ks_err_msg = "Modulo can not be applied to given types"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] % psdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] % "literal")
ks_err_msg = "Modulo can not be applied to strings"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] % psdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 % psdf["a"])
def test_binary_operator_multiply(self):
@ -2436,12 +2442,11 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Negative
psdf = ps.DataFrame({"a": ["x"], "b": [2]})
ks_err_msg = "multiplication can not be applied to a string literal"
ks_err_msg = "Multiplication can not be applied to given types"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["b"] * "literal")
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: "literal" * psdf["b"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] * "literal")
ks_err_msg = "a string series can only be multiplied to an int series or literal"
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] * psdf["a"])
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] * 0.1)
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 0.1 * psdf["a"])

View file

@ -70,7 +70,7 @@ class SeriesDateTimeTest(PandasOnSparkTestCase, SQLTestUtils):
psdf = ps.DataFrame(
{"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
)
expected_error_message = "datetime subtraction can only be applied to datetime series."
expected_error_message = "Datetime subtraction can only be applied to datetime series."
with self.assertRaisesRegex(TypeError, expected_error_message):
psdf["a"] - psdf["b"]
with self.assertRaisesRegex(TypeError, expected_error_message):
@ -104,7 +104,7 @@ class SeriesDateTimeTest(PandasOnSparkTestCase, SQLTestUtils):
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: psser % other)
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % psser)
expected_err_msg = "datetime subtraction can only be applied to datetime series."
expected_err_msg = "Datetime subtraction can only be applied to datetime series."
for other in [1, 0.1]:
self.assertRaisesRegex(TypeError, expected_err_msg, lambda: psser - other)
@ -135,7 +135,7 @@ class SeriesDateTimeTest(PandasOnSparkTestCase, SQLTestUtils):
psdf = ps.DataFrame(
{"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
)
expected_error_message = "date subtraction can only be applied to date series."
expected_error_message = "Date subtraction can only be applied to date series."
with self.assertRaisesRegex(TypeError, expected_error_message):
psdf["a"].dt.date - psdf["b"]
with self.assertRaisesRegex(TypeError, expected_error_message):