From 44b7931936d9eff4d8f3054abdef3363af26afb6 Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Mon, 3 May 2021 15:34:24 +0900
Subject: [PATCH] [SPARK-35176][PYTHON] Standardize input validation error type

### What changes were proposed in this pull request?

This PR corrects the exception type raised when function input params fail type validation. To make review easier, the PR is split into 3 commits:

- Standardize input validation error type on sql
- Standardize input validation error type on ml
- Standardize input validation error type on pandas

### Why are the changes needed?

The Python exception doc [1] defines ``TypeError`` as "Raised when an operation or function is applied to an object of inappropriate type.", yet much PySpark code raises ``ValueError`` in exactly these situations; this patch fixes them.

[1] https://docs.python.org/3/library/exceptions.html#TypeError

Note that this patch only addresses the existing wrong exception types raised during input validation; the input validation decorator/framework mentioned in [SPARK-35176](https://issues.apache.org/jira/browse/SPARK-35176) will be submitted in a separate patch.

### Does this PR introduce _any_ user-facing change?

Yes, the code now raises the correct ``TypeError`` instead of ``ValueError``; see the illustrative sketches appended after the patch.

### How was this patch tested?

Existing test cases and new unit tests.

Closes #32368 from Yikun/SPARK-35176.

Authored-by: Yikun Jiang
Signed-off-by: HyukjinKwon
---
 python/docs/source/migration_guide/index.rst | 1 +
 .../migration_guide/pyspark_3.1_to_3.2.rst | 23 ++++++++
 python/pyspark/ml/base.py | 6 +-
 python/pyspark/ml/classification.py | 10 ++--
 python/pyspark/ml/evaluation.py | 2 +-
 python/pyspark/ml/param/__init__.py | 2 +-
 python/pyspark/ml/regression.py | 4 +-
 python/pyspark/ml/tests/test_base.py | 23 ++++++--
 python/pyspark/ml/tests/test_evaluation.py | 6 ++
 python/pyspark/ml/tests/test_param.py | 15 +++++
 python/pyspark/mllib/linalg/distributed.py | 6 +-
 python/pyspark/mllib/tests/test_linalg.py | 13 +++-
 python/pyspark/pandas/base.py | 4 +-
 python/pyspark/pandas/config.py | 4 +-
 python/pyspark/pandas/frame.py | 38 ++++++------
 python/pyspark/pandas/generic.py | 2 +-
 python/pyspark/pandas/groupby.py | 2 +-
 python/pyspark/pandas/indexes/base.py | 2 +-
 python/pyspark/pandas/indexes/multi.py | 2 +-
 python/pyspark/pandas/namespace.py | 4 +-
 python/pyspark/pandas/plot/core.py | 4 +-
 python/pyspark/pandas/series.py | 22 +++----
 python/pyspark/pandas/strings.py | 2 +-
 .../pyspark/pandas/tests/indexes/test_base.py | 22 +++----
 python/pyspark/pandas/tests/test_config.py | 8 +--
 python/pyspark/pandas/tests/test_dataframe.py | 52 ++++++++--------
 python/pyspark/pandas/tests/test_groupby.py | 2 +-
 python/pyspark/pandas/tests/test_namespace.py | 4 +-
 .../pandas/tests/test_ops_on_diff_frames.py | 2 +-
 python/pyspark/pandas/tests/test_series.py | 32 +++++-----
 .../pandas/tests/test_series_string.py | 2 +-
 python/pyspark/pandas/tests/test_utils.py | 2 +-
 python/pyspark/pandas/utils.py | 2 +-
 python/pyspark/sql/dataframe.py | 59 ++++++++++---------
 python/pyspark/sql/tests/test_dataframe.py | 6 +-
 python/pyspark/sql/tests/test_functions.py | 6 +-
 python/pyspark/taskcontext.py | 2 +-
 37 files changed, 234 insertions(+), 164 deletions(-)
 create mode 100644 python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst
diff --git a/python/docs/source/migration_guide/index.rst b/python/docs/source/migration_guide/index.rst index d309d44780..5d30b585dc 100644 --- a/python/docs/source/migration_guide/index.rst +++ b/python/docs/source/migration_guide/index.rst @@ -25,6 +25,7 @@ This page describes the migration guide specific to PySpark. .. toctree:: :maxdepth: 2 + pyspark_3.1_to_3.2 pyspark_2.4_to_3.0 pyspark_2.3_to_2.4 pyspark_2.3.0_to_2.3.1_above diff --git a/python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst b/python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst new file mode 100644 index 0000000000..1537ef8e51 --- /dev/null +++ b/python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +================================= +Upgrading from PySpark 3.1 to 3.2 +================================= + +* In Spark 3.2, the PySpark methods from the sql, ml, spark_on_pandas modules raise ``TypeError`` instead of ``ValueError`` when applied to a param of inappropriate type. diff --git a/python/pyspark/ml/base.py b/python/pyspark/ml/base.py index fa5b553ac2..31ce93d2e6 100644 --- a/python/pyspark/ml/base.py +++ b/python/pyspark/ml/base.py @@ -160,8 +160,8 @@ class Estimator(Params, metaclass=ABCMeta): else: return self._fit(dataset) else: - raise ValueError("Params must be either a param map or a list/tuple of param maps, " - "but got %s." % type(params)) + raise TypeError("Params must be either a param map or a list/tuple of param maps, " + "but got %s." % type(params)) @inherit_doc @@ -216,7 +216,7 @@ class Transformer(Params, metaclass=ABCMeta): else: return self._transform(dataset) else: - raise ValueError("Params must be a param map but got %s." % type(params)) + raise TypeError("Params must be a param map but got %s." % type(params)) @inherit_doc diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 620760905a..cd68ff43a6 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -759,7 +759,7 @@ class LinearSVCModel(_JavaClassificationModel, _LinearSVCParams, JavaMLWritable, Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_lsvc_summary = self._call_java("evaluate", dataset) return LinearSVCSummary(java_lsvc_summary) @@ -1263,7 +1263,7 @@ class LogisticRegressionModel(_JavaProbabilisticClassificationModel, _LogisticRe Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s."
% type(dataset)) java_blr_summary = self._call_java("evaluate", dataset) if self.numClasses <= 2: return BinaryLogisticRegressionSummary(java_blr_summary) @@ -1869,7 +1869,7 @@ class RandomForestClassificationModel(_TreeEnsembleModel, _JavaProbabilisticClas Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_rf_summary = self._call_java("evaluate", dataset) if self.numClasses <= 2: return BinaryRandomForestClassificationSummary(java_rf_summary) @@ -2722,7 +2722,7 @@ class MultilayerPerceptronClassificationModel(_JavaProbabilisticClassificationMo Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_mlp_summary = self._call_java("evaluate", dataset) return MultilayerPerceptronClassificationSummary(java_mlp_summary) @@ -3521,7 +3521,7 @@ class FMClassificationModel(_JavaProbabilisticClassificationModel, _Factorizatio Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_fm_summary = self._call_java("evaluate", dataset) return FMClassificationSummary(java_fm_summary) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index b93d483067..e8cada9075 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -83,7 +83,7 @@ class Evaluator(Params, metaclass=ABCMeta): else: return self._evaluate(dataset) else: - raise ValueError("Params must be a param map but got %s." % type(params)) + raise TypeError("Params must be a param map but got %s." % type(params)) @since("1.5.0") def isLargerBetter(self): diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 3eab6607aa..ab3491c059 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -435,7 +435,7 @@ class Params(Identifiable, metaclass=ABCMeta): elif isinstance(param, str): return self.getParam(param) else: - raise ValueError("Cannot resolve %r as a param." % param) + raise TypeError("Cannot resolve %r as a param." % param) def _testOwnParam(self, param_parent, param_name): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 122322e9f3..a2745db417 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -371,7 +371,7 @@ class LinearRegressionModel(_JavaRegressionModel, _LinearRegressionParams, Gener instance of :py:class:`pyspark.sql.DataFrame` """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_lr_summary = self._call_java("evaluate", dataset) return LinearRegressionSummary(java_lr_summary) @@ -2294,7 +2294,7 @@ class GeneralizedLinearRegressionModel(_JavaRegressionModel, _GeneralizedLinearR instance of :py:class:`pyspark.sql.DataFrame` """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." 
% type(dataset)) java_glr_summary = self._call_java("evaluate", dataset) return GeneralizedLinearRegressionSummary(java_glr_summary) diff --git a/python/pyspark/ml/tests/test_base.py b/python/pyspark/ml/tests/test_base.py index d2c0bdfdf8..28166e2d5c 100644 --- a/python/pyspark/ml/tests/test_base.py +++ b/python/pyspark/ml/tests/test_base.py @@ -19,7 +19,15 @@ import unittest from pyspark.sql.types import DoubleType, IntegerType from pyspark.testing.mlutils import MockDataset, MockEstimator, MockUnaryTransformer, \ - SparkSessionTestCase + MockTransformer, SparkSessionTestCase + + +class TransformerTests(unittest.TestCase): + + def test_transform_invalid_type(self): + transformer = MockTransformer() + data = MockDataset() + self.assertRaises(TypeError, transformer.transform, data, "") class UnaryTransformerTests(SparkSessionTestCase): @@ -52,13 +60,18 @@ class UnaryTransformerTests(SparkSessionTestCase): class EstimatorTest(unittest.TestCase): + def setUp(self): + self.estimator = MockEstimator() + self.data = MockDataset() + + def test_fit_invalid_params(self): + invalid_type_params = "" + self.assertRaises(TypeError, self.estimator.fit, self.data, invalid_type_params) def testDefaultFitMultiple(self): N = 4 - data = MockDataset() - estimator = MockEstimator() - params = [{estimator.fake: i} for i in range(N)] - modelIter = estimator.fitMultiple(data, params) + params = [{self.estimator.fake: i} for i in range(N)] + modelIter = self.estimator.fitMultiple(self.data, params) indexList = [] for index, model in modelIter: self.assertEqual(model.getFake(), index) diff --git a/python/pyspark/ml/tests/test_evaluation.py b/python/pyspark/ml/tests/test_evaluation.py index 746605076f..28df1b26dc 100644 --- a/python/pyspark/ml/tests/test_evaluation.py +++ b/python/pyspark/ml/tests/test_evaluation.py @@ -27,6 +27,12 @@ from pyspark.testing.mlutils import SparkSessionTestCase class EvaluatorTests(SparkSessionTestCase): + def test_evaluate_invalid_type(self): + evaluator = RegressionEvaluator(metricName="r2") + df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)]) + invalid_type = "" + self.assertRaises(TypeError, evaluator.evaluate, df, invalid_type) + def test_java_params(self): """ This tests a bug fixed by SPARK-18274 which causes multiple copies diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 09fe21e9fd..da875588cf 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -30,6 +30,7 @@ from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexT from pyspark.ml.linalg import DenseVector, SparseVector, Vectors from pyspark.ml.param import Param, Params, TypeConverters from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed +from pyspark.ml.regression import LinearRegressionModel, GeneralizedLinearRegressionModel from pyspark.ml.wrapper import JavaParams from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase @@ -197,6 +198,10 @@ class ParamTests(SparkSessionTestCase): self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter) self.assertRaises(AttributeError, lambda: testParams._resolveParam(u"아")) + # Invalid type + invalid_type = 1 + self.assertRaises(TypeError, testParams._resolveParam, invalid_type) + def test_params(self): testParams = TestParams() maxIter = testParams.maxIter @@ -332,6 +337,16 @@ class ParamTests(SparkSessionTestCase): self.assertFalse(binarizer.isSet(binarizer.outputCol))
self.assertEqual(result[0][0], 1.0) + def test_lr_evaluate_invalid_type(self): + lr = LinearRegressionModel() + invalid_type = "" + self.assertRaises(TypeError, lr.evaluate, invalid_type) + + def test_glr_evaluate_invalid_type(self): + glr = GeneralizedLinearRegressionModel() + invalid_type = "" + self.assertRaises(TypeError, glr.evaluate, invalid_type) + class DefaultValuesTests(PySparkTestCase): """ diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index f0e889b15b..0128c204d9 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -465,8 +465,7 @@ class RowMatrix(DistributedMatrix): [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] """ if not isinstance(matrix, DenseMatrix): - raise ValueError("Only multiplication with DenseMatrix " - "is supported.") + raise TypeError("Only multiplication with DenseMatrix is supported.") j_model = self._java_matrix_wrapper.call("multiply", matrix) return RowMatrix(j_model) @@ -854,8 +853,7 @@ class IndexedRowMatrix(DistributedMatrix): [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] """ if not isinstance(matrix, DenseMatrix): - raise ValueError("Only multiplication with DenseMatrix " - "is supported.") + raise TypeError("Only multiplication with DenseMatrix is supported.") return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py index a8303ba434..680076ed94 100644 --- a/python/pyspark/mllib/tests/test_linalg.py +++ b/python/pyspark/mllib/tests/test_linalg.py @@ -26,7 +26,7 @@ from pyspark.mllib.linalg import ( # type: ignore[attr-defined] Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT ) -from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix +from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix, IndexedRow from pyspark.mllib.regression import LabeledPoint from pyspark.sql import Row from pyspark.testing.mllibutils import MLlibTestCase @@ -452,6 +452,17 @@ class VectorUDTTests(MLlibTestCase): with self.assertRaises(IllegalArgumentException): IndexedRowMatrix(df.drop("_1")) + def test_row_matrix_invalid_type(self): + rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]]) + invalid_type = "" + matrix = RowMatrix(rows) + self.assertRaises(TypeError, matrix.multiply, invalid_type) + + irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), + IndexedRow(1, [4, 5, 6])]) + imatrix = IndexedRowMatrix(irows) + self.assertRaises(TypeError, imatrix.multiply, invalid_type) + class MatrixUDTTests(MLlibTestCase): diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 87685eeee1..9eff56d814 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -1498,7 +1498,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): def _shift(self, periods, fill_value, *, part_cols=()): if not isinstance(periods, int): - raise ValueError("periods should be an int; however, got [%s]" % type(periods).__name__) + raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__) col = self.spark.column window = ( @@ -1828,7 +1828,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): ) """ if not is_list_like(indices) or isinstance(indices, (dict, set)): - raise ValueError("`indices` must be a list-like except dict or set") + raise TypeError("`indices` must be a list-like except dict or set")
if isinstance(self, ps.Series): return cast(ps.Series, self.iloc[indices]) else: diff --git a/python/pyspark/pandas/config.py b/python/pyspark/pandas/config.py index 98ce92c85e..93c6c8419d 100644 --- a/python/pyspark/pandas/config.py +++ b/python/pyspark/pandas/config.py @@ -70,7 +70,7 @@ class Option: >>> option.validate('abc') # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: The value for option 'option.name' was <class 'str'>; + TypeError: The value for option 'option.name' was <class 'str'>; however, expected types are [(<class 'float'>, <class 'int'>)]. >>> option.validate(-1.1) @@ -101,7 +101,7 @@ class Option: Validate the given value and throw an exception with related information such as key. """ if not isinstance(v, self.types): - raise ValueError( + raise TypeError( "The value for option '%s' was %s; however, expected types are " "[%s]." % (self.key, type(v), str(self.types)) ) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 2f244e72c4..cf6c8d7bea 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -771,7 +771,7 @@ class DataFrame(Frame, Generic[T]): if not isinstance(other, DataFrame) and ( isinstance(other, IndexOpsMixin) or is_sequence(other) ): - raise ValueError( + raise TypeError( "%s with a sequence is currently not supported; " "however, got %s." % (op, type(other).__name__) ) @@ -2936,7 +2936,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import first_series if not is_name_like_value(key): - raise ValueError("'key' should be a scalar value or tuple that contains scalar values") + raise TypeError("'key' should be a scalar value or tuple that contains scalar values") if level is not None and is_name_like_tuple(key): raise KeyError(key) @@ -3301,7 +3301,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] ] kdf[tmp_cond_col_names] = cond else: - raise ValueError("type of cond must be a DataFrame or Series") + raise TypeError("type of cond must be a DataFrame or Series") tmp_other_col_names = [ tmp_other_col_name(name_like_string(label)) for label in self._internal.column_labels @@ -3431,7 +3431,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import Series if not isinstance(cond, (DataFrame, Series)): - raise ValueError("type of cond must be a DataFrame or Series") + raise TypeError("type of cond must be a DataFrame or Series") cond_inversed = cond._apply_series_op(lambda kser: ~kser) return self.where(cond_inversed, other) @@ -3997,7 +3997,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] assert allow_duplicates is False if not is_name_like_value(column): - raise ValueError( + raise TypeError( '"column" should be a scalar value or tuple that contains scalar values' ) @@ -4289,7 +4289,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] elif isinstance(decimals, int): decimals = {k: decimals for k in self._internal.column_labels} else: - raise ValueError("decimals must be an integer, a dict-like or a Series") + raise TypeError("decimals must be an integer, a dict-like or a Series") def op(kser): label = kser._column_label @@ -5660,7 +5660,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] will output the original DataFrame, simply ignoring the incompatible types.
""" if is_list_like(lower) or is_list_like(upper): - raise ValueError( + raise TypeError( "List-like value are not supported for 'lower' and 'upper' at the " + "moment" ) @@ -5941,12 +5941,12 @@ defaultdict(, {'col..., 'col...})] small 5.5 2.333333 17 13 """ if not is_name_like_value(columns): - raise ValueError("columns should be one column name.") + raise TypeError("columns should be one column name.") if not is_name_like_value(values) and not ( isinstance(values, list) and all(is_name_like_value(v) for v in values) ): - raise ValueError("values should be one column or list of columns.") + raise TypeError("values should be one column or list of columns.") if not isinstance(aggfunc, str) and ( not isinstance(aggfunc, dict) @@ -5954,7 +5954,7 @@ defaultdict(, {'col..., 'col...})] is_name_like_value(key) and isinstance(value, str) for key, value in aggfunc.items() ) ): - raise ValueError( + raise TypeError( "aggfunc must be a dict mapping from column name " "to aggregate functions (string)." ) @@ -6031,7 +6031,7 @@ defaultdict(, {'col..., 'col...})] .agg(*agg_cols) ) else: - raise ValueError("index should be a None or a list of columns.") + raise TypeError("index should be a None or a list of columns.") if fill_value is not None and isinstance(fill_value, (int, float)): sdf = sdf.fillna(fill_value) @@ -7940,7 +7940,7 @@ defaultdict(, {'col..., 'col...})] 3 3 4 """ if isinstance(other, ps.Series): - raise ValueError("DataFrames.append() does not support appending Series to DataFrames") + raise TypeError("DataFrames.append() does not support appending Series to DataFrames") if sort: raise NotImplementedError("The 'sort' parameter is currently not supported") @@ -10726,7 +10726,7 @@ defaultdict(, {'col..., 'col...})] raise NotImplementedError('axis should be either 0 or "index" currently.') if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) @@ -10735,7 +10735,7 @@ defaultdict(, {'col..., 'col...})] for v in q if isinstance(q, list) else [q]: if not isinstance(v, float): - raise ValueError( + raise TypeError( "q must be a float or an array of floats; however, [%s] found." 
% type(v) ) if v < 0.0 or v > 1.0: @@ -10904,9 +10904,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})] 0 1 10 10 """ if isinstance(self.columns, pd.MultiIndex): - raise ValueError("Doesn't support for MultiIndex columns") + raise TypeError("Doesn't support for MultiIndex columns") if not isinstance(expr, str): - raise ValueError( + raise TypeError( "expr must be a string to be evaluated, {} given".format(type(expr).__name__) ) inplace = validate_bool_kwarg(inplace, "inplace") @@ -11012,7 +11012,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] """ axis = validate_axis(axis) if not is_list_like(indices) or isinstance(indices, (dict, set)): - raise ValueError("`indices` must be a list-like except dict or set") + raise TypeError("`indices` must be a list-like except dict or set") if axis == 0: return cast(DataFrame, self.iloc[indices, :]) else: @@ -11098,7 +11098,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import first_series if isinstance(self.columns, pd.MultiIndex): - raise ValueError("`eval` is not supported for multi-index columns") + raise TypeError("`eval` is not supported for multi-index columns") inplace = validate_bool_kwarg(inplace, "inplace") should_return_series = False series_name = None @@ -11179,7 +11179,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import Series if not is_name_like_value(column): - raise ValueError("column must be a scalar") + raise TypeError("column must be a scalar") kdf = DataFrame(self._internal.resolved_copy) # type: "DataFrame" kser = kdf[column] diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 0140ed5e2a..9eede11a50 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -1895,7 +1895,7 @@ class Frame(object, metaclass=ABCMeta): numeric_only = True if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 072a26739d..4bb43394f8 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -2416,7 +2416,7 @@ class GroupBy(object, metaclass=ABCMeta): Name: b, dtype: float64 """ if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py index 5cc1fd6f53..c6918f303d 100644 --- a/python/pyspark/pandas/indexes/base.py +++ b/python/pyspark/pandas/indexes/base.py @@ -2076,7 +2076,7 @@ class Index(IndexOpsMixin): MultiIndex([], ) """ if not isinstance(repeats, int): - raise ValueError( + raise TypeError( "`repeats` argument must be integer, but got {}".format(type(repeats).__name__) ) elif repeats < 0: diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index 939bdaba4a..04717982e6 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -342,7 +342,7 @@ class MultiIndex(Index): if names is None: names = df._internal.column_labels elif not is_list_like(names): - raise ValueError("Names should be list-like for a MultiIndex") + raise TypeError("Names should be list-like for a MultiIndex") else: names = [name if is_name_like_tuple(name) else (name,) for name in names] diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index 599206f42a..682abbd94d 100644 ---
a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -126,7 +126,7 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series, elif isinstance(pobj, pd.Index): return DataFrame(pd.DataFrame(index=pobj)).index else: - raise ValueError("Unknown data type: {}".format(type(pobj).__name__)) + raise TypeError("Unknown data type: {}".format(type(pobj).__name__)) _range = range # built-in range @@ -2770,7 +2770,7 @@ def broadcast(obj) -> DataFrame: ... """ if not isinstance(obj, DataFrame): - raise ValueError("Invalid type : expected DataFrame got {}".format(type(obj).__name__)) + raise TypeError("Invalid type : expected DataFrame got {}".format(type(obj).__name__)) return DataFrame( obj._internal.with_new_sdf(F.broadcast(obj._internal.resolved_copy.spark_frame)) ) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index d8359d9abc..ac4d606267 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -40,7 +40,7 @@ class TopNPlotBase: if isinstance(data, (Series, DataFrame)): data = data.head(max_rows + 1).to_pandas() else: - raise ValueError("Only DataFrame and Series are supported for plotting.") + raise TypeError("Only DataFrame and Series are supported for plotting.") self.partial = False if len(data) > max_rows: @@ -80,7 +80,7 @@ class SampledPlotBase: sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction) return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas() else: - raise ValueError("Only DataFrame and Series are supported for plotting.") + raise TypeError("Only DataFrame and Series are supported for plotting.") def set_result_text(self, ax): assert hasattr(self, "fraction") diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index 63fb73b65e..467393bad7 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -2016,7 +2016,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): original Series, simply ignoring the incompatible types. """ if is_list_like(lower) or is_list_like(upper): - raise ValueError( + raise TypeError( "List-like value are not supported for 'lower' and 'upper' at the " + "moment" ) @@ -3182,7 +3182,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): elif isinstance(func, str): return getattr(self, func)() else: - raise ValueError("func must be a string or list of strings") + raise TypeError("func must be a string or list of strings") agg = aggregate @@ -3345,7 +3345,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Name: x, dtype: float64 """ if not isinstance(decimals, int): - raise ValueError("decimals must be an integer") + raise TypeError("decimals must be an integer") scol = F.round(self.spark.column, decimals) return self._with_new_scol(scol) @@ -3402,12 +3402,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ).rename(self.name) else: if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) if not isinstance(q, float): - raise ValueError( + raise TypeError( "q must be a float or an array of floats; however, [%s] found." 
% type(q) ) if q < 0.0 or q > 1.0: @@ -3639,7 +3639,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): def _diff(self, periods, *, part_cols=()): if not isinstance(periods, int): - raise ValueError("periods should be an int; however, got [%s]" % type(periods).__name__) + raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__) window = ( Window.partitionBy(*part_cols) .orderBy(NATURAL_ORDER_COLUMN_NAME) @@ -3984,7 +3984,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): dtype: float64 """ if not is_name_like_value(item): - raise ValueError("'key' should be string or tuple that contains strings") + raise TypeError("'key' should be string or tuple that contains strings") if not is_name_like_tuple(item): item = (item,) if self._internal.index_level < len(item): @@ -4328,7 +4328,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): if to_replace is None: return self.fillna(method="ffill") if not isinstance(to_replace, (str, list, tuple, dict, int, float)): - raise ValueError("'to_replace' should be one of str, list, tuple, dict, int, float") + raise TypeError("'to_replace' should be one of str, list, tuple, dict, int, float") if regex: raise NotImplementedError("replace currently not support for regex") to_replace = list(to_replace) if isinstance(to_replace, tuple) else to_replace @@ -4438,7 +4438,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): >>> reset_option("compute.ops_on_diff_frames") """ if not isinstance(other, Series): - raise ValueError("'other' must be a Series") + raise TypeError("'other' must be a Series") combined = combine_frames(self._kdf, other._kdf, how="leftouter") @@ -4813,7 +4813,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): dtype: float64 """ if not isinstance(other, ps.Series): - raise ValueError("`combine_first` only allows `Series` for parameter `other`") + raise TypeError("`combine_first` only allows `Series` for parameter `other`") if same_anchor(self, other): this = self.spark.column that = other.spark.column @@ -4977,7 +4977,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Series([], dtype: int64) """ if not isinstance(repeats, (int, Series)): - raise ValueError( + raise TypeError( "`repeats` argument must be integer or Series, but got {}".format(type(repeats)) ) diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py index b70256d643..2ae2940081 100644 --- a/python/pyspark/pandas/strings.py +++ b/python/pyspark/pandas/strings.py @@ -1489,7 +1489,7 @@ class StringMethods(object): dtype: object """ if not isinstance(repeats, int): - raise ValueError("repeats expects an int parameter") + raise TypeError("repeats expects an int parameter") return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats)) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> "ps.Series": diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index a0eb243a6c..87656c9239 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -1356,7 +1356,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assert_eq((kidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) self.assertRaises(ValueError, lambda: kidx.repeat(-1)) - self.assertRaises(ValueError, lambda: kidx.repeat("abc")) + self.assertRaises(TypeError, lambda: kidx.repeat("abc")) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) kmidx = ps.from_pandas(pmidx) @@ -1365,7 
+1365,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(kmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) self.assertRaises(ValueError, lambda: kmidx.repeat(-1)) - self.assertRaises(ValueError, lambda: kmidx.repeat("abc")) + self.assertRaises(TypeError, lambda: kmidx.repeat("abc")) def test_unique(self): pidx = pd.Index(["a", "b", "a"]) @@ -1618,14 +1618,14 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): ) # Checking the type of indices. - self.assertRaises(ValueError, lambda: kidx.take(1)) - self.assertRaises(ValueError, lambda: kidx.take("1")) - self.assertRaises(ValueError, lambda: kidx.take({1, 2})) - self.assertRaises(ValueError, lambda: kidx.take({1: None, 2: None})) - self.assertRaises(ValueError, lambda: kmidx.take(1)) - self.assertRaises(ValueError, lambda: kmidx.take("1")) - self.assertRaises(ValueError, lambda: kmidx.take({1, 2})) - self.assertRaises(ValueError, lambda: kmidx.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kidx.take(1)) + self.assertRaises(TypeError, lambda: kidx.take("1")) + self.assertRaises(TypeError, lambda: kidx.take({1, 2})) + self.assertRaises(TypeError, lambda: kidx.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kmidx.take(1)) + self.assertRaises(TypeError, lambda: kmidx.take("1")) + self.assertRaises(TypeError, lambda: kmidx.take({1, 2})) + self.assertRaises(TypeError, lambda: kmidx.take({1: None, 2: None})) def test_index_get_level_values(self): pidx = pd.Index([1, 2, 3], name="ks") @@ -1998,7 +1998,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): with self.assertRaisesRegex(TypeError, err_msg): ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]}) - self.assertRaises(ValueError, lambda: ps.MultiIndex.from_frame(kdf, names="ab")) + self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(kdf, names="ab")) # non-string names self.assert_eq( diff --git a/python/pyspark/pandas/tests/test_config.py b/python/pyspark/pandas/tests/test_config.py index ba717a9712..0709caddf8 100644 --- a/python/pyspark/pandas/tests/test_config.py +++ b/python/pyspark/pandas/tests/test_config.py @@ -77,16 +77,16 @@ class ConfigTest(PandasOnSparkTestCase): self.assertEqual(ps.get_option("test.config.int.none"), None) def test_different_types(self): - with self.assertRaisesRegex(ValueError, "was <class 'int'>"): + with self.assertRaisesRegex(TypeError, "was <class 'int'>"): ps.set_option("test.config.list", 1) - with self.assertRaisesRegex(ValueError, "however, expected types are"): + with self.assertRaisesRegex(TypeError, "however, expected types are"): ps.set_option("test.config.float", "abc") - with self.assertRaisesRegex(ValueError, "[<class 'int'>]"): + with self.assertRaisesRegex(TypeError, "[<class 'int'>]"): ps.set_option("test.config.int", "abc") - with self.assertRaisesRegex(ValueError, "(<class 'int'>, <class 'NoneType'>)"): + with self.assertRaisesRegex(TypeError, "(<class 'int'>, <class 'NoneType'>)"): ps.set_option("test.config.int.none", "abc") def test_check_func(self): diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index d7cb3ab359..7577f012e6 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -214,7 +214,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ValueError, "cannot insert b, already exists", lambda: kdf.insert(1, "b", 10) ) self.assertRaisesRegex( - ValueError, + TypeError, '"column" should be a scalar value or tuple that contains scalar values', lambda: kdf.insert(0, list("abc"), kser), ) @@ -2206,7 +2206,7 @@ class
DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # Assert appending a Series fails msg = "DataFrames.append() does not support appending Series to DataFrames" - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kdf.append(kdf["A"]) # Assert using the sort parameter raises an exception @@ -2286,9 +2286,9 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # Assert list-like values are not accepted for 'lower' and 'upper' msg = "List-like value are not supported for 'lower' and 'upper' at the moment" - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kdf.clip(lower=[1]) - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kdf.clip(upper=[1]) # Assert no lower or upper @@ -2323,7 +2323,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) self.assertRaisesRegex( - ValueError, + TypeError, "add with a sequence is currently not supported", lambda: ps.range(10).add(ps.range(10).id), ) @@ -3060,7 +3060,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(KeyError, lambda: kdf.pivot_table(index=["c"], columns="a", values=5)) msg = "index should be a None or a list of columns." - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.pivot_table(index="c", columns="a", values="b") msg = "pivot_table doesn't support aggfunc as dict and without index." @@ -3068,7 +3068,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): kdf.pivot_table(columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"}) msg = "columns should be one column name." - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.pivot_table(columns=["a"], values=["b"], aggfunc={"b": "mean", "e": "sum"}) msg = "Columns in aggfunc must be the same as values." @@ -3843,7 +3843,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pdf.round({"A": 1, "D": 2}), kdf.round({"A": 1, "D": 2})) self.assert_eq(pdf.round(pser), kdf.round(kser)) msg = "decimals must be an integer, a dict-like or a Series" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.round(1.5) # multi-index columns @@ -3894,7 +3894,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) self.assert_eq(pdf1, kdf.shift(periods=3, fill_value=0)) msg = "should be an int" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.shift(1.5) # multi-index columns @@ -3916,7 +3916,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pdf.diff().sum().astype(int), kdf.diff().sum()) msg = "should be an int" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.diff(1.5) msg = 'axis should be either 0 or "index" currently.' with self.assertRaisesRegex(NotImplementedError, msg): @@ -4491,11 +4491,11 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): NotImplementedError, 'axis should be either 0 or "index" currently.' 
): kdf.quantile(0.5, axis=1) - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): kdf.quantile(accuracy="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): kdf.quantile(q="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): kdf.quantile(q=["a"]) self.assert_eq(kdf.quantile(0.5, numeric_only=False), pdf.quantile(0.5, numeric_only=False)) @@ -4541,13 +4541,13 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): def test_where(self): kdf = ps.from_pandas(self.pdf) - with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"): + with self.assertRaisesRegex(TypeError, "type of cond must be a DataFrame or Series"): kdf.where(1) def test_mask(self): kdf = ps.from_pandas(self.pdf) - with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"): + with self.assertRaisesRegex(TypeError, "type of cond must be a DataFrame or Series"): kdf.mask(1) def test_query(self): @@ -4575,7 +4575,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): invalid_exprs = (1, 1.0, (exprs[0],), [exprs[0]]) for expr in invalid_exprs: with self.assertRaisesRegex( - ValueError, + TypeError, "expr must be a string to be evaluated, {} given".format(type(expr).__name__), ): kdf.query(expr) @@ -4584,7 +4584,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): invalid_inplaces = (1, 0, "True", "False") for inplace in invalid_inplaces: with self.assertRaisesRegex( - ValueError, + TypeError, 'For argument "inplace" expected type bool, received type {}.'.format( type(inplace).__name__ ), @@ -4594,7 +4594,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # doesn't support for MultiIndex columns columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")]) kdf.columns = columns - with self.assertRaisesRegex(ValueError, "Doesn't support for MultiIndex columns"): + with self.assertRaisesRegex(TypeError, "Doesn't support for MultiIndex columns"): kdf.query("('A', 'Z') > ('B', 'X')") def test_take(self): @@ -4683,10 +4683,10 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) # Checking the type of indices. 
- self.assertRaises(ValueError, lambda: kdf.take(1)) - self.assertRaises(ValueError, lambda: kdf.take("1")) - self.assertRaises(ValueError, lambda: kdf.take({1, 2})) - self.assertRaises(ValueError, lambda: kdf.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kdf.take(1)) + self.assertRaises(TypeError, lambda: kdf.take("1")) + self.assertRaises(TypeError, lambda: kdf.take({1, 2})) + self.assertRaises(TypeError, lambda: kdf.take({1: None, 2: None})) def test_axes(self): pdf = self.pdf @@ -4739,7 +4739,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # doesn't support for multi-index columns columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b"), ("z", "c")]) kdf.columns = columns - self.assertRaises(ValueError, lambda: kdf.eval("x.a + y.b")) + self.assertRaises(TypeError, lambda: kdf.eval("x.a + y.b")) @unittest.skipIf(not have_tabulate, tabulate_requirement_message) def test_to_markdown(self): @@ -4972,7 +4972,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kdf.explode("A").index.name, expected_result1.index.name) self.assert_eq(kdf.explode("A").columns.name, expected_result1.columns.name) - self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"])) + self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"])) # MultiIndex midx = pd.MultiIndex.from_tuples( @@ -4997,7 +4997,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kdf.explode("A").index.names, expected_result1.index.names) self.assert_eq(kdf.explode("A").columns.name, expected_result1.columns.name) - self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"])) + self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"])) # MultiIndex columns columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X")], names=["column1", "column2"]) @@ -5022,7 +5022,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kdf.A.explode("Z"), expected_result3, almost=True) - self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"])) + self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"])) self.assertRaises(ValueError, lambda: kdf.explode("A")) def test_spark_schema(self): diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index a6d006fad9..dde3162604 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -2643,7 +2643,7 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): ) self.assert_eq(expected_result, kdf.groupby("a")["b"].median().sort_index()) - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): kdf.groupby("a").median(accuracy="a") def test_tail(self): diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py index e8787397e1..c45c65316b 100644 --- a/python/pyspark/pandas/tests/test_namespace.py +++ b/python/pyspark/pandas/tests/test_namespace.py @@ -48,7 +48,7 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kmidx, pmidx) expected_error_message = "Unknown data type: {}".format(type(kidx).__name__) - with self.assertRaisesRegex(ValueError, expected_error_message): + with self.assertRaisesRegex(TypeError, expected_error_message): ps.from_pandas(kidx) def test_to_datetime(self): @@ -303,7 +303,7 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils): expected_error_message = "Invalid type : expected DataFrame got 
{}".format( type(kser).__name__ ) - with self.assertRaisesRegex(ValueError, expected_error_message): + with self.assertRaisesRegex(TypeError, expected_error_message): ps.broadcast(kser) def test_get_index_map(self): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index a998414542..31a296f45f 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -622,7 +622,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index() ) with self.assertRaisesRegex( - ValueError, "`combine_first` only allows `Series` for parameter `other`" + TypeError, "`combine_first` only allows `Series` for parameter `other`" ): kser1.combine_first(50) diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index eae26bc4c8..c9f3f7e363 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -1109,9 +1109,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): # Assert list-like values are not accepted for 'lower' and 'upper' msg = "List-like value are not supported for 'lower' and 'upper' at the moment" - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kser.clip(lower=[1]) - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kser.clip(upper=[1]) # Assert no lower or upper @@ -1324,7 +1324,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pser.cumprod(skipna=False).astype(int), kser.cumprod(skipna=False)) def test_median(self): - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a") def test_rank(self): @@ -1347,7 +1347,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): kser = ps.from_pandas(pser) self.assert_eq(pser.round(2), kser.round(2)) msg = "decimals must be an integer" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.round(1.5) def test_quantile(self): @@ -1357,11 +1357,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.quantile(0.5), pser.quantile(0.5)) self.assert_eq(kser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75])) - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(accuracy="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): @@ -1433,7 +1433,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.shift(periods=2), pser.shift(periods=2)) else: self.assert_eq(kser.shift(periods=2, fill_value=0), 
pser.shift(periods=2, fill_value=0)) - with self.assertRaisesRegex(ValueError, "periods should be an int; however"): + with self.assertRaisesRegex(TypeError, "periods should be an int; however"): kser.shift(periods=1.5) def test_diff(self): @@ -1602,7 +1602,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ps.Series(pser) msg = "func must be a string or list of strings" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.aggregate({"x": ["min", "max"]}) msg = ( "If the given function is a list, it " "should only contains function names as strings." @@ -1692,7 +1692,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50))) msg = "'to_replace' should be one of str, list, tuple, dict, int, float" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.replace(ps.range(5)) msg = "Replacement lists must match in length. Expecting 3 got 2" with self.assertRaisesRegex(ValueError, msg): @@ -1734,7 +1734,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): kser = ps.Series(pser) msg = "'other' must be a Series" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.update(10) def test_where(self): @@ -1883,7 +1883,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index()) self.assertRaises(ValueError, lambda: kser.repeat(-1)) - self.assertRaises(ValueError, lambda: kser.repeat("abc")) + self.assertRaises(TypeError, lambda: kser.repeat("abc")) pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3)) kdf = ps.from_pandas(pdf) @@ -1904,10 +1904,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): ) # Checking the type of indices. 
- self.assertRaises(ValueError, lambda: kser.take(1)) - self.assertRaises(ValueError, lambda: kser.take("1")) - self.assertRaises(ValueError, lambda: kser.take({1, 2})) - self.assertRaises(ValueError, lambda: kser.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kser.take(1)) + self.assertRaises(TypeError, lambda: kser.take("1")) + self.assertRaises(TypeError, lambda: kser.take({1, 2})) + self.assertRaises(TypeError, lambda: kser.take({1: None, 2: None})) def test_divmod(self): pser = pd.Series([100, None, 300, None, 500], name="Koalas") diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py index 69a9ab3424..057db77360 100644 --- a/python/pyspark/pandas/tests/test_series_string.py +++ b/python/pyspark/pandas/tests/test_series_string.py @@ -241,7 +241,7 @@ class SeriesStringTest(PandasOnSparkTestCase, SQLTestUtils): def test_string_repeat(self): self.check_func(lambda x: x.str.repeat(repeats=3)) - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) def test_string_replace(self): diff --git a/python/pyspark/pandas/tests/test_utils.py b/python/pyspark/pandas/tests/test_utils.py index 2f4039ba20..a3595486e0 100644 --- a/python/pyspark/pandas/tests/test_utils.py +++ b/python/pyspark/pandas/tests/test_utils.py @@ -78,7 +78,7 @@ class UtilsTest(PandasOnSparkTestCase, SQLTestUtils): # This should fail because we are explicitly setting a non-boolean value koalas = "true" with self.assertRaisesRegex( - ValueError, 'For argument "koalas" expected type bool, received type str.' + TypeError, 'For argument "koalas" expected type bool, received type str.' ): validate_bool_kwarg(koalas, "koalas") diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py index fa98f14fe5..948a786c63 100644 --- a/python/pyspark/pandas/utils.py +++ b/python/pyspark/pandas/utils.py @@ -689,7 +689,7 @@ def validate_axis(axis=0, none_axis=0): def validate_bool_kwarg(value, arg_name): """ Ensures that argument passed in arg_name is of type bool. 
""" if not (isinstance(value, bool) or value is None): - raise ValueError( + raise TypeError( 'For argument "{}" expected type bool, received ' "type {}.".format(arg_name, type(value).__name__) ) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index a31e3d95d0..8fe263e152 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1134,12 +1134,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(col, str): col = Column(col) elif not isinstance(col, Column): - raise ValueError("col must be a string or a column, but got %r" % type(col)) + raise TypeError("col must be a string or a column, but got %r" % type(col)) if not isinstance(fractions, dict): - raise ValueError("fractions must be a dict but got %r" % type(fractions)) + raise TypeError("fractions must be a dict but got %r" % type(fractions)) for k, v in fractions.items(): if not isinstance(k, (float, int, str)): - raise ValueError("key must be float, int, or string, but got %r" % type(k)) + raise TypeError("key must be float, int, or string, but got %r" % type(k)) fractions[k] = float(v) col = col._jc seed = seed if seed is not None else random.randint(0, sys.maxsize) @@ -1225,7 +1225,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): +----+ """ if not isinstance(colName, str): - raise ValueError("colName should be provided as string") + raise TypeError("colName should be provided as string") jc = self._jdf.colRegex(colName) return Column(jc) @@ -2009,7 +2009,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): elif isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): - raise ValueError("subset should be a list or tuple of column names") + raise TypeError("subset should be a list or tuple of column names") if thresh is None: thresh = len(subset) if how == 'any' else 1 @@ -2067,7 +2067,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): +---+------+-------+ """ if not isinstance(value, (float, int, str, bool, dict)): - raise ValueError("value should be a float, int, string, bool or dict") + raise TypeError("value should be a float, int, string, bool or dict") # Note that bool validates isinstance(int), but we don't want to # convert bools to floats @@ -2083,7 +2083,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): - raise ValueError("subset should be a list or tuple of column names") + raise TypeError("subset should be a list or tuple of column names") return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) @@ -2186,15 +2186,15 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): # Validate input types valid_types = (bool, float, int, str, list, tuple) if not isinstance(to_replace, valid_types + (dict, )): - raise ValueError( + raise TypeError( "to_replace should be a bool, float, int, string, list, tuple, or dict. " "Got {0}".format(type(to_replace))) if not isinstance(value, valid_types) and value is not None \ and not isinstance(to_replace, dict): - raise ValueError("If to_replace is not a dict, value should be " - "a bool, float, int, string, list, tuple or None. " - "Got {0}".format(type(value))) + raise TypeError("If to_replace is not a dict, value should be " + "a bool, float, int, string, list, tuple or None. 
" + "Got {0}".format(type(value))) if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): if len(to_replace) != len(value): @@ -2202,8 +2202,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): "Got {0} and {1}".format(len(to_replace), len(value))) if not (subset is None or isinstance(subset, (list, tuple, str))): - raise ValueError("subset should be a list or tuple of column names, " - "column name or None. Got {0}".format(type(subset))) + raise TypeError("subset should be a list or tuple of column names, " + "column name or None. Got {0}".format(type(subset))) # Reshape input arguments if necessary if isinstance(to_replace, (float, int, str)): @@ -2285,7 +2285,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """ if not isinstance(col, (str, list, tuple)): - raise ValueError("col should be a string, list or tuple, but got %r" % type(col)) + raise TypeError("col should be a string, list or tuple, but got %r" % type(col)) isStr = isinstance(col, str) @@ -2296,11 +2296,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): for c in col: if not isinstance(c, str): - raise ValueError("columns should be strings, but got %r" % type(c)) + raise TypeError("columns should be strings, but got %r" % type(c)) col = _to_list(self._sc, col) if not isinstance(probabilities, (list, tuple)): - raise ValueError("probabilities should be a list or tuple") + raise TypeError("probabilities should be a list or tuple") if isinstance(probabilities, tuple): probabilities = list(probabilities) for p in probabilities: @@ -2308,8 +2308,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise ValueError("probabilities should be numerical (float, int) in [0,1].") probabilities = _to_list(self._sc, probabilities) - if not isinstance(relativeError, (float, int)) or relativeError < 0: - raise ValueError("relativeError should be numerical (float, int) >= 0.") + if not isinstance(relativeError, (float, int)): + raise TypeError("relativeError should be numerical (float, int)") + if relativeError < 0: + raise ValueError("relativeError should be >= 0.") relativeError = float(relativeError) jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) @@ -2334,9 +2336,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): The correlation method. Currently only supports "pearson" """ if not isinstance(col1, str): - raise ValueError("col1 should be a string.") + raise TypeError("col1 should be a string.") if not isinstance(col2, str): - raise ValueError("col2 should be a string.") + raise TypeError("col2 should be a string.") if not method: method = "pearson" if not method == "pearson": @@ -2359,9 +2361,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): The name of the second column """ if not isinstance(col1, str): - raise ValueError("col1 should be a string.") + raise TypeError("col1 should be a string.") if not isinstance(col2, str): - raise ValueError("col2 should be a string.") + raise TypeError("col2 should be a string.") return self._jdf.stat().cov(col1, col2) def crosstab(self, col1, col2): @@ -2386,9 +2388,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): of the :class:`DataFrame`. 
""" if not isinstance(col1, str): - raise ValueError("col1 should be a string.") + raise TypeError("col1 should be a string.") if not isinstance(col2, str): - raise ValueError("col2 should be a string.") + raise TypeError("col2 should be a string.") return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx) def freqItems(self, cols, support=None): @@ -2418,7 +2420,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(cols, tuple): cols = list(cols) if not isinstance(cols, list): - raise ValueError("cols must be a list or tuple of column names as strings.") + raise TypeError("cols must be a list or tuple of column names as strings.") if not support: support = 0.01 return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx) @@ -2453,7 +2455,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)] """ - assert isinstance(col, Column), "col should be Column" + if not isinstance(col, Column): + raise TypeError("col should be Column") return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) def withColumnRenamed(self, existing, new): @@ -2597,8 +2600,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): True """ if not isinstance(other, DataFrame): - raise ValueError("other parameter should be of DataFrame; however, got %s" - % type(other)) + raise TypeError("other parameter should be of DataFrame; however, got %s" + % type(other)) return self._jdf.sameSemantics(other._jdf) def semanticHash(self): diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index e3977e8185..3e961cba7e 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -319,7 +319,7 @@ class DataFrameTests(ReusedSQLTestCase): self.assertTupleEqual(row, (u'Alice', 20, None)) # should fail if subset is not list, tuple or None - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace({10: 11}, subset=1).first() @@ -329,7 +329,7 @@ class DataFrameTests(ReusedSQLTestCase): [(u'Alice', 10, 80.1)], schema).replace(["Alice", "Bob"], ["Eve"]).first() # should fail if when received unexpected type - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): from datetime import datetime self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace(datetime.now(), datetime.now()).first() @@ -818,7 +818,7 @@ class DataFrameTests(ReusedSQLTestCase): def test_same_semantics_error(self): with QuietTest(self.sc): - with self.assertRaisesRegex(ValueError, "should be of DataFrame.*int"): + with self.assertRaisesRegex(TypeError, "should be of DataFrame.*int"): self.spark.range(10).sameSemantics(1) def test_input_files(self): diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 112043e792..f49b7b2f35 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -279,9 +279,9 @@ class FunctionsTests(ReusedSQLTestCase): self.assertTrue(isinstance(aqt[1], list)) self.assertEqual(len(aqt[1]), 3) self.assertTrue(all(isinstance(q, float) for q in aqt[1])) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1)) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1)) - self.assertRaises(ValueError, lambda: 
df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1)) + self.assertRaises(TypeError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1)) + self.assertRaises(TypeError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1)) + self.assertRaises(TypeError, lambda: df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1)) def test_sorting_functions_with_column(self): from pyspark.sql import functions diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index 091c7636c8..1afbe63084 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -236,7 +236,7 @@ class BarrierTaskContext(TaskContext): This API is experimental """ if not isinstance(message, str): - raise ValueError("Argument `message` must be of type `str`") + raise TypeError("Argument `message` must be of type `str`") elif self._port is None or self._secret is None: raise Exception("Not supported to call barrier() before initialize " + "BarrierTaskContext.")