diff --git a/python/docs/source/migration_guide/index.rst b/python/docs/source/migration_guide/index.rst index d309d44780..5d30b585dc 100644 --- a/python/docs/source/migration_guide/index.rst +++ b/python/docs/source/migration_guide/index.rst @@ -25,6 +25,7 @@ This page describes the migration guide specific to PySpark. .. toctree:: :maxdepth: 2 + pyspark_3.1_to_3.2 pyspark_2.4_to_3.0 pyspark_2.3_to_2.4 pyspark_2.3.0_to_2.3.1_above diff --git a/python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst b/python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst new file mode 100644 index 0000000000..1537ef8e51 --- /dev/null +++ b/python/docs/source/migration_guide/pyspark_3.1_to_3.2.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +================================= +Upgrading from PySpark 3.1 to 3.2 +================================= + +* In Spark 3.2, the PySpark methods from the sql, ml, and spark_on_pandas modules raise ``TypeError`` instead of ``ValueError`` when applied to a param of inappropriate type. diff --git a/python/pyspark/ml/base.py b/python/pyspark/ml/base.py index fa5b553ac2..31ce93d2e6 100644 --- a/python/pyspark/ml/base.py +++ b/python/pyspark/ml/base.py @@ -160,8 +160,8 @@ class Estimator(Params, metaclass=ABCMeta): else: return self._fit(dataset) else: - raise ValueError("Params must be either a param map or a list/tuple of param maps, " - "but got %s." % type(params)) + raise TypeError("Params must be either a param map or a list/tuple of param maps, " + "but got %s." % type(params)) @inherit_doc @@ -216,7 +216,7 @@ class Transformer(Params, metaclass=ABCMeta): else: return self._transform(dataset) else: - raise ValueError("Params must be a param map but got %s." % type(params)) + raise TypeError("Params must be a param map but got %s." % type(params)) @inherit_doc diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 620760905a..cd68ff43a6 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -759,7 +759,7 @@ class LinearSVCModel(_JavaClassificationModel, _LinearSVCParams, JavaMLWritable, Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_lsvc_summary = self._call_java("evaluate", dataset) return LinearSVCSummary(java_lsvc_summary) @@ -1263,7 +1263,7 @@ class LogisticRegressionModel(_JavaProbabilisticClassificationModel, _LogisticRe Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s."
% type(dataset)) java_blr_summary = self._call_java("evaluate", dataset) if self.numClasses <= 2: return BinaryLogisticRegressionSummary(java_blr_summary) @@ -1869,7 +1869,7 @@ class RandomForestClassificationModel(_TreeEnsembleModel, _JavaProbabilisticClas Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_rf_summary = self._call_java("evaluate", dataset) if self.numClasses <= 2: return BinaryRandomForestClassificationSummary(java_rf_summary) @@ -2722,7 +2722,7 @@ class MultilayerPerceptronClassificationModel(_JavaProbabilisticClassificationMo Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_mlp_summary = self._call_java("evaluate", dataset) return MultilayerPerceptronClassificationSummary(java_mlp_summary) @@ -3521,7 +3521,7 @@ class FMClassificationModel(_JavaProbabilisticClassificationModel, _Factorizatio Test dataset to evaluate model on. """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_fm_summary = self._call_java("evaluate", dataset) return FMClassificationSummary(java_fm_summary) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index b93d483067..e8cada9075 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -83,7 +83,7 @@ class Evaluator(Params, metaclass=ABCMeta): else: return self._evaluate(dataset) else: - raise ValueError("Params must be a param map but got %s." % type(params)) + raise TypeError("Params must be a param map but got %s." % type(params)) @since("1.5.0") def isLargerBetter(self): diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 3eab6607aa..ab3491c059 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -435,7 +435,7 @@ class Params(Identifiable, metaclass=ABCMeta): elif isinstance(param, str): return self.getParam(param) else: - raise ValueError("Cannot resolve %r as a param." % param) + raise TypeError("Cannot resolve %r as a param." % param) def _testOwnParam(self, param_parent, param_name): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 122322e9f3..a2745db417 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -371,7 +371,7 @@ class LinearRegressionModel(_JavaRegressionModel, _LinearRegressionParams, Gener instance of :py:class:`pyspark.sql.DataFrame` """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." % type(dataset)) java_lr_summary = self._call_java("evaluate", dataset) return LinearRegressionSummary(java_lr_summary) @@ -2294,7 +2294,7 @@ class GeneralizedLinearRegressionModel(_JavaRegressionModel, _GeneralizedLinearR instance of :py:class:`pyspark.sql.DataFrame` """ if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) + raise TypeError("dataset must be a DataFrame but got %s." 
% type(dataset)) java_glr_summary = self._call_java("evaluate", dataset) return GeneralizedLinearRegressionSummary(java_glr_summary) diff --git a/python/pyspark/ml/tests/test_base.py b/python/pyspark/ml/tests/test_base.py index d2c0bdfdf8..28166e2d5c 100644 --- a/python/pyspark/ml/tests/test_base.py +++ b/python/pyspark/ml/tests/test_base.py @@ -19,7 +19,15 @@ import unittest from pyspark.sql.types import DoubleType, IntegerType from pyspark.testing.mlutils import MockDataset, MockEstimator, MockUnaryTransformer, \ - SparkSessionTestCase + MockTransformer, SparkSessionTestCase + + +class TransformerTests(unittest.TestCase): + + def test_transform_invalid_type(self): + transformer = MockTransformer() + data = MockDataset() + self.assertRaises(TypeError, transformer.transform, data, "") class UnaryTransformerTests(SparkSessionTestCase): @@ -52,13 +60,18 @@ class UnaryTransformerTests(SparkSessionTestCase): class EstimatorTest(unittest.TestCase): + def setUp(self): + self.estimator = MockEstimator() + self.data = MockDataset() + + def test_fit_invalid_params(self): + invalid_type_params = "" + self.assertRaises(TypeError, self.estimator.fit, self.data, invalid_type_params) def testDefaultFitMultiple(self): N = 4 - data = MockDataset() - estimator = MockEstimator() - params = [{estimator.fake: i} for i in range(N)] - modelIter = estimator.fitMultiple(data, params) + params = [{self.estimator.fake: i} for i in range(N)] + modelIter = self.estimator.fitMultiple(self.data, params) indexList = [] for index, model in modelIter: self.assertEqual(model.getFake(), index) diff --git a/python/pyspark/ml/tests/test_evaluation.py b/python/pyspark/ml/tests/test_evaluation.py index 746605076f..28df1b26dc 100644 --- a/python/pyspark/ml/tests/test_evaluation.py +++ b/python/pyspark/ml/tests/test_evaluation.py @@ -27,6 +27,12 @@ from pyspark.testing.mlutils import SparkSessionTestCase class EvaluatorTests(SparkSessionTestCase): + def test_evaluate_invalid_type(self): + evaluator = RegressionEvaluator(metricName="r2") + df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)]) + invalid_type = "" + self.assertRaises(TypeError, evaluator.evaluate, df, invalid_type) + def test_java_params(self): """ This tests a bug fixed by SPARK-18274 which causes multiple copies diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 09fe21e9fd..da875588cf 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -30,6 +30,7 @@ from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexT from pyspark.ml.linalg import DenseVector, SparseVector, Vectors from pyspark.ml.param import Param, Params, TypeConverters from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed +from pyspark.ml.regression import LinearRegressionModel, GeneralizedLinearRegressionModel from pyspark.ml.wrapper import JavaParams from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase @@ -197,6 +198,10 @@ class ParamTests(SparkSessionTestCase): self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter) self.assertRaises(AttributeError, lambda: testParams._resolveParam(u"아")) + # Invalid type + invalid_type = 1 + self.assertRaises(TypeError, testParams._resolveParam, invalid_type) + def test_params(self): testParams = TestParams() maxIter = testParams.maxIter @@ -332,6 +337,16 @@ class ParamTests(SparkSessionTestCase): self.assertFalse(binarizer.isSet(binarizer.outputCol))
self.assertEqual(result[0][0], 1.0) + def test_lr_evaluate_invalid_type(self): + lr = LinearRegressionModel() + invalid_type = "" + self.assertRaises(TypeError, lr.evaluate, invalid_type) + + def test_glr_evaluate_invalid_type(self): + glr = GeneralizedLinearRegressionModel() + invalid_type = "" + self.assertRaises(TypeError, glr.evaluate, invalid_type) + class DefaultValuesTests(PySparkTestCase): """ diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index f0e889b15b..0128c204d9 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -465,8 +465,7 @@ class RowMatrix(DistributedMatrix): [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] """ if not isinstance(matrix, DenseMatrix): - raise ValueError("Only multiplication with DenseMatrix " - "is supported.") + raise TypeError("Only multiplication with DenseMatrix is supported.") j_model = self._java_matrix_wrapper.call("multiply", matrix) return RowMatrix(j_model) @@ -854,8 +853,7 @@ class IndexedRowMatrix(DistributedMatrix): [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] """ if not isinstance(matrix, DenseMatrix): - raise ValueError("Only multiplication with DenseMatrix " - "is supported.") + raise TypeError("Only multiplication with DenseMatrix is supported.") return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py index a8303ba434..680076ed94 100644 --- a/python/pyspark/mllib/tests/test_linalg.py +++ b/python/pyspark/mllib/tests/test_linalg.py @@ -26,7 +26,7 @@ from pyspark.mllib.linalg import ( # type: ignore[attr-defined] Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT ) -from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix +from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix, IndexedRow from pyspark.mllib.regression import LabeledPoint from pyspark.sql import Row from pyspark.testing.mllibutils import MLlibTestCase @@ -452,6 +452,17 @@ class VectorUDTTests(MLlibTestCase): with self.assertRaises(IllegalArgumentException): IndexedRowMatrix(df.drop("_1")) + def test_row_matrix_invalid_type(self): + rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]]) + invalid_type = "" + matrix = RowMatrix(rows) + self.assertRaises(TypeError, matrix.multiply, invalid_type) + + irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), + IndexedRow(1, [4, 5, 6])]) + imatrix = IndexedRowMatrix(irows) + self.assertRaises(TypeError, imatrix.multiply, invalid_type) + class MatrixUDTTests(MLlibTestCase): diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 87685eeee1..9eff56d814 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -1498,7 +1498,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): def _shift(self, periods, fill_value, *, part_cols=()): if not isinstance(periods, int): - raise ValueError("periods should be an int; however, got [%s]" % type(periods).__name__) + raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__) col = self.spark.column window = ( @@ -1828,7 +1828,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): ) """ if not is_list_like(indices) or isinstance(indices, (dict, set)): - raise ValueError("`indices` must be a list-like except dict or set") + raise TypeError("`indices` must be a list-like except dict or set")
if isinstance(self, ps.Series): return cast(ps.Series, self.iloc[indices]) else: diff --git a/python/pyspark/pandas/config.py b/python/pyspark/pandas/config.py index 98ce92c85e..93c6c8419d 100644 --- a/python/pyspark/pandas/config.py +++ b/python/pyspark/pandas/config.py @@ -70,7 +70,7 @@ class Option: >>> option.validate('abc') # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: The value for option 'option.name' was <class 'str'>; + TypeError: The value for option 'option.name' was <class 'str'>; however, expected types are [(<class 'float'>, <class 'int'>)]. >>> option.validate(-1.1) @@ -101,7 +101,7 @@ class Option: Validate the given value and throw an exception with related information such as key. """ if not isinstance(v, self.types): - raise ValueError( + raise TypeError( "The value for option '%s' was %s; however, expected types are " "[%s]." % (self.key, type(v), str(self.types)) ) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 2f244e72c4..cf6c8d7bea 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -771,7 +771,7 @@ class DataFrame(Frame, Generic[T]): if not isinstance(other, DataFrame) and ( isinstance(other, IndexOpsMixin) or is_sequence(other) ): - raise ValueError( + raise TypeError( "%s with a sequence is currently not supported; " "however, got %s." % (op, type(other).__name__) ) @@ -2936,7 +2936,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import first_series if not is_name_like_value(key): - raise ValueError("'key' should be a scalar value or tuple that contains scalar values") + raise TypeError("'key' should be a scalar value or tuple that contains scalar values") if level is not None and is_name_like_tuple(key): raise KeyError(key) @@ -3301,7 +3301,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] ] kdf[tmp_cond_col_names] = cond else: - raise ValueError("type of cond must be a DataFrame or Series") + raise TypeError("type of cond must be a DataFrame or Series") tmp_other_col_names = [ tmp_other_col_name(name_like_string(label)) for label in self._internal.column_labels @@ -3431,7 +3431,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import Series if not isinstance(cond, (DataFrame, Series)): - raise ValueError("type of cond must be a DataFrame or Series") + raise TypeError("type of cond must be a DataFrame or Series") cond_inversed = cond._apply_series_op(lambda kser: ~kser) return self.where(cond_inversed, other) @@ -3997,7 +3997,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] assert allow_duplicates is False if not is_name_like_value(column): - raise ValueError( + raise TypeError( '"column" should be a scalar value or tuple that contains scalar values' ) @@ -4289,7 +4289,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] elif isinstance(decimals, int): decimals = {k: decimals for k in self._internal.column_labels} else: - raise ValueError("decimals must be an integer, a dict-like or a Series") + raise TypeError("decimals must be an integer, a dict-like or a Series") def op(kser): label = kser._column_label @@ -5660,7 +5660,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] will output the original DataFrame, simply ignoring the incompatible types.
""" if is_list_like(lower) or is_list_like(upper): - raise ValueError( + raise TypeError( "List-like value are not supported for 'lower' and 'upper' at the " + "moment" ) @@ -5941,12 +5941,12 @@ defaultdict(, {'col..., 'col...})] small 5.5 2.333333 17 13 """ if not is_name_like_value(columns): - raise ValueError("columns should be one column name.") + raise TypeError("columns should be one column name.") if not is_name_like_value(values) and not ( isinstance(values, list) and all(is_name_like_value(v) for v in values) ): - raise ValueError("values should be one column or list of columns.") + raise TypeError("values should be one column or list of columns.") if not isinstance(aggfunc, str) and ( not isinstance(aggfunc, dict) @@ -5954,7 +5954,7 @@ defaultdict(, {'col..., 'col...})] is_name_like_value(key) and isinstance(value, str) for key, value in aggfunc.items() ) ): - raise ValueError( + raise TypeError( "aggfunc must be a dict mapping from column name " "to aggregate functions (string)." ) @@ -6031,7 +6031,7 @@ defaultdict(, {'col..., 'col...})] .agg(*agg_cols) ) else: - raise ValueError("index should be a None or a list of columns.") + raise TypeError("index should be a None or a list of columns.") if fill_value is not None and isinstance(fill_value, (int, float)): sdf = sdf.fillna(fill_value) @@ -7940,7 +7940,7 @@ defaultdict(, {'col..., 'col...})] 3 3 4 """ if isinstance(other, ps.Series): - raise ValueError("DataFrames.append() does not support appending Series to DataFrames") + raise TypeError("DataFrames.append() does not support appending Series to DataFrames") if sort: raise NotImplementedError("The 'sort' parameter is currently not supported") @@ -10726,7 +10726,7 @@ defaultdict(, {'col..., 'col...})] raise NotImplementedError('axis should be either 0 or "index" currently.') if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) @@ -10735,7 +10735,7 @@ defaultdict(, {'col..., 'col...})] for v in q if isinstance(q, list) else [q]: if not isinstance(v, float): - raise ValueError( + raise TypeError( "q must be a float or an array of floats; however, [%s] found." 
% type(v) ) if v < 0.0 or v > 1.0: @@ -10904,9 +10904,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})] 0 1 10 10 """ if isinstance(self.columns, pd.MultiIndex): - raise ValueError("Doesn't support for MultiIndex columns") + raise TypeError("Doesn't support for MultiIndex columns") if not isinstance(expr, str): - raise ValueError( + raise TypeError( "expr must be a string to be evaluated, {} given".format(type(expr).__name__) ) inplace = validate_bool_kwarg(inplace, "inplace") @@ -11012,7 +11012,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] """ axis = validate_axis(axis) if not is_list_like(indices) or isinstance(indices, (dict, set)): - raise ValueError("`indices` must be a list-like except dict or set") + raise TypeError("`indices` must be a list-like except dict or set") if axis == 0: return cast(DataFrame, self.iloc[indices, :]) else: @@ -11098,7 +11098,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import first_series if isinstance(self.columns, pd.MultiIndex): - raise ValueError("`eval` is not supported for multi-index columns") + raise TypeError("`eval` is not supported for multi-index columns") inplace = validate_bool_kwarg(inplace, "inplace") should_return_series = False series_name = None @@ -11179,7 +11179,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] from pyspark.pandas.series import Series if not is_name_like_value(column): - raise ValueError("column must be a scalar") + raise TypeError("column must be a scalar") kdf = DataFrame(self._internal.resolved_copy) # type: "DataFrame" kser = kdf[column] diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 0140ed5e2a..9eede11a50 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -1895,7 +1895,7 @@ class Frame(object, metaclass=ABCMeta): numeric_only = True if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 072a26739d..4bb43394f8 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -2416,7 +2416,7 @@ class GroupBy(object, metaclass=ABCMeta): Name: b, dtype: float64 """ if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py index 5cc1fd6f53..c6918f303d 100644 --- a/python/pyspark/pandas/indexes/base.py +++ b/python/pyspark/pandas/indexes/base.py @@ -2076,7 +2076,7 @@ class Index(IndexOpsMixin): MultiIndex([], ) """ if not isinstance(repeats, int): - raise ValueError( + raise TypeError( "`repeats` argument must be integer, but got {}".format(type(repeats).__name__) ) elif repeats < 0: diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index 939bdaba4a..04717982e6 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -342,7 +342,7 @@ class MultiIndex(Index): if names is None: names = df._internal.column_labels elif not is_list_like(names): - raise ValueError("Names should be list-like for a MultiIndex") + raise TypeError("Names should be list-like for a MultiIndex") else: names = [name if is_name_like_tuple(name) else (name,) for name in names] diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index 599206f42a..682abbd94d 100644 ---
a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -126,7 +126,7 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series, elif isinstance(pobj, pd.Index): return DataFrame(pd.DataFrame(index=pobj)).index else: - raise ValueError("Unknown data type: {}".format(type(pobj).__name__)) + raise TypeError("Unknown data type: {}".format(type(pobj).__name__)) _range = range # built-in range @@ -2770,7 +2770,7 @@ def broadcast(obj) -> DataFrame: ... """ if not isinstance(obj, DataFrame): - raise ValueError("Invalid type : expected DataFrame got {}".format(type(obj).__name__)) + raise TypeError("Invalid type : expected DataFrame got {}".format(type(obj).__name__)) return DataFrame( obj._internal.with_new_sdf(F.broadcast(obj._internal.resolved_copy.spark_frame)) ) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index d8359d9abc..ac4d606267 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -40,7 +40,7 @@ class TopNPlotBase: if isinstance(data, (Series, DataFrame)): data = data.head(max_rows + 1).to_pandas() else: - raise ValueError("Only DataFrame and Series are supported for plotting.") + raise TypeError("Only DataFrame and Series are supported for plotting.") self.partial = False if len(data) > max_rows: @@ -80,7 +80,7 @@ class SampledPlotBase: sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction) return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas() else: - raise ValueError("Only DataFrame and Series are supported for plotting.") + raise TypeError("Only DataFrame and Series are supported for plotting.") def set_result_text(self, ax): assert hasattr(self, "fraction") diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index 63fb73b65e..467393bad7 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -2016,7 +2016,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): original Series, simply ignoring the incompatible types. """ if is_list_like(lower) or is_list_like(upper): - raise ValueError( + raise TypeError( "List-like value are not supported for 'lower' and 'upper' at the " + "moment" ) @@ -3182,7 +3182,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): elif isinstance(func, str): return getattr(self, func)() else: - raise ValueError("func must be a string or list of strings") + raise TypeError("func must be a string or list of strings") agg = aggregate @@ -3345,7 +3345,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Name: x, dtype: float64 """ if not isinstance(decimals, int): - raise ValueError("decimals must be an integer") + raise TypeError("decimals must be an integer") scol = F.round(self.spark.column, decimals) return self._with_new_scol(scol) @@ -3402,12 +3402,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ).rename(self.name) else: if not isinstance(accuracy, int): - raise ValueError( + raise TypeError( "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ ) if not isinstance(q, float): - raise ValueError( + raise TypeError( "q must be a float or an array of floats; however, [%s] found." 
% type(q) ) if q < 0.0 or q > 1.0: @@ -3639,7 +3639,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): def _diff(self, periods, *, part_cols=()): if not isinstance(periods, int): - raise ValueError("periods should be an int; however, got [%s]" % type(periods).__name__) + raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__) window = ( Window.partitionBy(*part_cols) .orderBy(NATURAL_ORDER_COLUMN_NAME) @@ -3984,7 +3984,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): dtype: float64 """ if not is_name_like_value(item): - raise ValueError("'key' should be string or tuple that contains strings") + raise TypeError("'key' should be string or tuple that contains strings") if not is_name_like_tuple(item): item = (item,) if self._internal.index_level < len(item): @@ -4328,7 +4328,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): if to_replace is None: return self.fillna(method="ffill") if not isinstance(to_replace, (str, list, tuple, dict, int, float)): - raise ValueError("'to_replace' should be one of str, list, tuple, dict, int, float") + raise TypeError("'to_replace' should be one of str, list, tuple, dict, int, float") if regex: raise NotImplementedError("replace currently not support for regex") to_replace = list(to_replace) if isinstance(to_replace, tuple) else to_replace @@ -4438,7 +4438,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): >>> reset_option("compute.ops_on_diff_frames") """ if not isinstance(other, Series): - raise ValueError("'other' must be a Series") + raise TypeError("'other' must be a Series") combined = combine_frames(self._kdf, other._kdf, how="leftouter") @@ -4813,7 +4813,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): dtype: float64 """ if not isinstance(other, ps.Series): - raise ValueError("`combine_first` only allows `Series` for parameter `other`") + raise TypeError("`combine_first` only allows `Series` for parameter `other`") if same_anchor(self, other): this = self.spark.column that = other.spark.column @@ -4977,7 +4977,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Series([], dtype: int64) """ if not isinstance(repeats, (int, Series)): - raise ValueError( + raise TypeError( "`repeats` argument must be integer or Series, but got {}".format(type(repeats)) ) diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py index b70256d643..2ae2940081 100644 --- a/python/pyspark/pandas/strings.py +++ b/python/pyspark/pandas/strings.py @@ -1489,7 +1489,7 @@ class StringMethods(object): dtype: object """ if not isinstance(repeats, int): - raise ValueError("repeats expects an int parameter") + raise TypeError("repeats expects an int parameter") return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats)) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> "ps.Series": diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index a0eb243a6c..87656c9239 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -1356,7 +1356,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assert_eq((kidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) self.assertRaises(ValueError, lambda: kidx.repeat(-1)) - self.assertRaises(ValueError, lambda: kidx.repeat("abc")) + self.assertRaises(TypeError, lambda: kidx.repeat("abc")) pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) kmidx = ps.from_pandas(pmidx) @@ -1365,7 
+1365,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(kmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) self.assertRaises(ValueError, lambda: kmidx.repeat(-1)) - self.assertRaises(ValueError, lambda: kmidx.repeat("abc")) + self.assertRaises(TypeError, lambda: kmidx.repeat("abc")) def test_unique(self): pidx = pd.Index(["a", "b", "a"]) @@ -1618,14 +1618,14 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): ) # Checking the type of indices. - self.assertRaises(ValueError, lambda: kidx.take(1)) - self.assertRaises(ValueError, lambda: kidx.take("1")) - self.assertRaises(ValueError, lambda: kidx.take({1, 2})) - self.assertRaises(ValueError, lambda: kidx.take({1: None, 2: None})) - self.assertRaises(ValueError, lambda: kmidx.take(1)) - self.assertRaises(ValueError, lambda: kmidx.take("1")) - self.assertRaises(ValueError, lambda: kmidx.take({1, 2})) - self.assertRaises(ValueError, lambda: kmidx.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kidx.take(1)) + self.assertRaises(TypeError, lambda: kidx.take("1")) + self.assertRaises(TypeError, lambda: kidx.take({1, 2})) + self.assertRaises(TypeError, lambda: kidx.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kmidx.take(1)) + self.assertRaises(TypeError, lambda: kmidx.take("1")) + self.assertRaises(TypeError, lambda: kmidx.take({1, 2})) + self.assertRaises(TypeError, lambda: kmidx.take({1: None, 2: None})) def test_index_get_level_values(self): pidx = pd.Index([1, 2, 3], name="ks") @@ -1998,7 +1998,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): with self.assertRaisesRegex(TypeError, err_msg): ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]}) - self.assertRaises(ValueError, lambda: ps.MultiIndex.from_frame(kdf, names="ab")) + self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(kdf, names="ab")) # non-string names self.assert_eq( diff --git a/python/pyspark/pandas/tests/test_config.py b/python/pyspark/pandas/tests/test_config.py index ba717a9712..0709caddf8 100644 --- a/python/pyspark/pandas/tests/test_config.py +++ b/python/pyspark/pandas/tests/test_config.py @@ -77,16 +77,16 @@ class ConfigTest(PandasOnSparkTestCase): self.assertEqual(ps.get_option("test.config.int.none"), None) def test_different_types(self): - with self.assertRaisesRegex(ValueError, "was <class 'int'>"): + with self.assertRaisesRegex(TypeError, "was <class 'int'>"): ps.set_option("test.config.list", 1) - with self.assertRaisesRegex(ValueError, "however, expected types are"): + with self.assertRaisesRegex(TypeError, "however, expected types are"): ps.set_option("test.config.float", "abc") - with self.assertRaisesRegex(ValueError, "[<class 'int'>]"): + with self.assertRaisesRegex(TypeError, "[<class 'int'>]"): ps.set_option("test.config.int", "abc") - with self.assertRaisesRegex(ValueError, "(<class 'int'>, <class 'NoneType'>)"): + with self.assertRaisesRegex(TypeError, "(<class 'int'>, <class 'NoneType'>)"): ps.set_option("test.config.int.none", "abc") def test_check_func(self): diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index d7cb3ab359..7577f012e6 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -214,7 +214,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ValueError, "cannot insert b, already exists", lambda: kdf.insert(1, "b", 10) ) self.assertRaisesRegex( - ValueError, + TypeError, '"column" should be a scalar value or tuple that contains scalar values', lambda: kdf.insert(0, list("abc"), kser), ) @@ -2206,7 +2206,7 @@ class
DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # Assert appending a Series fails msg = "DataFrames.append() does not support appending Series to DataFrames" - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kdf.append(kdf["A"]) # Assert using the sort parameter raises an exception @@ -2286,9 +2286,9 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # Assert list-like values are not accepted for 'lower' and 'upper' msg = "List-like value are not supported for 'lower' and 'upper' at the moment" - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kdf.clip(lower=[1]) - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kdf.clip(upper=[1]) # Assert no lower or upper @@ -2323,7 +2323,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) self.assertRaisesRegex( - ValueError, + TypeError, "add with a sequence is currently not supported", lambda: ps.range(10).add(ps.range(10).id), ) @@ -3060,7 +3060,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(KeyError, lambda: kdf.pivot_table(index=["c"], columns="a", values=5)) msg = "index should be a None or a list of columns." - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.pivot_table(index="c", columns="a", values="b") msg = "pivot_table doesn't support aggfunc as dict and without index." @@ -3068,7 +3068,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): kdf.pivot_table(columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"}) msg = "columns should be one column name." - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.pivot_table(columns=["a"], values=["b"], aggfunc={"b": "mean", "e": "sum"}) msg = "Columns in aggfunc must be the same as values." @@ -3843,7 +3843,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pdf.round({"A": 1, "D": 2}), kdf.round({"A": 1, "D": 2})) self.assert_eq(pdf.round(pser), kdf.round(kser)) msg = "decimals must be an integer, a dict-like or a Series" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.round(1.5) # multi-index columns @@ -3894,7 +3894,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) self.assert_eq(pdf1, kdf.shift(periods=3, fill_value=0)) msg = "should be an int" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.shift(1.5) # multi-index columns @@ -3916,7 +3916,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pdf.diff().sum().astype(int), kdf.diff().sum()) msg = "should be an int" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kdf.diff(1.5) msg = 'axis should be either 0 or "index" currently.' with self.assertRaisesRegex(NotImplementedError, msg): @@ -4491,11 +4491,11 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): NotImplementedError, 'axis should be either 0 or "index" currently.' 
): kdf.quantile(0.5, axis=1) - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): kdf.quantile(accuracy="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): kdf.quantile(q="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): kdf.quantile(q=["a"]) self.assert_eq(kdf.quantile(0.5, numeric_only=False), pdf.quantile(0.5, numeric_only=False)) @@ -4541,13 +4541,13 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): def test_where(self): kdf = ps.from_pandas(self.pdf) - with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"): + with self.assertRaisesRegex(TypeError, "type of cond must be a DataFrame or Series"): kdf.where(1) def test_mask(self): kdf = ps.from_pandas(self.pdf) - with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"): + with self.assertRaisesRegex(TypeError, "type of cond must be a DataFrame or Series"): kdf.mask(1) def test_query(self): @@ -4575,7 +4575,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): invalid_exprs = (1, 1.0, (exprs[0],), [exprs[0]]) for expr in invalid_exprs: with self.assertRaisesRegex( - ValueError, + TypeError, "expr must be a string to be evaluated, {} given".format(type(expr).__name__), ): kdf.query(expr) @@ -4584,7 +4584,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): invalid_inplaces = (1, 0, "True", "False") for inplace in invalid_inplaces: with self.assertRaisesRegex( - ValueError, + TypeError, 'For argument "inplace" expected type bool, received type {}.'.format( type(inplace).__name__ ), @@ -4594,7 +4594,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # doesn't support for MultiIndex columns columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")]) kdf.columns = columns - with self.assertRaisesRegex(ValueError, "Doesn't support for MultiIndex columns"): + with self.assertRaisesRegex(TypeError, "Doesn't support for MultiIndex columns"): kdf.query("('A', 'Z') > ('B', 'X')") def test_take(self): @@ -4683,10 +4683,10 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) # Checking the type of indices. 
- self.assertRaises(ValueError, lambda: kdf.take(1)) - self.assertRaises(ValueError, lambda: kdf.take("1")) - self.assertRaises(ValueError, lambda: kdf.take({1, 2})) - self.assertRaises(ValueError, lambda: kdf.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kdf.take(1)) + self.assertRaises(TypeError, lambda: kdf.take("1")) + self.assertRaises(TypeError, lambda: kdf.take({1, 2})) + self.assertRaises(TypeError, lambda: kdf.take({1: None, 2: None})) def test_axes(self): pdf = self.pdf @@ -4739,7 +4739,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): # doesn't support for multi-index columns columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b"), ("z", "c")]) kdf.columns = columns - self.assertRaises(ValueError, lambda: kdf.eval("x.a + y.b")) + self.assertRaises(TypeError, lambda: kdf.eval("x.a + y.b")) @unittest.skipIf(not have_tabulate, tabulate_requirement_message) def test_to_markdown(self): @@ -4972,7 +4972,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kdf.explode("A").index.name, expected_result1.index.name) self.assert_eq(kdf.explode("A").columns.name, expected_result1.columns.name) - self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"])) + self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"])) # MultiIndex midx = pd.MultiIndex.from_tuples( @@ -4997,7 +4997,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kdf.explode("A").index.names, expected_result1.index.names) self.assert_eq(kdf.explode("A").columns.name, expected_result1.columns.name) - self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"])) + self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"])) # MultiIndex columns columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X")], names=["column1", "column2"]) @@ -5022,7 +5022,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kdf.A.explode("Z"), expected_result3, almost=True) - self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"])) + self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"])) self.assertRaises(ValueError, lambda: kdf.explode("A")) def test_spark_schema(self): diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index a6d006fad9..dde3162604 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -2643,7 +2643,7 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): ) self.assert_eq(expected_result, kdf.groupby("a")["b"].median().sort_index()) - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): kdf.groupby("a").median(accuracy="a") def test_tail(self): diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py index e8787397e1..c45c65316b 100644 --- a/python/pyspark/pandas/tests/test_namespace.py +++ b/python/pyspark/pandas/tests/test_namespace.py @@ -48,7 +48,7 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kmidx, pmidx) expected_error_message = "Unknown data type: {}".format(type(kidx).__name__) - with self.assertRaisesRegex(ValueError, expected_error_message): + with self.assertRaisesRegex(TypeError, expected_error_message): ps.from_pandas(kidx) def test_to_datetime(self): @@ -303,7 +303,7 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils): expected_error_message = "Invalid type : expected DataFrame got 
{}".format( type(kser).__name__ ) - with self.assertRaisesRegex(ValueError, expected_error_message): + with self.assertRaisesRegex(TypeError, expected_error_message): ps.broadcast(kser) def test_get_index_map(self): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index a998414542..31a296f45f 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -622,7 +622,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index() ) with self.assertRaisesRegex( - ValueError, "`combine_first` only allows `Series` for parameter `other`" + TypeError, "`combine_first` only allows `Series` for parameter `other`" ): kser1.combine_first(50) diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index eae26bc4c8..c9f3f7e363 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -1109,9 +1109,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): # Assert list-like values are not accepted for 'lower' and 'upper' msg = "List-like value are not supported for 'lower' and 'upper' at the moment" - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kser.clip(lower=[1]) - with self.assertRaises(ValueError, msg=msg): + with self.assertRaises(TypeError, msg=msg): kser.clip(upper=[1]) # Assert no lower or upper @@ -1324,7 +1324,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pser.cumprod(skipna=False).astype(int), kser.cumprod(skipna=False)) def test_median(self): - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a") def test_rank(self): @@ -1347,7 +1347,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): kser = ps.from_pandas(pser) self.assert_eq(pser.round(2), kser.round(2)) msg = "decimals must be an integer" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.round(1.5) def test_quantile(self): @@ -1357,11 +1357,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.quantile(0.5), pser.quantile(0.5)) self.assert_eq(kser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75])) - with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"): + with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(accuracy="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a") - with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"): + with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"): ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"]) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): @@ -1433,7 +1433,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.shift(periods=2), pser.shift(periods=2)) else: self.assert_eq(kser.shift(periods=2, fill_value=0), 
pser.shift(periods=2, fill_value=0)) - with self.assertRaisesRegex(ValueError, "periods should be an int; however"): + with self.assertRaisesRegex(TypeError, "periods should be an int; however"): kser.shift(periods=1.5) def test_diff(self): @@ -1602,7 +1602,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): pser = pd.Series([10, 20, 15, 30, 45], name="x") kser = ps.Series(pser) msg = "func must be a string or list of strings" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.aggregate({"x": ["min", "max"]}) msg = ( "If the given function is a list, it " "should only contains function names as strings." @@ -1692,7 +1692,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50))) msg = "'to_replace' should be one of str, list, tuple, dict, int, float" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.replace(ps.range(5)) msg = "Replacement lists must match in length. Expecting 3 got 2" with self.assertRaisesRegex(ValueError, msg): @@ -1734,7 +1734,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): kser = ps.Series(pser) msg = "'other' must be a Series" - with self.assertRaisesRegex(ValueError, msg): + with self.assertRaisesRegex(TypeError, msg): kser.update(10) def test_where(self): @@ -1883,7 +1883,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index()) self.assertRaises(ValueError, lambda: kser.repeat(-1)) - self.assertRaises(ValueError, lambda: kser.repeat("abc")) + self.assertRaises(TypeError, lambda: kser.repeat("abc")) pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3)) kdf = ps.from_pandas(pdf) @@ -1904,10 +1904,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): ) # Checking the type of indices. 
- self.assertRaises(ValueError, lambda: kser.take(1)) - self.assertRaises(ValueError, lambda: kser.take("1")) - self.assertRaises(ValueError, lambda: kser.take({1, 2})) - self.assertRaises(ValueError, lambda: kser.take({1: None, 2: None})) + self.assertRaises(TypeError, lambda: kser.take(1)) + self.assertRaises(TypeError, lambda: kser.take("1")) + self.assertRaises(TypeError, lambda: kser.take({1, 2})) + self.assertRaises(TypeError, lambda: kser.take({1: None, 2: None})) def test_divmod(self): pser = pd.Series([100, None, 300, None, 500], name="Koalas") diff --git a/python/pyspark/pandas/tests/test_series_string.py b/python/pyspark/pandas/tests/test_series_string.py index 69a9ab3424..057db77360 100644 --- a/python/pyspark/pandas/tests/test_series_string.py +++ b/python/pyspark/pandas/tests/test_series_string.py @@ -241,7 +241,7 @@ class SeriesStringTest(PandasOnSparkTestCase, SQLTestUtils): def test_string_repeat(self): self.check_func(lambda x: x.str.repeat(repeats=3)) - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) def test_string_replace(self): diff --git a/python/pyspark/pandas/tests/test_utils.py b/python/pyspark/pandas/tests/test_utils.py index 2f4039ba20..a3595486e0 100644 --- a/python/pyspark/pandas/tests/test_utils.py +++ b/python/pyspark/pandas/tests/test_utils.py @@ -78,7 +78,7 @@ class UtilsTest(PandasOnSparkTestCase, SQLTestUtils): # This should fail because we are explicitly setting a non-boolean value koalas = "true" with self.assertRaisesRegex( - ValueError, 'For argument "koalas" expected type bool, received type str.' + TypeError, 'For argument "koalas" expected type bool, received type str.' ): validate_bool_kwarg(koalas, "koalas") diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py index fa98f14fe5..948a786c63 100644 --- a/python/pyspark/pandas/utils.py +++ b/python/pyspark/pandas/utils.py @@ -689,7 +689,7 @@ def validate_axis(axis=0, none_axis=0): def validate_bool_kwarg(value, arg_name): """ Ensures that argument passed in arg_name is of type bool. 
""" if not (isinstance(value, bool) or value is None): - raise ValueError( + raise TypeError( 'For argument "{}" expected type bool, received ' "type {}.".format(arg_name, type(value).__name__) ) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index a31e3d95d0..8fe263e152 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1134,12 +1134,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(col, str): col = Column(col) elif not isinstance(col, Column): - raise ValueError("col must be a string or a column, but got %r" % type(col)) + raise TypeError("col must be a string or a column, but got %r" % type(col)) if not isinstance(fractions, dict): - raise ValueError("fractions must be a dict but got %r" % type(fractions)) + raise TypeError("fractions must be a dict but got %r" % type(fractions)) for k, v in fractions.items(): if not isinstance(k, (float, int, str)): - raise ValueError("key must be float, int, or string, but got %r" % type(k)) + raise TypeError("key must be float, int, or string, but got %r" % type(k)) fractions[k] = float(v) col = col._jc seed = seed if seed is not None else random.randint(0, sys.maxsize) @@ -1225,7 +1225,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): +----+ """ if not isinstance(colName, str): - raise ValueError("colName should be provided as string") + raise TypeError("colName should be provided as string") jc = self._jdf.colRegex(colName) return Column(jc) @@ -2009,7 +2009,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): elif isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): - raise ValueError("subset should be a list or tuple of column names") + raise TypeError("subset should be a list or tuple of column names") if thresh is None: thresh = len(subset) if how == 'any' else 1 @@ -2067,7 +2067,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): +---+------+-------+ """ if not isinstance(value, (float, int, str, bool, dict)): - raise ValueError("value should be a float, int, string, bool or dict") + raise TypeError("value should be a float, int, string, bool or dict") # Note that bool validates isinstance(int), but we don't want to # convert bools to floats @@ -2083,7 +2083,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(subset, str): subset = [subset] elif not isinstance(subset, (list, tuple)): - raise ValueError("subset should be a list or tuple of column names") + raise TypeError("subset should be a list or tuple of column names") return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) @@ -2186,15 +2186,15 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): # Validate input types valid_types = (bool, float, int, str, list, tuple) if not isinstance(to_replace, valid_types + (dict, )): - raise ValueError( + raise TypeError( "to_replace should be a bool, float, int, string, list, tuple, or dict. " "Got {0}".format(type(to_replace))) if not isinstance(value, valid_types) and value is not None \ and not isinstance(to_replace, dict): - raise ValueError("If to_replace is not a dict, value should be " - "a bool, float, int, string, list, tuple or None. " - "Got {0}".format(type(value))) + raise TypeError("If to_replace is not a dict, value should be " + "a bool, float, int, string, list, tuple or None. 
" + "Got {0}".format(type(value))) if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): if len(to_replace) != len(value): @@ -2202,8 +2202,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): "Got {0} and {1}".format(len(to_replace), len(value))) if not (subset is None or isinstance(subset, (list, tuple, str))): - raise ValueError("subset should be a list or tuple of column names, " - "column name or None. Got {0}".format(type(subset))) + raise TypeError("subset should be a list or tuple of column names, " + "column name or None. Got {0}".format(type(subset))) # Reshape input arguments if necessary if isinstance(to_replace, (float, int, str)): @@ -2285,7 +2285,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): """ if not isinstance(col, (str, list, tuple)): - raise ValueError("col should be a string, list or tuple, but got %r" % type(col)) + raise TypeError("col should be a string, list or tuple, but got %r" % type(col)) isStr = isinstance(col, str) @@ -2296,11 +2296,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): for c in col: if not isinstance(c, str): - raise ValueError("columns should be strings, but got %r" % type(c)) + raise TypeError("columns should be strings, but got %r" % type(c)) col = _to_list(self._sc, col) if not isinstance(probabilities, (list, tuple)): - raise ValueError("probabilities should be a list or tuple") + raise TypeError("probabilities should be a list or tuple") if isinstance(probabilities, tuple): probabilities = list(probabilities) for p in probabilities: @@ -2308,8 +2308,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): raise ValueError("probabilities should be numerical (float, int) in [0,1].") probabilities = _to_list(self._sc, probabilities) - if not isinstance(relativeError, (float, int)) or relativeError < 0: - raise ValueError("relativeError should be numerical (float, int) >= 0.") + if not isinstance(relativeError, (float, int)): + raise TypeError("relativeError should be numerical (float, int)") + if relativeError < 0: + raise ValueError("relativeError should be >= 0.") relativeError = float(relativeError) jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) @@ -2334,9 +2336,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): The correlation method. Currently only supports "pearson" """ if not isinstance(col1, str): - raise ValueError("col1 should be a string.") + raise TypeError("col1 should be a string.") if not isinstance(col2, str): - raise ValueError("col2 should be a string.") + raise TypeError("col2 should be a string.") if not method: method = "pearson" if not method == "pearson": @@ -2359,9 +2361,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): The name of the second column """ if not isinstance(col1, str): - raise ValueError("col1 should be a string.") + raise TypeError("col1 should be a string.") if not isinstance(col2, str): - raise ValueError("col2 should be a string.") + raise TypeError("col2 should be a string.") return self._jdf.stat().cov(col1, col2) def crosstab(self, col1, col2): @@ -2386,9 +2388,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): of the :class:`DataFrame`. 
""" if not isinstance(col1, str): - raise ValueError("col1 should be a string.") + raise TypeError("col1 should be a string.") if not isinstance(col2, str): - raise ValueError("col2 should be a string.") + raise TypeError("col2 should be a string.") return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx) def freqItems(self, cols, support=None): @@ -2418,7 +2420,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): if isinstance(cols, tuple): cols = list(cols) if not isinstance(cols, list): - raise ValueError("cols must be a list or tuple of column names as strings.") + raise TypeError("cols must be a list or tuple of column names as strings.") if not support: support = 0.01 return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx) @@ -2453,7 +2455,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): [Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)] """ - assert isinstance(col, Column), "col should be Column" + if not isinstance(col, Column): + raise TypeError("col should be Column") return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) def withColumnRenamed(self, existing, new): @@ -2597,8 +2600,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): True """ if not isinstance(other, DataFrame): - raise ValueError("other parameter should be of DataFrame; however, got %s" - % type(other)) + raise TypeError("other parameter should be of DataFrame; however, got %s" + % type(other)) return self._jdf.sameSemantics(other._jdf) def semanticHash(self): diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index e3977e8185..3e961cba7e 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -319,7 +319,7 @@ class DataFrameTests(ReusedSQLTestCase): self.assertTupleEqual(row, (u'Alice', 20, None)) # should fail if subset is not list, tuple or None - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace({10: 11}, subset=1).first() @@ -329,7 +329,7 @@ class DataFrameTests(ReusedSQLTestCase): [(u'Alice', 10, 80.1)], schema).replace(["Alice", "Bob"], ["Eve"]).first() # should fail if when received unexpected type - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): from datetime import datetime self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace(datetime.now(), datetime.now()).first() @@ -818,7 +818,7 @@ class DataFrameTests(ReusedSQLTestCase): def test_same_semantics_error(self): with QuietTest(self.sc): - with self.assertRaisesRegex(ValueError, "should be of DataFrame.*int"): + with self.assertRaisesRegex(TypeError, "should be of DataFrame.*int"): self.spark.range(10).sameSemantics(1) def test_input_files(self): diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 112043e792..f49b7b2f35 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -279,9 +279,9 @@ class FunctionsTests(ReusedSQLTestCase): self.assertTrue(isinstance(aqt[1], list)) self.assertEqual(len(aqt[1]), 3) self.assertTrue(all(isinstance(q, float) for q in aqt[1])) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1)) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1)) - self.assertRaises(ValueError, lambda: 
df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1)) + self.assertRaises(TypeError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1)) + self.assertRaises(TypeError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1)) + self.assertRaises(TypeError, lambda: df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1)) def test_sorting_functions_with_column(self): from pyspark.sql import functions diff --git a/python/pyspark/taskcontext.py b/python/pyspark/taskcontext.py index 091c7636c8..1afbe63084 100644 --- a/python/pyspark/taskcontext.py +++ b/python/pyspark/taskcontext.py @@ -236,7 +236,7 @@ class BarrierTaskContext(TaskContext): This API is experimental """ if not isinstance(message, str): - raise ValueError("Argument `message` must be of type `str`") + raise TypeError("Argument `message` must be of type `str`") elif self._port is None or self._secret is None: raise Exception("Not supported to call barrier() before initialize " + "BarrierTaskContext.")
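
For reference, a minimal sketch of the behavior change described in the migration note above (this is an illustration, not part of the patch; it assumes a PySpark >= 3.2 session, and the toy DataFrame and values are hypothetical):

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,)], ["label"])

# ml: a params argument that is not a param map (dict) or a list/tuple of
# param maps now raises TypeError where 3.1 raised ValueError.
try:
    LogisticRegression().fit(df, params="not a param map")
except TypeError as e:
    print(e)  # Params must be either a param map or a list/tuple of param maps, ...

# sql: the same convention applies, e.g. a non-string column name passed to
# approxQuantile now raises TypeError as well.
try:
    df.stat.approxQuantile(123, [0.5], 0.1)
except TypeError as e:
    print(e)  # col should be a string, list or tuple, but got <class 'int'>

Note the split in approxQuantile: a wrongly typed relativeError still raises TypeError, while a correctly typed but negative relativeError keeps raising ValueError, matching the general rule that type mistakes are TypeError and value mistakes remain ValueError.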