[SPARK-35176][PYTHON] Standardize input validation error type

### What changes were proposed in this pull request?
This PR corrects the exception types raised when function input parameters fail validation because of an inappropriate type.
For ease of review, the change is split into 3 commits:
- Standardize input validation error type on sql
- Standardize input validation error type on ml
- Standardize input validation error type on pandas
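
All three commits apply the same mechanical pattern. As a minimal sketch (illustrative only; the real checks live in the pyspark sql/ml/pandas code changed below), a typical validation goes from ValueError to TypeError:

```python
def validate_periods(periods):
    # Before this PR, checks like this raised ValueError for a type mismatch.
    if not isinstance(periods, int):
        raise TypeError(
            "periods should be an int; however, got [%s]" % type(periods).__name__
        )
    return periods

validate_periods(3)      # OK
# validate_periods("3")  # raises TypeError (previously ValueError)
```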

### Why are the changes needed?
As the Python exception documentation [1] describes it, TypeError is "Raised when an operation or function is applied to an object of inappropriate type." However, many PySpark code paths raise ValueError in exactly this situation; this patch fixes them.

[1] https://docs.python.org/3/library/exceptions.html#TypeError
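
For reference, the standard library draws the same distinction; a short sketch (not code from this patch):

```python
import math

try:
    math.sqrt("4")    # wrong kind of object -> TypeError
except TypeError as e:
    print("TypeError:", e)

try:
    math.sqrt(-1.0)   # right type of object, unacceptable value -> ValueError
except ValueError as e:
    print("ValueError:", e)
```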

Note that this patch only fixes the existing incorrect exception types raised during input validation; the input validation decorator/framework mentioned in [SPARK-35176](https://issues.apache.org/jira/browse/SPARK-35176) will be submitted in a separate patch.
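
For context only, here is a hypothetical sketch of what such a decorator could look like; the name `_require` and the example function are invented for illustration and do not prescribe the eventual design:

```python
from functools import wraps

def _require(param_name, expected_type):
    # Hypothetical validation decorator; not part of this PR.
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            value = kwargs.get(param_name)
            if value is not None and not isinstance(value, expected_type):
                raise TypeError("%s must be %s, but got %s."
                                % (param_name, expected_type.__name__, type(value)))
            return func(*args, **kwargs)
        return wrapper
    return decorator

@_require("periods", int)
def shift(periods=1):
    return periods

shift(periods=2)      # OK
# shift(periods="2")  # raises TypeError
```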

### Does this PR introduce _any_ user-facing change?
Yes. The affected methods now raise the correct TypeError instead of ValueError when an input has an inappropriate type.
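
For callers, this means exception handlers may need updating. A sketch of the impact (assuming an existing SparkSession named `spark`); code that must run on both 3.1 and 3.2 can catch both types during the transition:

```python
df = spark.range(10)  # assumes a live SparkSession `spark`
try:
    df.sameSemantics(1)  # not a DataFrame: ValueError in 3.1, TypeError in 3.2
except (ValueError, TypeError):
    print("sameSemantics requires a DataFrame argument")
```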

### How was this patch tested?
Existing test cases and newly added unit tests.

Closes #32368 from Yikun/SPARK-35176.

Authored-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Yikun Jiang 2021-05-03 15:34:24 +09:00, committed by HyukjinKwon
commit 44b7931936 (parent 2a8d7ed4bf)
37 changed files with 234 additions and 164 deletions


@ -25,6 +25,7 @@ This page describes the migration guide specific to PySpark.
.. toctree::
:maxdepth: 2
pyspark_3.1_to_3.2
pyspark_2.4_to_3.0
pyspark_2.3_to_2.4
pyspark_2.3.0_to_2.3.1_above


@ -0,0 +1,23 @@
.. Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
.. http://www.apache.org/licenses/LICENSE-2.0
.. Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
=================================
Upgrading from PySpark 3.1 to 3.2
=================================
* In Spark 3.2, the PySpark methods from the sql, ml, and spark_on_pandas modules raise ``TypeError`` instead of ``ValueError`` when they are applied to a param of inappropriate type.


@ -160,8 +160,8 @@ class Estimator(Params, metaclass=ABCMeta):
else:
return self._fit(dataset)
else:
raise ValueError("Params must be either a param map or a list/tuple of param maps, "
"but got %s." % type(params))
raise TypeError("Params must be either a param map or a list/tuple of param maps, "
"but got %s." % type(params))
@inherit_doc
@ -216,7 +216,7 @@ class Transformer(Params, metaclass=ABCMeta):
else:
return self._transform(dataset)
else:
raise ValueError("Params must be a param map but got %s." % type(params))
raise TypeError("Params must be a param map but got %s." % type(params))
@inherit_doc


@ -759,7 +759,7 @@ class LinearSVCModel(_JavaClassificationModel, _LinearSVCParams, JavaMLWritable,
Test dataset to evaluate model on.
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_lsvc_summary = self._call_java("evaluate", dataset)
return LinearSVCSummary(java_lsvc_summary)
@ -1263,7 +1263,7 @@ class LogisticRegressionModel(_JavaProbabilisticClassificationModel, _LogisticRe
Test dataset to evaluate model on.
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_blr_summary = self._call_java("evaluate", dataset)
if self.numClasses <= 2:
return BinaryLogisticRegressionSummary(java_blr_summary)
@ -1869,7 +1869,7 @@ class RandomForestClassificationModel(_TreeEnsembleModel, _JavaProbabilisticClas
Test dataset to evaluate model on.
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_rf_summary = self._call_java("evaluate", dataset)
if self.numClasses <= 2:
return BinaryRandomForestClassificationSummary(java_rf_summary)
@ -2722,7 +2722,7 @@ class MultilayerPerceptronClassificationModel(_JavaProbabilisticClassificationMo
Test dataset to evaluate model on.
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_mlp_summary = self._call_java("evaluate", dataset)
return MultilayerPerceptronClassificationSummary(java_mlp_summary)
@ -3521,7 +3521,7 @@ class FMClassificationModel(_JavaProbabilisticClassificationModel, _Factorizatio
Test dataset to evaluate model on.
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_fm_summary = self._call_java("evaluate", dataset)
return FMClassificationSummary(java_fm_summary)


@ -83,7 +83,7 @@ class Evaluator(Params, metaclass=ABCMeta):
else:
return self._evaluate(dataset)
else:
raise ValueError("Params must be a param map but got %s." % type(params))
raise TypeError("Params must be a param map but got %s." % type(params))
@since("1.5.0")
def isLargerBetter(self):


@ -435,7 +435,7 @@ class Params(Identifiable, metaclass=ABCMeta):
elif isinstance(param, str):
return self.getParam(param)
else:
raise ValueError("Cannot resolve %r as a param." % param)
raise TypeError("Cannot resolve %r as a param." % param)
def _testOwnParam(self, param_parent, param_name):
"""


@ -371,7 +371,7 @@ class LinearRegressionModel(_JavaRegressionModel, _LinearRegressionParams, Gener
instance of :py:class:`pyspark.sql.DataFrame`
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_lr_summary = self._call_java("evaluate", dataset)
return LinearRegressionSummary(java_lr_summary)
@ -2294,7 +2294,7 @@ class GeneralizedLinearRegressionModel(_JavaRegressionModel, _GeneralizedLinearR
instance of :py:class:`pyspark.sql.DataFrame`
"""
if not isinstance(dataset, DataFrame):
raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
java_glr_summary = self._call_java("evaluate", dataset)
return GeneralizedLinearRegressionSummary(java_glr_summary)


@ -19,7 +19,15 @@ import unittest
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.testing.mlutils import MockDataset, MockEstimator, MockUnaryTransformer, \
SparkSessionTestCase
MockTransformer, SparkSessionTestCase
class TransformerTests(unittest.TestCase):
def test_transform_invalid_type(self):
transformer = MockTransformer()
data = MockDataset()
self.assertRaises(TypeError, transformer.transform, data, "")
class UnaryTransformerTests(SparkSessionTestCase):
@ -52,13 +60,18 @@ class UnaryTransformerTests(SparkSessionTestCase):
class EstimatorTest(unittest.TestCase):
def setUp(self):
self.estimator = MockEstimator()
self.data = MockDataset()
def test_fit_invalid_params(self):
invalid_type_parms = ""
self.assertRaises(TypeError, self.estimator.fit, self.data, invalid_type_parms)
def testDefaultFitMultiple(self):
N = 4
data = MockDataset()
estimator = MockEstimator()
params = [{estimator.fake: i} for i in range(N)]
modelIter = estimator.fitMultiple(data, params)
params = [{self.estimator.fake: i} for i in range(N)]
modelIter = self.estimator.fitMultiple(self.data, params)
indexList = []
for index, model in modelIter:
self.assertEqual(model.getFake(), index)


@ -27,6 +27,12 @@ from pyspark.testing.mlutils import SparkSessionTestCase
class EvaluatorTests(SparkSessionTestCase):
def test_evaluate_invalid_type(self):
evaluator = RegressionEvaluator(metricName="r2")
df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
invalid_type = ""
self.assertRaises(TypeError, evaluator.evaluate, df, invalid_type)
def test_java_params(self):
"""
This tests a bug fixed by SPARK-18274 which causes multiple copies


@ -30,6 +30,7 @@ from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexT
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed
from pyspark.ml.regression import LinearRegressionModel, GeneralizedLinearRegressionModel
from pyspark.ml.wrapper import JavaParams
from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase
@ -197,6 +198,10 @@ class ParamTests(SparkSessionTestCase):
self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter)
self.assertRaises(AttributeError, lambda: testParams._resolveParam(u""))
# Invalid type
invalid_type = 1
self.assertRaises(TypeError, testParams._resolveParam, invalid_type)
def test_params(self):
testParams = TestParams()
maxIter = testParams.maxIter
@ -332,6 +337,16 @@ class ParamTests(SparkSessionTestCase):
self.assertFalse(binarizer.isSet(binarizer.outputCol))
self.assertEqual(result[0][0], 1.0)
def test_lr_evaluate_invaild_type(self):
lr = LinearRegressionModel()
invalid_type = ""
self.assertRaises(TypeError, lr.evaluate, invalid_type)
def test_glr_evaluate_invaild_type(self):
glr = GeneralizedLinearRegressionModel()
invalid_type = ""
self.assertRaises(TypeError, glr.evaluate, invalid_type)
class DefaultValuesTests(PySparkTestCase):
"""


@ -465,8 +465,7 @@ class RowMatrix(DistributedMatrix):
[DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])]
"""
if not isinstance(matrix, DenseMatrix):
raise ValueError("Only multiplication with DenseMatrix "
"is supported.")
raise TypeError("Only multiplication with DenseMatrix is supported.")
j_model = self._java_matrix_wrapper.call("multiply", matrix)
return RowMatrix(j_model)
@ -854,8 +853,7 @@ class IndexedRowMatrix(DistributedMatrix):
[IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])]
"""
if not isinstance(matrix, DenseMatrix):
raise ValueError("Only multiplication with DenseMatrix "
"is supported.")
raise TypeError("Only multiplication with DenseMatrix is supported.")
return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix))


@ -26,7 +26,7 @@ from pyspark.mllib.linalg import ( # type: ignore[attr-defined]
Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,
DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
)
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix, IndexedRow
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import Row
from pyspark.testing.mllibutils import MLlibTestCase
@ -452,6 +452,17 @@ class VectorUDTTests(MLlibTestCase):
with self.assertRaises(IllegalArgumentException):
IndexedRowMatrix(df.drop("_1"))
def test_row_matrix_invalid_type(self):
rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
invalid_type = ""
matrix = RowMatrix(rows)
self.assertRaises(TypeError, matrix.multiply, invalid_type)
irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]),
IndexedRow(1, [4, 5, 6])])
imatrix = IndexedRowMatrix(irows)
self.assertRaises(TypeError, imatrix.multiply, invalid_type)
class MatrixUDTTests(MLlibTestCase):


@ -1498,7 +1498,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
def _shift(self, periods, fill_value, *, part_cols=()):
if not isinstance(periods, int):
raise ValueError("periods should be an int; however, got [%s]" % type(periods).__name__)
raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__)
col = self.spark.column
window = (
@ -1828,7 +1828,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
)
"""
if not is_list_like(indices) or isinstance(indices, (dict, set)):
raise ValueError("`indices` must be a list-like except dict or set")
raise TypeError("`indices` must be a list-like except dict or set")
if isinstance(self, ps.Series):
return cast(ps.Series, self.iloc[indices])
else:


@ -70,7 +70,7 @@ class Option:
>>> option.validate('abc') # doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: The value for option 'option.name' was <class 'str'>;
TypeError: The value for option 'option.name' was <class 'str'>;
however, expected types are [(<class 'float'>, <class 'int'>)].
>>> option.validate(-1.1)
@ -101,7 +101,7 @@ class Option:
Validate the given value and throw an exception with related information such as key.
"""
if not isinstance(v, self.types):
raise ValueError(
raise TypeError(
"The value for option '%s' was %s; however, expected types are "
"[%s]." % (self.key, type(v), str(self.types))
)


@ -771,7 +771,7 @@ class DataFrame(Frame, Generic[T]):
if not isinstance(other, DataFrame) and (
isinstance(other, IndexOpsMixin) or is_sequence(other)
):
raise ValueError(
raise TypeError(
"%s with a sequence is currently not supported; "
"however, got %s." % (op, type(other).__name__)
)
@ -2936,7 +2936,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
from pyspark.pandas.series import first_series
if not is_name_like_value(key):
raise ValueError("'key' should be a scalar value or tuple that contains scalar values")
raise TypeError("'key' should be a scalar value or tuple that contains scalar values")
if level is not None and is_name_like_tuple(key):
raise KeyError(key)
@ -3301,7 +3301,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
]
kdf[tmp_cond_col_names] = cond
else:
raise ValueError("type of cond must be a DataFrame or Series")
raise TypeError("type of cond must be a DataFrame or Series")
tmp_other_col_names = [
tmp_other_col_name(name_like_string(label)) for label in self._internal.column_labels
@ -3431,7 +3431,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
from pyspark.pandas.series import Series
if not isinstance(cond, (DataFrame, Series)):
raise ValueError("type of cond must be a DataFrame or Series")
raise TypeError("type of cond must be a DataFrame or Series")
cond_inversed = cond._apply_series_op(lambda kser: ~kser)
return self.where(cond_inversed, other)
@ -3997,7 +3997,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
assert allow_duplicates is False
if not is_name_like_value(column):
raise ValueError(
raise TypeError(
'"column" should be a scalar value or tuple that contains scalar values'
)
@ -4289,7 +4289,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
elif isinstance(decimals, int):
decimals = {k: decimals for k in self._internal.column_labels}
else:
raise ValueError("decimals must be an integer, a dict-like or a Series")
raise TypeError("decimals must be an integer, a dict-like or a Series")
def op(kser):
label = kser._column_label
@ -5660,7 +5660,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
will output the original DataFrame, simply ignoring the incompatible types.
"""
if is_list_like(lower) or is_list_like(upper):
raise ValueError(
raise TypeError(
"List-like value are not supported for 'lower' and 'upper' at the " + "moment"
)
@ -5941,12 +5941,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
small 5.5 2.333333 17 13
"""
if not is_name_like_value(columns):
raise ValueError("columns should be one column name.")
raise TypeError("columns should be one column name.")
if not is_name_like_value(values) and not (
isinstance(values, list) and all(is_name_like_value(v) for v in values)
):
raise ValueError("values should be one column or list of columns.")
raise TypeError("values should be one column or list of columns.")
if not isinstance(aggfunc, str) and (
not isinstance(aggfunc, dict)
@ -5954,7 +5954,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
is_name_like_value(key) and isinstance(value, str) for key, value in aggfunc.items()
)
):
raise ValueError(
raise TypeError(
"aggfunc must be a dict mapping from column name "
"to aggregate functions (string)."
)
@ -6031,7 +6031,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
.agg(*agg_cols)
)
else:
raise ValueError("index should be a None or a list of columns.")
raise TypeError("index should be a None or a list of columns.")
if fill_value is not None and isinstance(fill_value, (int, float)):
sdf = sdf.fillna(fill_value)
@ -7940,7 +7940,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
3 3 4
"""
if isinstance(other, ps.Series):
raise ValueError("DataFrames.append() does not support appending Series to DataFrames")
raise TypeError("DataFrames.append() does not support appending Series to DataFrames")
if sort:
raise NotImplementedError("The 'sort' parameter is currently not supported")
@ -10726,7 +10726,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
raise NotImplementedError('axis should be either 0 or "index" currently.')
if not isinstance(accuracy, int):
raise ValueError(
raise TypeError(
"accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
)
@ -10735,7 +10735,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
for v in q if isinstance(q, list) else [q]:
if not isinstance(v, float):
raise ValueError(
raise TypeError(
"q must be a float or an array of floats; however, [%s] found." % type(v)
)
if v < 0.0 or v > 1.0:
@ -10904,9 +10904,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
0 1 10 10
"""
if isinstance(self.columns, pd.MultiIndex):
raise ValueError("Doesn't support for MultiIndex columns")
raise TypeError("Doesn't support for MultiIndex columns")
if not isinstance(expr, str):
raise ValueError(
raise TypeError(
"expr must be a string to be evaluated, {} given".format(type(expr).__name__)
)
inplace = validate_bool_kwarg(inplace, "inplace")
@ -11012,7 +11012,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
"""
axis = validate_axis(axis)
if not is_list_like(indices) or isinstance(indices, (dict, set)):
raise ValueError("`indices` must be a list-like except dict or set")
raise TypeError("`indices` must be a list-like except dict or set")
if axis == 0:
return cast(DataFrame, self.iloc[indices, :])
else:
@ -11098,7 +11098,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
from pyspark.pandas.series import first_series
if isinstance(self.columns, pd.MultiIndex):
raise ValueError("`eval` is not supported for multi-index columns")
raise TypeError("`eval` is not supported for multi-index columns")
inplace = validate_bool_kwarg(inplace, "inplace")
should_return_series = False
series_name = None
@ -11179,7 +11179,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
from pyspark.pandas.series import Series
if not is_name_like_value(column):
raise ValueError("column must be a scalar")
raise TypeError("column must be a scalar")
kdf = DataFrame(self._internal.resolved_copy) # type: "DataFrame"
kser = kdf[column]


@ -1895,7 +1895,7 @@ class Frame(object, metaclass=ABCMeta):
numeric_only = True
if not isinstance(accuracy, int):
raise ValueError(
raise TypeError(
"accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
)


@ -2416,7 +2416,7 @@ class GroupBy(object, metaclass=ABCMeta):
Name: b, dtype: float64
"""
if not isinstance(accuracy, int):
raise ValueError(
raise TypeError(
"accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
)


@ -2076,7 +2076,7 @@ class Index(IndexOpsMixin):
MultiIndex([], )
"""
if not isinstance(repeats, int):
raise ValueError(
raise TypeError(
"`repeats` argument must be integer, but got {}".format(type(repeats).__name__)
)
elif repeats < 0:


@ -342,7 +342,7 @@ class MultiIndex(Index):
if names is None:
names = df._internal.column_labels
elif not is_list_like(names):
raise ValueError("Names should be list-like for a MultiIndex")
raise TypeError("Names should be list-like for a MultiIndex")
else:
names = [name if is_name_like_tuple(name) else (name,) for name in names]


@ -126,7 +126,7 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series,
elif isinstance(pobj, pd.Index):
return DataFrame(pd.DataFrame(index=pobj)).index
else:
raise ValueError("Unknown data type: {}".format(type(pobj).__name__))
raise TypeError("Unknown data type: {}".format(type(pobj).__name__))
_range = range # built-in range
@ -2770,7 +2770,7 @@ def broadcast(obj) -> DataFrame:
...
"""
if not isinstance(obj, DataFrame):
raise ValueError("Invalid type : expected DataFrame got {}".format(type(obj).__name__))
raise TypeError("Invalid type : expected DataFrame got {}".format(type(obj).__name__))
return DataFrame(
obj._internal.with_new_sdf(F.broadcast(obj._internal.resolved_copy.spark_frame))
)


@ -40,7 +40,7 @@ class TopNPlotBase:
if isinstance(data, (Series, DataFrame)):
data = data.head(max_rows + 1).to_pandas()
else:
raise ValueError("Only DataFrame and Series are supported for plotting.")
raise TypeError("Only DataFrame and Series are supported for plotting.")
self.partial = False
if len(data) > max_rows:
@ -80,7 +80,7 @@ class SampledPlotBase:
sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas()
else:
raise ValueError("Only DataFrame and Series are supported for plotting.")
raise TypeError("Only DataFrame and Series are supported for plotting.")
def set_result_text(self, ax):
assert hasattr(self, "fraction")


@ -2016,7 +2016,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
original Series, simply ignoring the incompatible types.
"""
if is_list_like(lower) or is_list_like(upper):
raise ValueError(
raise TypeError(
"List-like value are not supported for 'lower' and 'upper' at the " + "moment"
)
@ -3182,7 +3182,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
elif isinstance(func, str):
return getattr(self, func)()
else:
raise ValueError("func must be a string or list of strings")
raise TypeError("func must be a string or list of strings")
agg = aggregate
@ -3345,7 +3345,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Name: x, dtype: float64
"""
if not isinstance(decimals, int):
raise ValueError("decimals must be an integer")
raise TypeError("decimals must be an integer")
scol = F.round(self.spark.column, decimals)
return self._with_new_scol(scol)
@ -3402,12 +3402,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
).rename(self.name)
else:
if not isinstance(accuracy, int):
raise ValueError(
raise TypeError(
"accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
)
if not isinstance(q, float):
raise ValueError(
raise TypeError(
"q must be a float or an array of floats; however, [%s] found." % type(q)
)
if q < 0.0 or q > 1.0:
@ -3639,7 +3639,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
def _diff(self, periods, *, part_cols=()):
if not isinstance(periods, int):
raise ValueError("periods should be an int; however, got [%s]" % type(periods).__name__)
raise TypeError("periods should be an int; however, got [%s]" % type(periods).__name__)
window = (
Window.partitionBy(*part_cols)
.orderBy(NATURAL_ORDER_COLUMN_NAME)
@ -3984,7 +3984,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
dtype: float64
"""
if not is_name_like_value(item):
raise ValueError("'key' should be string or tuple that contains strings")
raise TypeError("'key' should be string or tuple that contains strings")
if not is_name_like_tuple(item):
item = (item,)
if self._internal.index_level < len(item):
@ -4328,7 +4328,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
if to_replace is None:
return self.fillna(method="ffill")
if not isinstance(to_replace, (str, list, tuple, dict, int, float)):
raise ValueError("'to_replace' should be one of str, list, tuple, dict, int, float")
raise TypeError("'to_replace' should be one of str, list, tuple, dict, int, float")
if regex:
raise NotImplementedError("replace currently not support for regex")
to_replace = list(to_replace) if isinstance(to_replace, tuple) else to_replace
@ -4438,7 +4438,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
>>> reset_option("compute.ops_on_diff_frames")
"""
if not isinstance(other, Series):
raise ValueError("'other' must be a Series")
raise TypeError("'other' must be a Series")
combined = combine_frames(self._kdf, other._kdf, how="leftouter")
@ -4813,7 +4813,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
dtype: float64
"""
if not isinstance(other, ps.Series):
raise ValueError("`combine_first` only allows `Series` for parameter `other`")
raise TypeError("`combine_first` only allows `Series` for parameter `other`")
if same_anchor(self, other):
this = self.spark.column
that = other.spark.column
@ -4977,7 +4977,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Series([], dtype: int64)
"""
if not isinstance(repeats, (int, Series)):
raise ValueError(
raise TypeError(
"`repeats` argument must be integer or Series, but got {}".format(type(repeats))
)


@ -1489,7 +1489,7 @@ class StringMethods(object):
dtype: object
"""
if not isinstance(repeats, int):
raise ValueError("repeats expects an int parameter")
raise TypeError("repeats expects an int parameter")
return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats))
def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True) -> "ps.Series":


@ -1356,7 +1356,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq((kidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values())
self.assertRaises(ValueError, lambda: kidx.repeat(-1))
self.assertRaises(ValueError, lambda: kidx.repeat("abc"))
self.assertRaises(TypeError, lambda: kidx.repeat("abc"))
pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
kmidx = ps.from_pandas(pmidx)
@ -1365,7 +1365,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq(kmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True)
self.assertRaises(ValueError, lambda: kmidx.repeat(-1))
self.assertRaises(ValueError, lambda: kmidx.repeat("abc"))
self.assertRaises(TypeError, lambda: kmidx.repeat("abc"))
def test_unique(self):
pidx = pd.Index(["a", "b", "a"])
@ -1618,14 +1618,14 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
)
# Checking the type of indices.
self.assertRaises(ValueError, lambda: kidx.take(1))
self.assertRaises(ValueError, lambda: kidx.take("1"))
self.assertRaises(ValueError, lambda: kidx.take({1, 2}))
self.assertRaises(ValueError, lambda: kidx.take({1: None, 2: None}))
self.assertRaises(ValueError, lambda: kmidx.take(1))
self.assertRaises(ValueError, lambda: kmidx.take("1"))
self.assertRaises(ValueError, lambda: kmidx.take({1, 2}))
self.assertRaises(ValueError, lambda: kmidx.take({1: None, 2: None}))
self.assertRaises(TypeError, lambda: kidx.take(1))
self.assertRaises(TypeError, lambda: kidx.take("1"))
self.assertRaises(TypeError, lambda: kidx.take({1, 2}))
self.assertRaises(TypeError, lambda: kidx.take({1: None, 2: None}))
self.assertRaises(TypeError, lambda: kmidx.take(1))
self.assertRaises(TypeError, lambda: kmidx.take("1"))
self.assertRaises(TypeError, lambda: kmidx.take({1, 2}))
self.assertRaises(TypeError, lambda: kmidx.take({1: None, 2: None}))
def test_index_get_level_values(self):
pidx = pd.Index([1, 2, 3], name="ks")
@ -1998,7 +1998,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
with self.assertRaisesRegex(TypeError, err_msg):
ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]})
self.assertRaises(ValueError, lambda: ps.MultiIndex.from_frame(kdf, names="ab"))
self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(kdf, names="ab"))
# non-string names
self.assert_eq(


@ -77,16 +77,16 @@ class ConfigTest(PandasOnSparkTestCase):
self.assertEqual(ps.get_option("test.config.int.none"), None)
def test_different_types(self):
with self.assertRaisesRegex(ValueError, "was <class 'int'>"):
with self.assertRaisesRegex(TypeError, "was <class 'int'>"):
ps.set_option("test.config.list", 1)
with self.assertRaisesRegex(ValueError, "however, expected types are"):
with self.assertRaisesRegex(TypeError, "however, expected types are"):
ps.set_option("test.config.float", "abc")
with self.assertRaisesRegex(ValueError, "[<class 'int'>]"):
with self.assertRaisesRegex(TypeError, "[<class 'int'>]"):
ps.set_option("test.config.int", "abc")
with self.assertRaisesRegex(ValueError, "(<class 'int'>, <class 'NoneType'>)"):
with self.assertRaisesRegex(TypeError, "(<class 'int'>, <class 'NoneType'>)"):
ps.set_option("test.config.int.none", "abc")
def test_check_func(self):


@ -214,7 +214,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
ValueError, "cannot insert b, already exists", lambda: kdf.insert(1, "b", 10)
)
self.assertRaisesRegex(
ValueError,
TypeError,
'"column" should be a scalar value or tuple that contains scalar values',
lambda: kdf.insert(0, list("abc"), kser),
)
@ -2206,7 +2206,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Assert appending a Series fails
msg = "DataFrames.append() does not support appending Series to DataFrames"
with self.assertRaises(ValueError, msg=msg):
with self.assertRaises(TypeError, msg=msg):
kdf.append(kdf["A"])
# Assert using the sort parameter raises an exception
@ -2286,9 +2286,9 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# Assert list-like values are not accepted for 'lower' and 'upper'
msg = "List-like value are not supported for 'lower' and 'upper' at the moment"
with self.assertRaises(ValueError, msg=msg):
with self.assertRaises(TypeError, msg=msg):
kdf.clip(lower=[1])
with self.assertRaises(ValueError, msg=msg):
with self.assertRaises(TypeError, msg=msg):
kdf.clip(upper=[1])
# Assert no lower or upper
@ -2323,7 +2323,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
)
self.assertRaisesRegex(
ValueError,
TypeError,
"add with a sequence is currently not supported",
lambda: ps.range(10).add(ps.range(10).id),
)
@ -3060,7 +3060,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assertRaises(KeyError, lambda: kdf.pivot_table(index=["c"], columns="a", values=5))
msg = "index should be a None or a list of columns."
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kdf.pivot_table(index="c", columns="a", values="b")
msg = "pivot_table doesn't support aggfunc as dict and without index."
@ -3068,7 +3068,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
kdf.pivot_table(columns="a", values=["b", "e"], aggfunc={"b": "mean", "e": "sum"})
msg = "columns should be one column name."
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kdf.pivot_table(columns=["a"], values=["b"], aggfunc={"b": "mean", "e": "sum"})
msg = "Columns in aggfunc must be the same as values."
@ -3843,7 +3843,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pdf.round({"A": 1, "D": 2}), kdf.round({"A": 1, "D": 2}))
self.assert_eq(pdf.round(pser), kdf.round(kser))
msg = "decimals must be an integer, a dict-like or a Series"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kdf.round(1.5)
# multi-index columns
@ -3894,7 +3894,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
)
self.assert_eq(pdf1, kdf.shift(periods=3, fill_value=0))
msg = "should be an int"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kdf.shift(1.5)
# multi-index columns
@ -3916,7 +3916,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pdf.diff().sum().astype(int), kdf.diff().sum())
msg = "should be an int"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kdf.diff(1.5)
msg = 'axis should be either 0 or "index" currently.'
with self.assertRaisesRegex(NotImplementedError, msg):
@ -4491,11 +4491,11 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
NotImplementedError, 'axis should be either 0 or "index" currently.'
):
kdf.quantile(0.5, axis=1)
with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
kdf.quantile(accuracy="a")
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
kdf.quantile(q="a")
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
kdf.quantile(q=["a"])
self.assert_eq(kdf.quantile(0.5, numeric_only=False), pdf.quantile(0.5, numeric_only=False))
@ -4541,13 +4541,13 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
def test_where(self):
kdf = ps.from_pandas(self.pdf)
with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"):
with self.assertRaisesRegex(TypeError, "type of cond must be a DataFrame or Series"):
kdf.where(1)
def test_mask(self):
kdf = ps.from_pandas(self.pdf)
with self.assertRaisesRegex(ValueError, "type of cond must be a DataFrame or Series"):
with self.assertRaisesRegex(TypeError, "type of cond must be a DataFrame or Series"):
kdf.mask(1)
def test_query(self):
@ -4575,7 +4575,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
invalid_exprs = (1, 1.0, (exprs[0],), [exprs[0]])
for expr in invalid_exprs:
with self.assertRaisesRegex(
ValueError,
TypeError,
"expr must be a string to be evaluated, {} given".format(type(expr).__name__),
):
kdf.query(expr)
@ -4584,7 +4584,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
invalid_inplaces = (1, 0, "True", "False")
for inplace in invalid_inplaces:
with self.assertRaisesRegex(
ValueError,
TypeError,
'For argument "inplace" expected type bool, received type {}.'.format(
type(inplace).__name__
),
@ -4594,7 +4594,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# doesn't support for MultiIndex columns
columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")])
kdf.columns = columns
with self.assertRaisesRegex(ValueError, "Doesn't support for MultiIndex columns"):
with self.assertRaisesRegex(TypeError, "Doesn't support for MultiIndex columns"):
kdf.query("('A', 'Z') > ('B', 'X')")
def test_take(self):
@ -4683,10 +4683,10 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
)
# Checking the type of indices.
self.assertRaises(ValueError, lambda: kdf.take(1))
self.assertRaises(ValueError, lambda: kdf.take("1"))
self.assertRaises(ValueError, lambda: kdf.take({1, 2}))
self.assertRaises(ValueError, lambda: kdf.take({1: None, 2: None}))
self.assertRaises(TypeError, lambda: kdf.take(1))
self.assertRaises(TypeError, lambda: kdf.take("1"))
self.assertRaises(TypeError, lambda: kdf.take({1, 2}))
self.assertRaises(TypeError, lambda: kdf.take({1: None, 2: None}))
def test_axes(self):
pdf = self.pdf
@ -4739,7 +4739,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
# doesn't support for multi-index columns
columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b"), ("z", "c")])
kdf.columns = columns
self.assertRaises(ValueError, lambda: kdf.eval("x.a + y.b"))
self.assertRaises(TypeError, lambda: kdf.eval("x.a + y.b"))
@unittest.skipIf(not have_tabulate, tabulate_requirement_message)
def test_to_markdown(self):
@ -4972,7 +4972,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kdf.explode("A").index.name, expected_result1.index.name)
self.assert_eq(kdf.explode("A").columns.name, expected_result1.columns.name)
self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"]))
self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"]))
# MultiIndex
midx = pd.MultiIndex.from_tuples(
@ -4997,7 +4997,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kdf.explode("A").index.names, expected_result1.index.names)
self.assert_eq(kdf.explode("A").columns.name, expected_result1.columns.name)
self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"]))
self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"]))
# MultiIndex columns
columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X")], names=["column1", "column2"])
@ -5022,7 +5022,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kdf.A.explode("Z"), expected_result3, almost=True)
self.assertRaises(ValueError, lambda: kdf.explode(["A", "B"]))
self.assertRaises(TypeError, lambda: kdf.explode(["A", "B"]))
self.assertRaises(ValueError, lambda: kdf.explode("A"))
def test_spark_schema(self):


@ -2643,7 +2643,7 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
)
self.assert_eq(expected_result, kdf.groupby("a")["b"].median().sort_index())
with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
kdf.groupby("a").median(accuracy="a")
def test_tail(self):


@ -48,7 +48,7 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kmidx, pmidx)
expected_error_message = "Unknown data type: {}".format(type(kidx).__name__)
with self.assertRaisesRegex(ValueError, expected_error_message):
with self.assertRaisesRegex(TypeError, expected_error_message):
ps.from_pandas(kidx)
def test_to_datetime(self):
@ -303,7 +303,7 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
expected_error_message = "Invalid type : expected DataFrame got {}".format(
type(kser).__name__
)
with self.assertRaisesRegex(ValueError, expected_error_message):
with self.assertRaisesRegex(TypeError, expected_error_message):
ps.broadcast(kser)
def test_get_index_map(self):


@ -622,7 +622,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index()
)
with self.assertRaisesRegex(
ValueError, "`combine_first` only allows `Series` for parameter `other`"
TypeError, "`combine_first` only allows `Series` for parameter `other`"
):
kser1.combine_first(50)


@ -1109,9 +1109,9 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
# Assert list-like values are not accepted for 'lower' and 'upper'
msg = "List-like value are not supported for 'lower' and 'upper' at the moment"
with self.assertRaises(ValueError, msg=msg):
with self.assertRaises(TypeError, msg=msg):
kser.clip(lower=[1])
with self.assertRaises(ValueError, msg=msg):
with self.assertRaises(TypeError, msg=msg):
kser.clip(upper=[1])
# Assert no lower or upper
@ -1324,7 +1324,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pser.cumprod(skipna=False).astype(int), kser.cumprod(skipna=False))
def test_median(self):
with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")
def test_rank(self):
@ -1347,7 +1347,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
kser = ps.from_pandas(pser)
self.assert_eq(pser.round(2), kser.round(2))
msg = "decimals must be an integer"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kser.round(1.5)
def test_quantile(self):
@ -1357,11 +1357,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kser.quantile(0.5), pser.quantile(0.5))
self.assert_eq(kser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75]))
with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(accuracy="a")
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q="a")
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
@ -1433,7 +1433,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kser.shift(periods=2), pser.shift(periods=2))
else:
self.assert_eq(kser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0))
with self.assertRaisesRegex(ValueError, "periods should be an int; however"):
with self.assertRaisesRegex(TypeError, "periods should be an int; however"):
kser.shift(periods=1.5)
def test_diff(self):
@ -1602,7 +1602,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
pser = pd.Series([10, 20, 15, 30, 45], name="x")
kser = ps.Series(pser)
msg = "func must be a string or list of strings"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kser.aggregate({"x": ["min", "max"]})
msg = (
"If the given function is a list, it " "should only contains function names as strings."
@ -1692,7 +1692,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50)))
msg = "'to_replace' should be one of str, list, tuple, dict, int, float"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kser.replace(ps.range(5))
msg = "Replacement lists must match in length. Expecting 3 got 2"
with self.assertRaisesRegex(ValueError, msg):
@ -1734,7 +1734,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
kser = ps.Series(pser)
msg = "'other' must be a Series"
with self.assertRaisesRegex(ValueError, msg):
with self.assertRaisesRegex(TypeError, msg):
kser.update(10)
def test_where(self):
@ -1883,7 +1883,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index())
self.assertRaises(ValueError, lambda: kser.repeat(-1))
self.assertRaises(ValueError, lambda: kser.repeat("abc"))
self.assertRaises(TypeError, lambda: kser.repeat("abc"))
pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3))
kdf = ps.from_pandas(pdf)
@ -1904,10 +1904,10 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
)
# Checking the type of indices.
self.assertRaises(ValueError, lambda: kser.take(1))
self.assertRaises(ValueError, lambda: kser.take("1"))
self.assertRaises(ValueError, lambda: kser.take({1, 2}))
self.assertRaises(ValueError, lambda: kser.take({1: None, 2: None}))
self.assertRaises(TypeError, lambda: kser.take(1))
self.assertRaises(TypeError, lambda: kser.take("1"))
self.assertRaises(TypeError, lambda: kser.take({1, 2}))
self.assertRaises(TypeError, lambda: kser.take({1: None, 2: None}))
def test_divmod(self):
pser = pd.Series([100, None, 300, None, 500], name="Koalas")


@ -241,7 +241,7 @@ class SeriesStringTest(PandasOnSparkTestCase, SQLTestUtils):
def test_string_repeat(self):
self.check_func(lambda x: x.str.repeat(repeats=3))
with self.assertRaises(ValueError):
with self.assertRaises(TypeError):
self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
def test_string_replace(self):


@ -78,7 +78,7 @@ class UtilsTest(PandasOnSparkTestCase, SQLTestUtils):
# This should fail because we are explicitly setting a non-boolean value
koalas = "true"
with self.assertRaisesRegex(
ValueError, 'For argument "koalas" expected type bool, received type str.'
TypeError, 'For argument "koalas" expected type bool, received type str.'
):
validate_bool_kwarg(koalas, "koalas")


@ -689,7 +689,7 @@ def validate_axis(axis=0, none_axis=0):
def validate_bool_kwarg(value, arg_name):
""" Ensures that argument passed in arg_name is of type bool. """
if not (isinstance(value, bool) or value is None):
raise ValueError(
raise TypeError(
'For argument "{}" expected type bool, received '
"type {}.".format(arg_name, type(value).__name__)
)


@ -1134,12 +1134,12 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if isinstance(col, str):
col = Column(col)
elif not isinstance(col, Column):
raise ValueError("col must be a string or a column, but got %r" % type(col))
raise TypeError("col must be a string or a column, but got %r" % type(col))
if not isinstance(fractions, dict):
raise ValueError("fractions must be a dict but got %r" % type(fractions))
raise TypeError("fractions must be a dict but got %r" % type(fractions))
for k, v in fractions.items():
if not isinstance(k, (float, int, str)):
raise ValueError("key must be float, int, or string, but got %r" % type(k))
raise TypeError("key must be float, int, or string, but got %r" % type(k))
fractions[k] = float(v)
col = col._jc
seed = seed if seed is not None else random.randint(0, sys.maxsize)
@ -1225,7 +1225,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
+----+
"""
if not isinstance(colName, str):
raise ValueError("colName should be provided as string")
raise TypeError("colName should be provided as string")
jc = self._jdf.colRegex(colName)
return Column(jc)
@ -2009,7 +2009,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
elif isinstance(subset, str):
subset = [subset]
elif not isinstance(subset, (list, tuple)):
raise ValueError("subset should be a list or tuple of column names")
raise TypeError("subset should be a list or tuple of column names")
if thresh is None:
thresh = len(subset) if how == 'any' else 1
@ -2067,7 +2067,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
+---+------+-------+
"""
if not isinstance(value, (float, int, str, bool, dict)):
raise ValueError("value should be a float, int, string, bool or dict")
raise TypeError("value should be a float, int, string, bool or dict")
# Note that bool validates isinstance(int), but we don't want to
# convert bools to floats
@ -2083,7 +2083,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if isinstance(subset, str):
subset = [subset]
elif not isinstance(subset, (list, tuple)):
raise ValueError("subset should be a list or tuple of column names")
raise TypeError("subset should be a list or tuple of column names")
return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx)
@ -2186,15 +2186,15 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
# Validate input types
valid_types = (bool, float, int, str, list, tuple)
if not isinstance(to_replace, valid_types + (dict, )):
raise ValueError(
raise TypeError(
"to_replace should be a bool, float, int, string, list, tuple, or dict. "
"Got {0}".format(type(to_replace)))
if not isinstance(value, valid_types) and value is not None \
and not isinstance(to_replace, dict):
raise ValueError("If to_replace is not a dict, value should be "
"a bool, float, int, string, list, tuple or None. "
"Got {0}".format(type(value)))
raise TypeError("If to_replace is not a dict, value should be "
"a bool, float, int, string, list, tuple or None. "
"Got {0}".format(type(value)))
if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):
if len(to_replace) != len(value):
@ -2202,8 +2202,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
"Got {0} and {1}".format(len(to_replace), len(value)))
if not (subset is None or isinstance(subset, (list, tuple, str))):
raise ValueError("subset should be a list or tuple of column names, "
"column name or None. Got {0}".format(type(subset)))
raise TypeError("subset should be a list or tuple of column names, "
"column name or None. Got {0}".format(type(subset)))
# Reshape input arguments if necessary
if isinstance(to_replace, (float, int, str)):
@ -2285,7 +2285,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
"""
if not isinstance(col, (str, list, tuple)):
raise ValueError("col should be a string, list or tuple, but got %r" % type(col))
raise TypeError("col should be a string, list or tuple, but got %r" % type(col))
isStr = isinstance(col, str)
@ -2296,11 +2296,11 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
for c in col:
if not isinstance(c, str):
raise ValueError("columns should be strings, but got %r" % type(c))
raise TypeError("columns should be strings, but got %r" % type(c))
col = _to_list(self._sc, col)
if not isinstance(probabilities, (list, tuple)):
raise ValueError("probabilities should be a list or tuple")
raise TypeError("probabilities should be a list or tuple")
if isinstance(probabilities, tuple):
probabilities = list(probabilities)
for p in probabilities:
@ -2308,8 +2308,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
raise ValueError("probabilities should be numerical (float, int) in [0,1].")
probabilities = _to_list(self._sc, probabilities)
if not isinstance(relativeError, (float, int)) or relativeError < 0:
raise ValueError("relativeError should be numerical (float, int) >= 0.")
if not isinstance(relativeError, (float, int)):
raise TypeError("relativeError should be numerical (float, int)")
if relativeError < 0:
raise ValueError("relativeError should be >= 0.")
relativeError = float(relativeError)
jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
@ -2334,9 +2336,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
The correlation method. Currently only supports "pearson"
"""
if not isinstance(col1, str):
raise ValueError("col1 should be a string.")
raise TypeError("col1 should be a string.")
if not isinstance(col2, str):
raise ValueError("col2 should be a string.")
raise TypeError("col2 should be a string.")
if not method:
method = "pearson"
if not method == "pearson":
@ -2359,9 +2361,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
The name of the second column
"""
if not isinstance(col1, str):
raise ValueError("col1 should be a string.")
raise TypeError("col1 should be a string.")
if not isinstance(col2, str):
raise ValueError("col2 should be a string.")
raise TypeError("col2 should be a string.")
return self._jdf.stat().cov(col1, col2)
def crosstab(self, col1, col2):
@ -2386,9 +2388,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
of the :class:`DataFrame`.
"""
if not isinstance(col1, str):
raise ValueError("col1 should be a string.")
raise TypeError("col1 should be a string.")
if not isinstance(col2, str):
raise ValueError("col2 should be a string.")
raise TypeError("col2 should be a string.")
return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx)
def freqItems(self, cols, support=None):
@ -2418,7 +2420,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
if isinstance(cols, tuple):
cols = list(cols)
if not isinstance(cols, list):
raise ValueError("cols must be a list or tuple of column names as strings.")
raise TypeError("cols must be a list or tuple of column names as strings.")
if not support:
support = 0.01
return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
@ -2453,7 +2455,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
[Row(age=2, name='Alice', age2=4), Row(age=5, name='Bob', age2=7)]
"""
assert isinstance(col, Column), "col should be Column"
if not isinstance(col, Column):
raise TypeError("col should be Column")
return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)
def withColumnRenamed(self, existing, new):
@ -2597,8 +2600,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
True
"""
if not isinstance(other, DataFrame):
raise ValueError("other parameter should be of DataFrame; however, got %s"
% type(other))
raise TypeError("other parameter should be of DataFrame; however, got %s"
% type(other))
return self._jdf.sameSemantics(other._jdf)
def semanticHash(self):


@ -319,7 +319,7 @@ class DataFrameTests(ReusedSQLTestCase):
self.assertTupleEqual(row, (u'Alice', 20, None))
# should fail if subset is not list, tuple or None
with self.assertRaises(ValueError):
with self.assertRaises(TypeError):
self.spark.createDataFrame(
[(u'Alice', 10, 80.1)], schema).replace({10: 11}, subset=1).first()
@ -329,7 +329,7 @@ class DataFrameTests(ReusedSQLTestCase):
[(u'Alice', 10, 80.1)], schema).replace(["Alice", "Bob"], ["Eve"]).first()
# should fail if when received unexpected type
with self.assertRaises(ValueError):
with self.assertRaises(TypeError):
from datetime import datetime
self.spark.createDataFrame(
[(u'Alice', 10, 80.1)], schema).replace(datetime.now(), datetime.now()).first()
@ -818,7 +818,7 @@ class DataFrameTests(ReusedSQLTestCase):
def test_same_semantics_error(self):
with QuietTest(self.sc):
with self.assertRaisesRegex(ValueError, "should be of DataFrame.*int"):
with self.assertRaisesRegex(TypeError, "should be of DataFrame.*int"):
self.spark.range(10).sameSemantics(1)
def test_input_files(self):


@ -279,9 +279,9 @@ class FunctionsTests(ReusedSQLTestCase):
self.assertTrue(isinstance(aqt[1], list))
self.assertEqual(len(aqt[1]), 3)
self.assertTrue(all(isinstance(q, float) for q in aqt[1]))
self.assertRaises(ValueError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1))
self.assertRaises(ValueError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1))
self.assertRaises(ValueError, lambda: df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1))
self.assertRaises(TypeError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1))
self.assertRaises(TypeError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1))
self.assertRaises(TypeError, lambda: df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1))
def test_sorting_functions_with_column(self):
from pyspark.sql import functions


@ -236,7 +236,7 @@ class BarrierTaskContext(TaskContext):
This API is experimental
"""
if not isinstance(message, str):
raise ValueError("Argument `message` must be of type `str`")
raise TypeError("Argument `message` must be of type `str`")
elif self._port is None or self._secret is None:
raise Exception("Not supported to call barrier() before initialize " +
"BarrierTaskContext.")