spark-instrumented-optimizer/python/pyspark/ml/evaluation.py
Yikun Jiang 44b7931936 [SPARK-35176][PYTHON] Standardize input validation error type
### What changes were proposed in this pull request?
This PR corrects the exception types raised when function input parameters fail validation because of an inappropriate type.
To make review easier, the work is split into 3 commits:
- Standardize input validation error type on sql
- Standardize input validation error type on ml
- Standardize input validation error type on pandas

### Why are the changes needed?
The Python exception documentation [1] describes `TypeError` as "Raised when an operation or function is applied to an object of inappropriate type." Nevertheless, a lot of PySpark code raises `ValueError` in exactly these situations; this patch fixes those cases (see the sketch at the end of this section).

[1] https://docs.python.org/3/library/exceptions.html#TypeError

Note: this patch only fixes the existing wrong exception types for input validation. The input validation decorator/framework mentioned in [SPARK-35176](https://issues.apache.org/jira/browse/SPARK-35176) will be submitted in a separate patch.
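
As a hedged illustration of the convention being standardized (a minimal sketch, not a verbatim hunk from this PR's diff; the function name and messages are made up):

```python
def set_metric_name(metric_name):
    if not isinstance(metric_name, str):
        # Inappropriate *type*: raise TypeError (previously often ValueError).
        raise TypeError("metricName must be a str, but got %s" % type(metric_name))
    if metric_name not in ("areaUnderROC", "areaUnderPR"):
        # Correct type but unacceptable *value*: ValueError remains appropriate.
        raise ValueError("Unsupported metric name: %s" % metric_name)
    return metric_name
```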

### Does this PR introduce _any_ user-facing change?
Yes. For inputs of an inappropriate type, the code now raises the correct TypeError instead of ValueError.

### How was this patch tested?
Existing test cases and unit tests.

Closes #32368 from Yikun/SPARK-35176.

Authored-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2021-05-03 15:34:24 +09:00


#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
from abc import abstractmethod, ABCMeta
from pyspark import since, keyword_only
from pyspark.ml.wrapper import JavaParams
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasProbabilityCol, \
HasRawPredictionCol, HasFeaturesCol, HasWeightCol
from pyspark.ml.common import inherit_doc
from pyspark.ml.util import JavaMLReadable, JavaMLWritable

__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator',
           'MulticlassClassificationEvaluator', 'MultilabelClassificationEvaluator',
           'ClusteringEvaluator', 'RankingEvaluator']


@inherit_doc
class Evaluator(Params, metaclass=ABCMeta):
"""
Base class for evaluators that compute metrics from predictions.
.. versionadded:: 1.4.0
"""
pass
@abstractmethod
def _evaluate(self, dataset):
"""
Evaluates the output.
Parameters
----------
dataset : :py:class:`pyspark.sql.DataFrame`
a dataset that contains labels/observations and predictions
Returns
-------
float
metric
"""
raise NotImplementedError()
def evaluate(self, dataset, params=None):
"""
Evaluates the output with optional parameters.
.. versionadded:: 1.4.0
Parameters
----------
dataset : :py:class:`pyspark.sql.DataFrame`
a dataset that contains labels/observations and predictions
params : dict, optional
an optional param map that overrides embedded params
Returns
-------
float
metric
"""
if params is None:
params = dict()
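        # A non-empty param map evaluates a copy of this evaluator with the
        # overrides applied, leaving this instance untouched; a non-dict value
        # is an input-validation failure and is rejected with TypeError.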
if isinstance(params, dict):
if params:
return self.copy(params)._evaluate(dataset)
else:
return self._evaluate(dataset)
else:
raise TypeError("Params must be a param map but got %s." % type(params))
@since("1.5.0")
def isLargerBetter(self):
"""
Indicates whether the metric returned by :py:meth:`evaluate` should be maximized
(True, default) or minimized (False).
A given evaluator may support multiple metrics which may be maximized or minimized.
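        For example, areaUnderROC should be maximized while rmse should be
        minimized, so BinaryClassificationEvaluator returns True here while
        RegressionEvaluator with its default rmse metric returns False.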
"""
        return True


@inherit_doc
class JavaEvaluator(JavaParams, Evaluator, metaclass=ABCMeta):
"""
Base class for :py:class:`Evaluator`s that wrap Java/Scala
implementations.
"""
def _evaluate(self, dataset):
"""
Evaluates the output.
Parameters
----------
dataset : :py:class:`pyspark.sql.DataFrame`
a dataset that contains labels/observations and predictions
Returns
-------
float
evaluation metric
"""
self._transfer_params_to_java()
return self._java_obj.evaluate(dataset._jdf)
def isLargerBetter(self):
self._transfer_params_to_java()
        return self._java_obj.isLargerBetter()


@inherit_doc
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
JavaMLReadable, JavaMLWritable):
"""
Evaluator for binary classification, which expects input columns rawPrediction, label
and an optional weight column.
The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
.. versionadded:: 1.4.0
Examples
--------
>>> from pyspark.ml.linalg import Vectors
>>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
... [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
>>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
...
>>> evaluator = BinaryClassificationEvaluator()
>>> evaluator.setRawPredictionCol("raw")
BinaryClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.70...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
0.83...
>>> bce_path = temp_path + "/bce"
>>> evaluator.save(bce_path)
>>> evaluator2 = BinaryClassificationEvaluator.load(bce_path)
>>> str(evaluator2.getRawPredictionCol())
'raw'
>>> scoreAndLabelsAndWeight = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1], x[2]),
... [(0.1, 0.0, 1.0), (0.1, 1.0, 0.9), (0.4, 0.0, 0.7), (0.6, 0.0, 0.9),
... (0.6, 1.0, 1.0), (0.6, 1.0, 0.3), (0.8, 1.0, 1.0)])
>>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
...
>>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw", weightCol="weight")
>>> evaluator.evaluate(dataset)
0.70...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
0.82...
>>> evaluator.getNumBins()
1000
"""
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation (areaUnderROC|areaUnderPR)",
typeConverter=TypeConverters.toString)
numBins = Param(Params._dummy(), "numBins", "Number of bins to down-sample the curves "
"(ROC curve, PR curve) in area computation. If 0, no down-sampling will "
"occur. Must be >= 0.",
typeConverter=TypeConverters.toInt)
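    # Note: with the default numBins=1000 the ROC/PR curves are computed over a
    # down-sampled set of at most roughly numBins points, while numBins=0 keeps
    # every distinct score -- exact, but costly for large datasets.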
@keyword_only
def __init__(self, *, rawPredictionCol="rawPrediction", labelCol="label",
metricName="areaUnderROC", weightCol=None, numBins=1000):
"""
__init__(self, \\*, rawPredictionCol="rawPrediction", labelCol="label", \
metricName="areaUnderROC", weightCol=None, numBins=1000)
"""
super(BinaryClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
self._setDefault(metricName="areaUnderROC", numBins=1000)
kwargs = self._input_kwargs
self._set(**kwargs)
@since("1.4.0")
def setMetricName(self, value):
"""
Sets the value of :py:attr:`metricName`.
"""
return self._set(metricName=value)
@since("1.4.0")
def getMetricName(self):
"""
Gets the value of metricName or its default value.
"""
return self.getOrDefault(self.metricName)
@since("3.0.0")
def setNumBins(self, value):
"""
Sets the value of :py:attr:`numBins`.
"""
return self._set(numBins=value)
@since("3.0.0")
def getNumBins(self):
"""
Gets the value of numBins or its default value.
"""
return self.getOrDefault(self.numBins)
def setLabelCol(self, value):
"""
Sets the value of :py:attr:`labelCol`.
"""
return self._set(labelCol=value)
def setRawPredictionCol(self, value):
"""
Sets the value of :py:attr:`rawPredictionCol`.
"""
return self._set(rawPredictionCol=value)
@since("3.0.0")
def setWeightCol(self, value):
"""
Sets the value of :py:attr:`weightCol`.
"""
return self._set(weightCol=value)
@keyword_only
@since("1.4.0")
def setParams(self, *, rawPredictionCol="rawPrediction", labelCol="label",
metricName="areaUnderROC", weightCol=None, numBins=1000):
"""
setParams(self, \\*, rawPredictionCol="rawPrediction", labelCol="label", \
metricName="areaUnderROC", weightCol=None, numBins=1000)
Sets params for binary classification evaluator.
"""
kwargs = self._input_kwargs
        return self._set(**kwargs)


@inherit_doc
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeightCol,
JavaMLReadable, JavaMLWritable):
"""
Evaluator for Regression, which expects input columns prediction, label
and an optional weight column.
.. versionadded:: 1.4.0
Examples
--------
>>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
... (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
>>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
...
>>> evaluator = RegressionEvaluator()
>>> evaluator.setPredictionCol("raw")
RegressionEvaluator...
>>> evaluator.evaluate(dataset)
2.842...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
0.993...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
2.649...
>>> re_path = temp_path + "/re"
>>> evaluator.save(re_path)
>>> evaluator2 = RegressionEvaluator.load(re_path)
>>> str(evaluator2.getPredictionCol())
'raw'
>>> scoreAndLabelsAndWeight = [(-28.98343821, -27.0, 1.0), (20.21491975, 21.5, 0.8),
... (-25.98418959, -22.0, 1.0), (30.69731842, 33.0, 0.6), (74.69283752, 71.0, 0.2)]
>>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
...
>>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
>>> evaluator.evaluate(dataset)
2.740...
>>> evaluator.getThroughOrigin()
False
"""
metricName = Param(Params._dummy(), "metricName",
"""metric name in evaluation - one of:
rmse - root mean squared error (default)
mse - mean squared error
r2 - r^2 metric
mae - mean absolute error
var - explained variance.""",
typeConverter=TypeConverters.toString)
throughOrigin = Param(Params._dummy(), "throughOrigin",
"whether the regression is through the origin.",
typeConverter=TypeConverters.toBoolean)
@keyword_only
def __init__(self, *, predictionCol="prediction", labelCol="label",
metricName="rmse", weightCol=None, throughOrigin=False):
"""
__init__(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="rmse", weightCol=None, throughOrigin=False)
"""
super(RegressionEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
self._setDefault(metricName="rmse", throughOrigin=False)
kwargs = self._input_kwargs
self._set(**kwargs)
@since("1.4.0")
def setMetricName(self, value):
"""
Sets the value of :py:attr:`metricName`.
"""
return self._set(metricName=value)
@since("1.4.0")
def getMetricName(self):
"""
Gets the value of metricName or its default value.
"""
return self.getOrDefault(self.metricName)
@since("3.0.0")
def setThroughOrigin(self, value):
"""
Sets the value of :py:attr:`throughOrigin`.
"""
return self._set(throughOrigin=value)
@since("3.0.0")
def getThroughOrigin(self):
"""
Gets the value of throughOrigin or its default value.
"""
return self.getOrDefault(self.throughOrigin)
def setLabelCol(self, value):
"""
Sets the value of :py:attr:`labelCol`.
"""
return self._set(labelCol=value)
def setPredictionCol(self, value):
"""
Sets the value of :py:attr:`predictionCol`.
"""
return self._set(predictionCol=value)
@since("3.0.0")
def setWeightCol(self, value):
"""
Sets the value of :py:attr:`weightCol`.
"""
return self._set(weightCol=value)
@keyword_only
@since("1.4.0")
def setParams(self, *, predictionCol="prediction", labelCol="label",
metricName="rmse", weightCol=None, throughOrigin=False):
"""
setParams(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="rmse", weightCol=None, throughOrigin=False)
Sets params for regression evaluator.
"""
kwargs = self._input_kwargs
        return self._set(**kwargs)


@inherit_doc
class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeightCol,
HasProbabilityCol, JavaMLReadable, JavaMLWritable):
"""
Evaluator for Multiclass Classification, which expects input
columns: prediction, label, weight (optional) and probabilityCol (only for logLoss).
.. versionadded:: 1.5.0
Examples
--------
>>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
>>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
>>> evaluator = MulticlassClassificationEvaluator()
>>> evaluator.setPredictionCol("prediction")
MulticlassClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.66...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
0.66...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "truePositiveRateByLabel",
... evaluator.metricLabel: 1.0})
0.75...
>>> evaluator.setMetricName("hammingLoss")
MulticlassClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.33...
>>> mce_path = temp_path + "/mce"
>>> evaluator.save(mce_path)
>>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path)
>>> str(evaluator2.getPredictionCol())
'prediction'
>>> scoreAndLabelsAndWeight = [(0.0, 0.0, 1.0), (0.0, 1.0, 1.0), (0.0, 0.0, 1.0),
... (1.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0), (1.0, 1.0, 1.0),
... (2.0, 2.0, 1.0), (2.0, 0.0, 1.0)]
>>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["prediction", "label", "weight"])
>>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
... weightCol="weight")
>>> evaluator.evaluate(dataset)
0.66...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
0.66...
>>> predictionAndLabelsWithProbabilities = [
... (1.0, 1.0, 1.0, [0.1, 0.8, 0.1]), (0.0, 2.0, 1.0, [0.9, 0.05, 0.05]),
... (0.0, 0.0, 1.0, [0.8, 0.2, 0.0]), (1.0, 1.0, 1.0, [0.3, 0.65, 0.05])]
>>> dataset = spark.createDataFrame(predictionAndLabelsWithProbabilities, ["prediction",
... "label", "weight", "probability"])
>>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
... probabilityCol="probability")
>>> evaluator.setMetricName("logLoss")
MulticlassClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.9682...
"""
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation "
"(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| "
"weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| "
"falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| "
"logLoss|hammingLoss)",
typeConverter=TypeConverters.toString)
metricLabel = Param(Params._dummy(), "metricLabel",
"The class whose metric will be computed in truePositiveRateByLabel|"
"falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel."
" Must be >= 0. The default value is 0.",
typeConverter=TypeConverters.toFloat)
beta = Param(Params._dummy(), "beta",
"The beta value used in weightedFMeasure|fMeasureByLabel."
" Must be > 0. The default value is 1.",
typeConverter=TypeConverters.toFloat)
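    # beta trades off precision P against recall R in the F-measure:
    # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); beta=1 yields F1,
    # the harmonic mean of precision and recall.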
eps = Param(Params._dummy(), "eps",
"log-loss is undefined for p=0 or p=1, so probabilities are clipped to "
"max(eps, min(1 - eps, p)). "
"Must be in range (0, 0.5). The default value is 1e-15.",
typeConverter=TypeConverters.toFloat)
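    # Example of the clipping above with the default eps=1e-15: a predicted
    # probability p=0.0 is clipped to 1e-15 and p=1.0 to 1 - 1e-15, keeping
    # log(p) in the logLoss computation finite.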
@keyword_only
def __init__(self, *, predictionCol="prediction", labelCol="label",
metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0,
probabilityCol="probability", eps=1e-15):
"""
__init__(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0, \
probabilityCol="probability", eps=1e-15)
"""
super(MulticlassClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
self._setDefault(metricName="f1", metricLabel=0.0, beta=1.0, eps=1e-15)
kwargs = self._input_kwargs
self._set(**kwargs)
@since("1.5.0")
def setMetricName(self, value):
"""
Sets the value of :py:attr:`metricName`.
"""
return self._set(metricName=value)
@since("1.5.0")
def getMetricName(self):
"""
Gets the value of metricName or its default value.
"""
return self.getOrDefault(self.metricName)
@since("3.0.0")
def setMetricLabel(self, value):
"""
Sets the value of :py:attr:`metricLabel`.
"""
return self._set(metricLabel=value)
@since("3.0.0")
def getMetricLabel(self):
"""
Gets the value of metricLabel or its default value.
"""
return self.getOrDefault(self.metricLabel)
@since("3.0.0")
def setBeta(self, value):
"""
Sets the value of :py:attr:`beta`.
"""
return self._set(beta=value)
@since("3.0.0")
def getBeta(self):
"""
Gets the value of beta or its default value.
"""
return self.getOrDefault(self.beta)
@since("3.0.0")
def setEps(self, value):
"""
Sets the value of :py:attr:`eps`.
"""
return self._set(eps=value)
@since("3.0.0")
def getEps(self):
"""
Gets the value of eps or its default value.
"""
return self.getOrDefault(self.eps)
def setLabelCol(self, value):
"""
Sets the value of :py:attr:`labelCol`.
"""
return self._set(labelCol=value)
def setPredictionCol(self, value):
"""
Sets the value of :py:attr:`predictionCol`.
"""
return self._set(predictionCol=value)
@since("3.0.0")
def setProbabilityCol(self, value):
"""
Sets the value of :py:attr:`probabilityCol`.
"""
return self._set(probabilityCol=value)
@since("3.0.0")
def setWeightCol(self, value):
"""
Sets the value of :py:attr:`weightCol`.
"""
return self._set(weightCol=value)
@keyword_only
@since("1.5.0")
def setParams(self, *, predictionCol="prediction", labelCol="label",
metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0,
probabilityCol="probability", eps=1e-15):
"""
setParams(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="f1", weightCol=None, metricLabel=0.0, beta=1.0, \
probabilityCol="probability", eps=1e-15)
Sets params for multiclass classification evaluator.
"""
kwargs = self._input_kwargs
        return self._set(**kwargs)


@inherit_doc
class MultilabelClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
JavaMLReadable, JavaMLWritable):
"""
Evaluator for Multilabel Classification, which expects two input
columns: prediction and label.
.. versionadded:: 3.0.0
Notes
-----
Experimental
Examples
--------
>>> scoreAndLabels = [([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]),
... ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]),
... ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]
>>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
...
>>> evaluator = MultilabelClassificationEvaluator()
>>> evaluator.setPredictionCol("prediction")
MultilabelClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.63...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"})
0.54...
>>> mlce_path = temp_path + "/mlce"
>>> evaluator.save(mlce_path)
>>> evaluator2 = MultilabelClassificationEvaluator.load(mlce_path)
>>> str(evaluator2.getPredictionCol())
'prediction'
"""
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation "
"(subsetAccuracy|accuracy|hammingLoss|precision|recall|f1Measure|"
"precisionByLabel|recallByLabel|f1MeasureByLabel|microPrecision|"
"microRecall|microF1Measure)",
typeConverter=TypeConverters.toString)
metricLabel = Param(Params._dummy(), "metricLabel",
"The class whose metric will be computed in precisionByLabel|"
"recallByLabel|f1MeasureByLabel. "
"Must be >= 0. The default value is 0.",
typeConverter=TypeConverters.toFloat)
@keyword_only
def __init__(self, *, predictionCol="prediction", labelCol="label",
metricName="f1Measure", metricLabel=0.0):
"""
__init__(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="f1Measure", metricLabel=0.0)
"""
super(MultilabelClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.MultilabelClassificationEvaluator", self.uid)
self._setDefault(metricName="f1Measure", metricLabel=0.0)
kwargs = self._input_kwargs
self._set(**kwargs)
@since("3.0.0")
def setMetricName(self, value):
"""
Sets the value of :py:attr:`metricName`.
"""
return self._set(metricName=value)
@since("3.0.0")
def getMetricName(self):
"""
Gets the value of metricName or its default value.
"""
return self.getOrDefault(self.metricName)
@since("3.0.0")
def setMetricLabel(self, value):
"""
Sets the value of :py:attr:`metricLabel`.
"""
return self._set(metricLabel=value)
@since("3.0.0")
def getMetricLabel(self):
"""
Gets the value of metricLabel or its default value.
"""
return self.getOrDefault(self.metricLabel)
@since("3.0.0")
def setLabelCol(self, value):
"""
Sets the value of :py:attr:`labelCol`.
"""
return self._set(labelCol=value)
@since("3.0.0")
def setPredictionCol(self, value):
"""
Sets the value of :py:attr:`predictionCol`.
"""
return self._set(predictionCol=value)
@keyword_only
@since("3.0.0")
def setParams(self, *, predictionCol="prediction", labelCol="label",
metricName="f1Measure", metricLabel=0.0):
"""
setParams(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="f1Measure", metricLabel=0.0)
Sets params for multilabel classification evaluator.
"""
kwargs = self._input_kwargs
        return self._set(**kwargs)


@inherit_doc
class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol, HasWeightCol,
JavaMLReadable, JavaMLWritable):
"""
Evaluator for Clustering results, which expects two input
columns: prediction and features. The metric computes the Silhouette
measure using the squared Euclidean distance.
The Silhouette is a measure for the validation of the consistency
within clusters. It ranges between 1 and -1, where a value close to
1 means that the points in a cluster are close to the other points
in the same cluster and far from the points of the other clusters.
.. versionadded:: 2.3.0
Examples
--------
>>> from pyspark.ml.linalg import Vectors
>>> featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
... [([0.0, 0.5], 0.0), ([0.5, 0.0], 0.0), ([10.0, 11.0], 1.0),
... ([10.5, 11.5], 1.0), ([1.0, 1.0], 0.0), ([8.0, 6.0], 1.0)])
>>> dataset = spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
...
>>> evaluator = ClusteringEvaluator()
>>> evaluator.setPredictionCol("prediction")
ClusteringEvaluator...
>>> evaluator.evaluate(dataset)
0.9079...
>>> featureAndPredictionsWithWeight = map(lambda x: (Vectors.dense(x[0]), x[1], x[2]),
... [([0.0, 0.5], 0.0, 2.5), ([0.5, 0.0], 0.0, 2.5), ([10.0, 11.0], 1.0, 2.5),
... ([10.5, 11.5], 1.0, 2.5), ([1.0, 1.0], 0.0, 2.5), ([8.0, 6.0], 1.0, 2.5)])
>>> dataset = spark.createDataFrame(
... featureAndPredictionsWithWeight, ["features", "prediction", "weight"])
>>> evaluator = ClusteringEvaluator()
>>> evaluator.setPredictionCol("prediction")
ClusteringEvaluator...
>>> evaluator.setWeightCol("weight")
ClusteringEvaluator...
>>> evaluator.evaluate(dataset)
0.9079...
>>> ce_path = temp_path + "/ce"
>>> evaluator.save(ce_path)
>>> evaluator2 = ClusteringEvaluator.load(ce_path)
>>> str(evaluator2.getPredictionCol())
'prediction'
"""
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation (silhouette)",
typeConverter=TypeConverters.toString)
distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " +
"Supported options: 'squaredEuclidean' and 'cosine'.",
typeConverter=TypeConverters.toString)
@keyword_only
def __init__(self, *, predictionCol="prediction", featuresCol="features",
metricName="silhouette", distanceMeasure="squaredEuclidean", weightCol=None):
"""
__init__(self, \\*, predictionCol="prediction", featuresCol="features", \
metricName="silhouette", distanceMeasure="squaredEuclidean", weightCol=None)
"""
super(ClusteringEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.ClusteringEvaluator", self.uid)
self._setDefault(metricName="silhouette", distanceMeasure="squaredEuclidean")
kwargs = self._input_kwargs
self._set(**kwargs)
@keyword_only
@since("2.3.0")
def setParams(self, *, predictionCol="prediction", featuresCol="features",
metricName="silhouette", distanceMeasure="squaredEuclidean", weightCol=None):
"""
setParams(self, \\*, predictionCol="prediction", featuresCol="features", \
metricName="silhouette", distanceMeasure="squaredEuclidean", weightCol=None)
Sets params for clustering evaluator.
"""
kwargs = self._input_kwargs
return self._set(**kwargs)
@since("2.3.0")
def setMetricName(self, value):
"""
Sets the value of :py:attr:`metricName`.
"""
return self._set(metricName=value)
@since("2.3.0")
def getMetricName(self):
"""
Gets the value of metricName or its default value.
"""
return self.getOrDefault(self.metricName)
@since("2.4.0")
def setDistanceMeasure(self, value):
"""
Sets the value of :py:attr:`distanceMeasure`.
"""
return self._set(distanceMeasure=value)
@since("2.4.0")
def getDistanceMeasure(self):
"""
Gets the value of `distanceMeasure`
"""
return self.getOrDefault(self.distanceMeasure)
def setFeaturesCol(self, value):
"""
Sets the value of :py:attr:`featuresCol`.
"""
return self._set(featuresCol=value)
def setPredictionCol(self, value):
"""
Sets the value of :py:attr:`predictionCol`.
"""
return self._set(predictionCol=value)
@since("3.1.0")
def setWeightCol(self, value):
"""
Sets the value of :py:attr:`weightCol`.
"""
        return self._set(weightCol=value)


@inherit_doc
class RankingEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol,
JavaMLReadable, JavaMLWritable):
"""
Evaluator for Ranking, which expects two input
columns: prediction and label.
.. versionadded:: 3.0.0
Notes
-----
Experimental
Examples
--------
>>> scoreAndLabels = [([1.0, 6.0, 2.0, 7.0, 8.0, 3.0, 9.0, 10.0, 4.0, 5.0],
... [1.0, 2.0, 3.0, 4.0, 5.0]),
... ([4.0, 1.0, 5.0, 6.0, 2.0, 7.0, 3.0, 8.0, 9.0, 10.0], [1.0, 2.0, 3.0]),
... ([1.0, 2.0, 3.0, 4.0, 5.0], [])]
>>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
...
>>> evaluator = RankingEvaluator()
>>> evaluator.setPredictionCol("prediction")
RankingEvaluator...
>>> evaluator.evaluate(dataset)
0.35...
>>> evaluator.evaluate(dataset, {evaluator.metricName: "precisionAtK", evaluator.k: 2})
0.33...
>>> ranke_path = temp_path + "/ranke"
>>> evaluator.save(ranke_path)
>>> evaluator2 = RankingEvaluator.load(ranke_path)
>>> str(evaluator2.getPredictionCol())
'prediction'
"""
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation "
"(meanAveragePrecision|meanAveragePrecisionAtK|"
"precisionAtK|ndcgAtK|recallAtK)",
typeConverter=TypeConverters.toString)
k = Param(Params._dummy(), "k",
"The ranking position value used in meanAveragePrecisionAtK|precisionAtK|"
"ndcgAtK|recallAtK. Must be > 0. The default value is 10.",
typeConverter=TypeConverters.toInt)
@keyword_only
def __init__(self, *, predictionCol="prediction", labelCol="label",
metricName="meanAveragePrecision", k=10):
"""
__init__(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="meanAveragePrecision", k=10)
"""
super(RankingEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.RankingEvaluator", self.uid)
self._setDefault(metricName="meanAveragePrecision", k=10)
kwargs = self._input_kwargs
self._set(**kwargs)
@since("3.0.0")
def setMetricName(self, value):
"""
Sets the value of :py:attr:`metricName`.
"""
return self._set(metricName=value)
@since("3.0.0")
def getMetricName(self):
"""
Gets the value of metricName or its default value.
"""
return self.getOrDefault(self.metricName)
@since("3.0.0")
def setK(self, value):
"""
Sets the value of :py:attr:`k`.
"""
return self._set(k=value)
@since("3.0.0")
def getK(self):
"""
Gets the value of k or its default value.
"""
return self.getOrDefault(self.k)
@since("3.0.0")
def setLabelCol(self, value):
"""
Sets the value of :py:attr:`labelCol`.
"""
return self._set(labelCol=value)
@since("3.0.0")
def setPredictionCol(self, value):
"""
Sets the value of :py:attr:`predictionCol`.
"""
return self._set(predictionCol=value)
@keyword_only
@since("3.0.0")
def setParams(self, *, predictionCol="prediction", labelCol="label",
metricName="meanAveragePrecision", k=10):
"""
setParams(self, \\*, predictionCol="prediction", labelCol="label", \
metricName="meanAveragePrecision", k=10)
Sets params for ranking evaluator.
"""
kwargs = self._input_kwargs
        return self._set(**kwargs)


if __name__ == "__main__":
import doctest
import tempfile
import pyspark.ml.evaluation
from pyspark.sql import SparkSession
globs = pyspark.ml.evaluation.__dict__.copy()
# The small batch size here ensures that we see multiple batches,
# even in these small test examples:
spark = SparkSession.builder\
.master("local[2]")\
.appName("ml.evaluation tests")\
.getOrCreate()
globs['spark'] = spark
temp_path = tempfile.mkdtemp()
globs['temp_path'] = temp_path
try:
(failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
spark.stop()
finally:
from shutil import rmtree
try:
rmtree(temp_path)
except OSError:
pass
if failure_count:
sys.exit(-1)