[SPARK-10269][PYSPARK][MLLIB] Add @since annotation to pyspark.mllib.classification
Duplicated the `since` decorator from pyspark.sql into pyspark (also tweaked to handle functions without docstrings). Added `since` to methods and ".. versionadded::" to classes, with version numbers derived from the file history. Note: some methods are inherited from the regression module (e.g. LinearModel.intercept), so these won't have version numbers in the API docs until that module is updated. Author: noelsmith <mail@noelsmith.com> Closes #8626 from noel-smith/SPARK-10269-since-mlib-classification.
This commit is contained in:
parent
9f49895fef
commit
04521ea067
|
@ -20,7 +20,7 @@ from math import exp
|
|||
import numpy
|
||||
from numpy import array
|
||||
|
||||
from pyspark import RDD
|
||||
from pyspark import RDD, since
|
||||
from pyspark.streaming import DStream
|
||||
from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py
|
||||
from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector
|
||||
|
@ -44,6 +44,7 @@ class LinearClassificationModel(LinearModel):
|
|||
super(LinearClassificationModel, self).__init__(weights, intercept)
|
||||
self._threshold = None
|
||||
|
||||
@since('1.4.0')
|
||||
def setThreshold(self, value):
|
||||
"""
|
||||
.. note:: Experimental
|
||||
|
@ -57,6 +58,7 @@ class LinearClassificationModel(LinearModel):
|
|||
self._threshold = value
|
||||
|
||||
@property
|
||||
@since('1.4.0')
|
||||
def threshold(self):
|
||||
"""
|
||||
.. note:: Experimental
|
||||
|
@ -67,6 +69,7 @@ class LinearClassificationModel(LinearModel):
|
|||
"""
|
||||
return self._threshold
|
||||
|
||||
@since('1.4.0')
|
||||
def clearThreshold(self):
|
||||
"""
|
||||
.. note:: Experimental
|
||||
|
@ -76,6 +79,7 @@ class LinearClassificationModel(LinearModel):
|
|||
"""
|
||||
self._threshold = None
|
||||
|
||||
@since('1.4.0')
|
||||
def predict(self, test):
|
||||
"""
|
||||
Predict values for a single data point or an RDD of points
|
||||
|
@ -157,6 +161,8 @@ class LogisticRegressionModel(LinearClassificationModel):
|
|||
1
|
||||
>>> mcm.predict([0.0, 0.0, 0.3])
|
||||
2
|
||||
|
||||
.. versionadded:: 0.9.0
|
||||
"""
|
||||
def __init__(self, weights, intercept, numFeatures, numClasses):
|
||||
super(LogisticRegressionModel, self).__init__(weights, intercept)
|
||||
|
@ -172,13 +178,23 @@ class LogisticRegressionModel(LinearClassificationModel):
|
|||
self._dataWithBiasSize)
|
||||
|
||||
@property
|
||||
@since('1.4.0')
|
||||
def numFeatures(self):
|
||||
"""
|
||||
Dimension of the features.
|
||||
"""
|
||||
return self._numFeatures
|
||||
|
||||
@property
|
||||
@since('1.4.0')
|
||||
def numClasses(self):
|
||||
"""
|
||||
Number of possible outcomes for k classes classification problem in Multinomial
|
||||
Logistic Regression.
|
||||
"""
|
||||
return self._numClasses
|
||||
|
||||
@since('0.9.0')
|
||||
def predict(self, x):
|
||||
"""
|
||||
Predict values for a single data point or an RDD of points
|
||||
|
@ -217,13 +233,21 @@ class LogisticRegressionModel(LinearClassificationModel):
|
|||
best_class = i + 1
|
||||
return best_class
|
||||
|
||||
@since('1.4.0')
|
||||
def save(self, sc, path):
|
||||
"""
|
||||
Save this model to the given path.
|
||||
"""
|
||||
java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel(
|
||||
_py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses)
|
||||
java_model.save(sc._jsc.sc(), path)
|
||||
|
||||
@classmethod
|
||||
@since('1.4.0')
|
||||
def load(cls, sc, path):
|
||||
"""
|
||||
Load a model from the given path.
|
||||
"""
|
||||
java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load(
|
||||
sc._jsc.sc(), path)
|
||||
weights = _java2py(sc, java_model.weights())
|
||||
|
@ -237,8 +261,11 @@ class LogisticRegressionModel(LinearClassificationModel):
|
|||
|
||||
|
||||
class LogisticRegressionWithSGD(object):
|
||||
|
||||
"""
|
||||
.. versionadded:: 0.9.0
|
||||
"""
|
||||
@classmethod
|
||||
@since('0.9.0')
|
||||
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
|
||||
initialWeights=None, regParam=0.01, regType="l2", intercept=False,
|
||||
validateData=True, convergenceTol=0.001):
|
||||
|
@ -286,8 +313,11 @@ class LogisticRegressionWithSGD(object):
|
|||
|
||||
|
||||
class LogisticRegressionWithLBFGS(object):
|
||||
|
||||
"""
|
||||
.. versionadded:: 1.2.0
|
||||
"""
|
||||
@classmethod
|
||||
@since('1.2.0')
|
||||
def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType="l2",
|
||||
intercept=False, corrections=10, tolerance=1e-4, validateData=True, numClasses=2):
|
||||
"""
|
||||
|
@ -399,11 +429,14 @@ class SVMModel(LinearClassificationModel):
|
|||
... rmtree(path)
|
||||
... except:
|
||||
... pass
|
||||
|
||||
.. versionadded:: 0.9.0
|
||||
"""
|
||||
def __init__(self, weights, intercept):
|
||||
super(SVMModel, self).__init__(weights, intercept)
|
||||
self._threshold = 0.0
|
||||
|
||||
@since('0.9.0')
|
||||
def predict(self, x):
|
||||
"""
|
||||
Predict values for a single data point or an RDD of points
|
||||
|
@ -419,13 +452,21 @@ class SVMModel(LinearClassificationModel):
|
|||
else:
|
||||
return 1 if margin > self._threshold else 0
|
||||
|
||||
@since('1.4.0')
|
||||
def save(self, sc, path):
|
||||
"""
|
||||
Save this model to the given path.
|
||||
"""
|
||||
java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel(
|
||||
_py2java(sc, self._coeff), self.intercept)
|
||||
java_model.save(sc._jsc.sc(), path)
|
||||
|
||||
@classmethod
|
||||
@since('1.4.0')
|
||||
def load(cls, sc, path):
|
||||
"""
|
||||
Load a model from the given path.
|
||||
"""
|
||||
java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(
|
||||
sc._jsc.sc(), path)
|
||||
weights = _java2py(sc, java_model.weights())
|
||||
|
@ -437,8 +478,12 @@ class SVMModel(LinearClassificationModel):
|
|||
|
||||
|
||||
class SVMWithSGD(object):
|
||||
"""
|
||||
.. versionadded:: 0.9.0
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
@since('0.9.0')
|
||||
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
|
||||
miniBatchFraction=1.0, initialWeights=None, regType="l2",
|
||||
intercept=False, validateData=True, convergenceTol=0.001):
|
||||
|
@ -530,13 +575,15 @@ class NaiveBayesModel(Saveable, Loader):
|
|||
... rmtree(path)
|
||||
... except OSError:
|
||||
... pass
|
||||
"""
|
||||
|
||||
.. versionadded:: 0.9.0
|
||||
"""
|
||||
def __init__(self, labels, pi, theta):
|
||||
self.labels = labels
|
||||
self.pi = pi
|
||||
self.theta = theta
|
||||
|
||||
@since('0.9.0')
|
||||
def predict(self, x):
|
||||
"""
|
||||
Return the most likely class for a data vector
|
||||
|
@ -548,6 +595,9 @@ class NaiveBayesModel(Saveable, Loader):
|
|||
return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))]
|
||||
|
||||
def save(self, sc, path):
|
||||
"""
|
||||
Save this model to the given path.
|
||||
"""
|
||||
java_labels = _py2java(sc, self.labels.tolist())
|
||||
java_pi = _py2java(sc, self.pi.tolist())
|
||||
java_theta = _py2java(sc, self.theta.tolist())
|
||||
|
@ -556,7 +606,11 @@ class NaiveBayesModel(Saveable, Loader):
|
|||
java_model.save(sc._jsc.sc(), path)
|
||||
|
||||
@classmethod
|
||||
@since('1.4.0')
|
||||
def load(cls, sc, path):
|
||||
"""
|
||||
Load a model from the given path.
|
||||
"""
|
||||
java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
|
||||
sc._jsc.sc(), path)
|
||||
# Can not unpickle array.array from Pyrolite in Python3 with "bytes"
|
||||
|
@ -567,8 +621,12 @@ class NaiveBayesModel(Saveable, Loader):
|
|||
|
||||
|
||||
class NaiveBayes(object):
|
||||
"""
|
||||
.. versionadded:: 0.9.0
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
@since('0.9.0')
|
||||
def train(cls, data, lambda_=1.0):
|
||||
"""
|
||||
Train a Naive Bayes model given an RDD of (label, features)
|
||||
|
@ -605,6 +663,8 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
|
|||
iteration.
|
||||
:param regParam: L2 Regularization parameter.
|
||||
:param convergenceTol: A condition which decides iteration termination.
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
"""
|
||||
def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01,
|
||||
convergenceTol=0.001):
|
||||
|
@ -617,6 +677,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
|
|||
super(StreamingLogisticRegressionWithSGD, self).__init__(
|
||||
model=self._model)
|
||||
|
||||
@since('1.5.0')
|
||||
def setInitialWeights(self, initialWeights):
|
||||
"""
|
||||
Set the initial value of weights.
|
||||
|
@ -630,6 +691,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
|
|||
initialWeights, 0, initialWeights.size, 2)
|
||||
return self
|
||||
|
||||
@since('1.5.0')
|
||||
def trainOn(self, dstream):
|
||||
"""Train the model on the incoming dstream."""
|
||||
self._validate(dstream)
|
||||
|
|
Loading…
Reference in a new issue