[SPARK-10269][PYSPARK][MLLIB] Add @since annotation to pyspark.mllib.classification

Duplicated the `since` decorator from pyspark.sql into pyspark (also tweaked to handle functions without docstrings). Added `@since` to methods and "versionadded::" to classes, with version numbers derived from the file history. Note: some methods are inherited from the regression module (e.g. LinearModel.intercept), so these won't have version numbers in the API docs until that module is updated. Author: noelsmith <mail@noelsmith.com> Closes #8626 from noel-smith/SPARK-10269-since-mlib-classification.
2015-10-20 16:14:20 -07:00 · 2015-10-20 16:14:20 -07:00 · 04521ea067
parent 9f49895fef
commit 04521ea067
1 changed files with 66 additions and 4 deletions
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@ -20,7 +20,7 @@ from math import exp
 import numpy
 from numpy import array

-from pyspark import RDD
+from pyspark import RDD, since
 from pyspark.streaming import DStream
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py
 from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector
@ -44,6 +44,7 @@ class LinearClassificationModel(LinearModel):
        super(LinearClassificationModel, self).__init__(weights, intercept)
        self._threshold = None

+    @since('1.4.0')
    def setThreshold(self, value):
        """
        .. note:: Experimental
@ -57,6 +58,7 @@ class LinearClassificationModel(LinearModel):
        self._threshold = value

    @property
+    @since('1.4.0')
    def threshold(self):
        """
        .. note:: Experimental
@ -67,6 +69,7 @@ class LinearClassificationModel(LinearModel):
        """
        return self._threshold

+    @since('1.4.0')
    def clearThreshold(self):
        """
        .. note:: Experimental
@ -76,6 +79,7 @@ class LinearClassificationModel(LinearModel):
        """
        self._threshold = None

+    @since('1.4.0')
    def predict(self, test):
        """
        Predict values for a single data point or an RDD of points
@ -157,6 +161,8 @@ class LogisticRegressionModel(LinearClassificationModel):
    1
    >>> mcm.predict([0.0, 0.0, 0.3])
    2
+
+    .. versionadded:: 0.9.0
    """
    def __init__(self, weights, intercept, numFeatures, numClasses):
        super(LogisticRegressionModel, self).__init__(weights, intercept)
@ -172,13 +178,23 @@ class LogisticRegressionModel(LinearClassificationModel):
                                                                self._dataWithBiasSize)

    @property
+    @since('1.4.0')
    def numFeatures(self):
+        """
+        Dimension of the features.
+        """
        return self._numFeatures

    @property
+    @since('1.4.0')
    def numClasses(self):
+        """
+        Number of possible outcomes for k classes classification problem in Multinomial
+        Logistic Regression.
+        """
        return self._numClasses

+    @since('0.9.0')
    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points
@ -217,13 +233,21 @@ class LogisticRegressionModel(LinearClassificationModel):
                        best_class = i + 1
            return best_class

+    @since('1.4.0')
    def save(self, sc, path):
+        """
+        Save this model to the given path.
+        """
        java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel(
            _py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses)
        java_model.save(sc._jsc.sc(), path)

    @classmethod
+    @since('1.4.0')
    def load(cls, sc, path):
+        """
+        Load a model from the given path.
+        """
        java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load(
            sc._jsc.sc(), path)
        weights = _java2py(sc, java_model.weights())
@ -237,8 +261,11 @@ class LogisticRegressionModel(LinearClassificationModel):


 class LogisticRegressionWithSGD(object):
-
+    """
+    .. versionadded:: 0.9.0
+    """
    @classmethod
+    @since('0.9.0')
    def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
              initialWeights=None, regParam=0.01, regType="l2", intercept=False,
              validateData=True, convergenceTol=0.001):
@ -286,8 +313,11 @@ class LogisticRegressionWithSGD(object):


 class LogisticRegressionWithLBFGS(object):
-
+    """
+    .. versionadded:: 1.2.0
+    """
    @classmethod
+    @since('1.2.0')
    def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType="l2",
              intercept=False, corrections=10, tolerance=1e-4, validateData=True, numClasses=2):
        """
@ -399,11 +429,14 @@ class SVMModel(LinearClassificationModel):
    ...    rmtree(path)
    ... except:
    ...    pass
+
+    .. versionadded:: 0.9.0
    """
    def __init__(self, weights, intercept):
        super(SVMModel, self).__init__(weights, intercept)
        self._threshold = 0.0

+    @since('0.9.0')
    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points
@ -419,13 +452,21 @@ class SVMModel(LinearClassificationModel):
        else:
            return 1 if margin > self._threshold else 0

+    @since('1.4.0')
    def save(self, sc, path):
+        """
+        Save this model to the given path.
+        """
        java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel(
            _py2java(sc, self._coeff), self.intercept)
        java_model.save(sc._jsc.sc(), path)

    @classmethod
+    @since('1.4.0')
    def load(cls, sc, path):
+        """
+        Load a model from the given path.
+        """
        java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(
            sc._jsc.sc(), path)
        weights = _java2py(sc, java_model.weights())
@ -437,8 +478,12 @@ class SVMModel(LinearClassificationModel):


 class SVMWithSGD(object):
+    """
+    .. versionadded:: 0.9.0
+    """

    @classmethod
+    @since('0.9.0')
    def train(cls, data, iterations=100, step=1.0, regParam=0.01,
              miniBatchFraction=1.0, initialWeights=None, regType="l2",
              intercept=False, validateData=True, convergenceTol=0.001):
@ -530,13 +575,15 @@ class NaiveBayesModel(Saveable, Loader):
    ...     rmtree(path)
    ... except OSError:
    ...     pass
-    """

+    .. versionadded:: 0.9.0
+    """
    def __init__(self, labels, pi, theta):
        self.labels = labels
        self.pi = pi
        self.theta = theta

+    @since('0.9.0')
    def predict(self, x):
        """
        Return the most likely class for a data vector
@ -548,6 +595,9 @@ class NaiveBayesModel(Saveable, Loader):
        return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))]

    def save(self, sc, path):
+        """
+        Save this model to the given path.
+        """
        java_labels = _py2java(sc, self.labels.tolist())
        java_pi = _py2java(sc, self.pi.tolist())
        java_theta = _py2java(sc, self.theta.tolist())
@ -556,7 +606,11 @@ class NaiveBayesModel(Saveable, Loader):
        java_model.save(sc._jsc.sc(), path)

    @classmethod
+    @since('1.4.0')
    def load(cls, sc, path):
+        """
+        Load a model from the given path.
+        """
        java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
            sc._jsc.sc(), path)
        # Can not unpickle array.array from Pyrolite in Python3 with "bytes"
@ -567,8 +621,12 @@ class NaiveBayesModel(Saveable, Loader):


 class NaiveBayes(object):
+    """
+    .. versionadded:: 0.9.0
+    """

    @classmethod
+    @since('0.9.0')
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model given an RDD of (label, features)
@ -605,6 +663,8 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
                              iteration.
    :param regParam: L2 Regularization parameter.
    :param convergenceTol: A condition which decides iteration termination.
+
+    .. versionadded:: 1.5.0
    """
    def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01,
                 convergenceTol=0.001):
@ -617,6 +677,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
        super(StreamingLogisticRegressionWithSGD, self).__init__(
            model=self._model)

+    @since('1.5.0')
    def setInitialWeights(self, initialWeights):
        """
        Set the initial value of weights.
@ -630,6 +691,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
            initialWeights, 0, initialWeights.size, 2)
        return self

+    @since('1.5.0')
    def trainOn(self, dstream):
        """Train the model on the incoming dstream."""
        self._validate(dstream)