From 04521ea067d6ed3c5398067f07904d27c77017ff Mon Sep 17 00:00:00 2001 From: noelsmith Date: Tue, 20 Oct 2015 16:14:20 -0700 Subject: [PATCH] [SPARK-10269][PYSPARK][MLLIB] Add @since annotation to pyspark.mllib.classification Duplicated the since decorator from pyspark.sql into pyspark (also tweaked to handle functions without docstrings). Added @since to methods and "versionadded::" to classes, with version numbers derived from the file history. Note: some methods are inherited from the regression module (e.g. LinearModel.intercept) so these won't have version numbers in the API docs until that model is updated. Author: noelsmith Closes #8626 from noel-smith/SPARK-10269-since-mlib-classification. --- python/pyspark/mllib/classification.py | 70 ++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index b77754500b..aab4015ba8 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -20,7 +20,7 @@ from math import exp import numpy from numpy import array -from pyspark import RDD +from pyspark import RDD, since from pyspark.streaming import DStream from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector @@ -44,6 +44,7 @@ class LinearClassificationModel(LinearModel): super(LinearClassificationModel, self).__init__(weights, intercept) self._threshold = None + @since('1.4.0') def setThreshold(self, value): """ .. note:: Experimental @@ -57,6 +58,7 @@ class LinearClassificationModel(LinearModel): self._threshold = value @property + @since('1.4.0') def threshold(self): """ .. note:: Experimental @@ -67,6 +69,7 @@ class LinearClassificationModel(LinearModel): """ return self._threshold + @since('1.4.0') def clearThreshold(self): """ ..
note:: Experimental @@ -76,6 +79,7 @@ class LinearClassificationModel(LinearModel): """ self._threshold = None + @since('1.4.0') def predict(self, test): """ Predict values for a single data point or an RDD of points @@ -157,6 +161,8 @@ class LogisticRegressionModel(LinearClassificationModel): 1 >>> mcm.predict([0.0, 0.0, 0.3]) 2 + + .. versionadded:: 0.9.0 """ def __init__(self, weights, intercept, numFeatures, numClasses): super(LogisticRegressionModel, self).__init__(weights, intercept) @@ -172,13 +178,23 @@ class LogisticRegressionModel(LinearClassificationModel): self._dataWithBiasSize) @property + @since('1.4.0') def numFeatures(self): + """ + Dimension of the features. + """ return self._numFeatures @property + @since('1.4.0') def numClasses(self): + """ + Number of possible outcomes for k classes classification problem in Multinomial + Logistic Regression. + """ return self._numClasses + @since('0.9.0') def predict(self, x): """ Predict values for a single data point or an RDD of points @@ -217,13 +233,21 @@ class LogisticRegressionModel(LinearClassificationModel): best_class = i + 1 return best_class + @since('1.4.0') def save(self, sc, path): + """ + Save this model to the given path. + """ java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel( _py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses) java_model.save(sc._jsc.sc(), path) @classmethod + @since('1.4.0') def load(cls, sc, path): + """ + Load a model from the given path. + """ java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load( sc._jsc.sc(), path) weights = _java2py(sc, java_model.weights()) @@ -237,8 +261,11 @@ class LogisticRegressionModel(LinearClassificationModel): class LogisticRegressionWithSGD(object): - + """ + .. 
versionadded:: 0.9.0 + """ @classmethod + @since('0.9.0') def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.01, regType="l2", intercept=False, validateData=True, convergenceTol=0.001): @@ -286,8 +313,11 @@ class LogisticRegressionWithSGD(object): class LogisticRegressionWithLBFGS(object): - + """ + .. versionadded:: 1.2.0 + """ @classmethod + @since('1.2.0') def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType="l2", intercept=False, corrections=10, tolerance=1e-4, validateData=True, numClasses=2): """ @@ -399,11 +429,14 @@ class SVMModel(LinearClassificationModel): ... rmtree(path) ... except: ... pass + + .. versionadded:: 0.9.0 """ def __init__(self, weights, intercept): super(SVMModel, self).__init__(weights, intercept) self._threshold = 0.0 + @since('0.9.0') def predict(self, x): """ Predict values for a single data point or an RDD of points @@ -419,13 +452,21 @@ class SVMModel(LinearClassificationModel): else: return 1 if margin > self._threshold else 0 + @since('1.4.0') def save(self, sc, path): + """ + Save this model to the given path. + """ java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel( _py2java(sc, self._coeff), self.intercept) java_model.save(sc._jsc.sc(), path) @classmethod + @since('1.4.0') def load(cls, sc, path): + """ + Load a model from the given path. + """ java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load( sc._jsc.sc(), path) weights = _java2py(sc, java_model.weights()) @@ -437,8 +478,12 @@ class SVMModel(LinearClassificationModel): class SVMWithSGD(object): + """ + .. versionadded:: 0.9.0 + """ @classmethod + @since('0.9.0') def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType="l2", intercept=False, validateData=True, convergenceTol=0.001): @@ -530,13 +575,15 @@ class NaiveBayesModel(Saveable, Loader): ... rmtree(path) ... except OSError: ... pass - """ + .. 
versionadded:: 0.9.0 + """ def __init__(self, labels, pi, theta): self.labels = labels self.pi = pi self.theta = theta + @since('0.9.0') def predict(self, x): """ Return the most likely class for a data vector @@ -548,6 +595,9 @@ class NaiveBayesModel(Saveable, Loader): return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))] def save(self, sc, path): + """ + Save this model to the given path. + """ java_labels = _py2java(sc, self.labels.tolist()) java_pi = _py2java(sc, self.pi.tolist()) java_theta = _py2java(sc, self.theta.tolist()) @@ -556,7 +606,11 @@ class NaiveBayesModel(Saveable, Loader): java_model.save(sc._jsc.sc(), path) @classmethod + @since('1.4.0') def load(cls, sc, path): + """ + Load a model from the given path. + """ java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load( sc._jsc.sc(), path) # Can not unpickle array.array from Pyrolite in Python3 with "bytes" @@ -567,8 +621,12 @@ class NaiveBayesModel(Saveable, Loader): class NaiveBayes(object): + """ + .. versionadded:: 0.9.0 + """ @classmethod + @since('0.9.0') def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) @@ -605,6 +663,8 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): iteration. :param regParam: L2 Regularization parameter. :param convergenceTol: A condition which decides iteration termination. + + .. versionadded:: 1.5.0 """ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01, convergenceTol=0.001): @@ -617,6 +677,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): super(StreamingLogisticRegressionWithSGD, self).__init__( model=self._model) + @since('1.5.0') def setInitialWeights(self, initialWeights): """ Set the initial value of weights. 
@@ -630,6 +691,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): initialWeights, 0, initialWeights.size, 2) return self + @since('1.5.0') def trainOn(self, dstream): """Train the model on the incoming dstream.""" self._validate(dstream)