[SPARK-7916] [MLLIB] MLlib Python doc parity check for classification and regression
Check the MLlib Python classification and regression docs, then make them as complete as the Scala docs.
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #6460 from yanboliang/spark-7916 and squashes the following commits:
f8deda4 [Yanbo Liang] trigger jenkins
6dc4d99 [Yanbo Liang] address comments
ce2a43e [Yanbo Liang] truncate too long line and remove extra sparse
3eaf6ad [Yanbo Liang] MLlib Python doc parity check for classification and regression
(cherry picked from commit ca998757e8)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
This commit is contained in:
parent
b9e5d3cadd
commit
15d973f2d9
|
@ -73,7 +73,7 @@ object RidgeRegressionModel extends Loader[RidgeRegressionModel] {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Train a regression model with L2-regularization using Stochastic Gradient Descent.
|
* Train a regression model with L2-regularization using Stochastic Gradient Descent.
|
||||||
* This solves the l1-regularized least squares regression formulation
|
* This solves the l2-regularized least squares regression formulation
|
||||||
* f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^
|
* f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^
|
||||||
* Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
|
* Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
|
||||||
* its corresponding right hand side label y.
|
* its corresponding right hand side label y.
|
||||||
|
|
|
@ -33,8 +33,8 @@ __all__ = ['LogisticRegressionModel', 'LogisticRegressionWithSGD', 'LogisticRegr
|
||||||
|
|
||||||
class LinearClassificationModel(LinearModel):
|
class LinearClassificationModel(LinearModel):
|
||||||
"""
|
"""
|
||||||
A private abstract class representing a multiclass classification model.
|
A private abstract class representing a multiclass classification
|
||||||
The categories are represented by int values: 0, 1, 2, etc.
|
model. The categories are represented by int values: 0, 1, 2, etc.
|
||||||
"""
|
"""
|
||||||
def __init__(self, weights, intercept):
|
def __init__(self, weights, intercept):
|
||||||
super(LinearClassificationModel, self).__init__(weights, intercept)
|
super(LinearClassificationModel, self).__init__(weights, intercept)
|
||||||
|
@ -44,10 +44,11 @@ class LinearClassificationModel(LinearModel):
|
||||||
"""
|
"""
|
||||||
.. note:: Experimental
|
.. note:: Experimental
|
||||||
|
|
||||||
Sets the threshold that separates positive predictions from negative
|
Sets the threshold that separates positive predictions from
|
||||||
predictions. An example with prediction score greater than or equal
|
negative predictions. An example with prediction score greater
|
||||||
to this threshold is identified as an positive, and negative otherwise.
|
than or equal to this threshold is identified as a positive,
|
||||||
It is used for binary classification only.
|
and negative otherwise. It is used for binary classification
|
||||||
|
only.
|
||||||
"""
|
"""
|
||||||
self._threshold = value
|
self._threshold = value
|
||||||
|
|
||||||
|
@ -56,8 +57,9 @@ class LinearClassificationModel(LinearModel):
|
||||||
"""
|
"""
|
||||||
.. note:: Experimental
|
.. note:: Experimental
|
||||||
|
|
||||||
Returns the threshold (if any) used for converting raw prediction scores
|
Returns the threshold (if any) used for converting raw
|
||||||
into 0/1 predictions. It is used for binary classification only.
|
prediction scores into 0/1 predictions. It is used for
|
||||||
|
binary classification only.
|
||||||
"""
|
"""
|
||||||
return self._threshold
|
return self._threshold
|
||||||
|
|
||||||
|
@ -65,22 +67,35 @@ class LinearClassificationModel(LinearModel):
|
||||||
"""
|
"""
|
||||||
.. note:: Experimental
|
.. note:: Experimental
|
||||||
|
|
||||||
Clears the threshold so that `predict` will output raw prediction scores.
|
Clears the threshold so that `predict` will output raw
|
||||||
It is used for binary classification only.
|
prediction scores. It is used for binary classification only.
|
||||||
"""
|
"""
|
||||||
self._threshold = None
|
self._threshold = None
|
||||||
|
|
||||||
def predict(self, test):
|
def predict(self, test):
|
||||||
"""
|
"""
|
||||||
Predict values for a single data point or an RDD of points using
|
Predict values for a single data point or an RDD of points
|
||||||
the model trained.
|
using the model trained.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class LogisticRegressionModel(LinearClassificationModel):
|
class LogisticRegressionModel(LinearClassificationModel):
|
||||||
|
|
||||||
"""A linear binary classification model derived from logistic regression.
|
"""
|
||||||
|
Classification model trained using Multinomial/Binary Logistic
|
||||||
|
Regression.
|
||||||
|
|
||||||
|
:param weights: Weights computed for every feature.
|
||||||
|
:param intercept: Intercept computed for this model. (Only used
|
||||||
|
in Binary Logistic Regression. In Multinomial Logistic
|
||||||
|
Regression, the intercepts will not be a single value,
|
||||||
|
so the intercepts will be part of the weights.)
|
||||||
|
:param numFeatures: the dimension of the features.
|
||||||
|
:param numClasses: the number of possible outcomes for k classes
|
||||||
|
classification problem in Multinomial Logistic Regression.
|
||||||
|
By default, it is binary logistic regression so numClasses
|
||||||
|
will be set to 2.
|
||||||
|
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... LabeledPoint(0.0, [0.0, 1.0]),
|
... LabeledPoint(0.0, [0.0, 1.0]),
|
||||||
|
@ -161,8 +176,8 @@ class LogisticRegressionModel(LinearClassificationModel):
|
||||||
|
|
||||||
def predict(self, x):
|
def predict(self, x):
|
||||||
"""
|
"""
|
||||||
Predict values for a single data point or an RDD of points using
|
Predict values for a single data point or an RDD of points
|
||||||
the model trained.
|
using the model trained.
|
||||||
"""
|
"""
|
||||||
if isinstance(x, RDD):
|
if isinstance(x, RDD):
|
||||||
return x.map(lambda v: self.predict(v))
|
return x.map(lambda v: self.predict(v))
|
||||||
|
@ -225,16 +240,19 @@ class LogisticRegressionWithSGD(object):
|
||||||
"""
|
"""
|
||||||
Train a logistic regression model on the given data.
|
Train a logistic regression model on the given data.
|
||||||
|
|
||||||
:param data: The training data, an RDD of LabeledPoint.
|
:param data: The training data, an RDD of
|
||||||
:param iterations: The number of iterations (default: 100).
|
LabeledPoint.
|
||||||
|
:param iterations: The number of iterations
|
||||||
|
(default: 100).
|
||||||
:param step: The step parameter used in SGD
|
:param step: The step parameter used in SGD
|
||||||
(default: 1.0).
|
(default: 1.0).
|
||||||
:param miniBatchFraction: Fraction of data to be used for each SGD
|
:param miniBatchFraction: Fraction of data to be used for each
|
||||||
iteration.
|
SGD iteration (default: 1.0).
|
||||||
:param initialWeights: The initial weights (default: None).
|
:param initialWeights: The initial weights (default: None).
|
||||||
:param regParam: The regularizer parameter (default: 0.01).
|
:param regParam: The regularizer parameter
|
||||||
:param regType: The type of regularizer used for training
|
(default: 0.01).
|
||||||
our model.
|
:param regType: The type of regularizer used for
|
||||||
|
training our model.
|
||||||
|
|
||||||
:Allowed values:
|
:Allowed values:
|
||||||
- "l1" for using L1 regularization
|
- "l1" for using L1 regularization
|
||||||
|
@ -243,13 +261,14 @@ class LogisticRegressionWithSGD(object):
|
||||||
|
|
||||||
(default: "l2")
|
(default: "l2")
|
||||||
|
|
||||||
:param intercept: Boolean parameter which indicates the use
|
:param intercept: Boolean parameter which indicates the
|
||||||
or not of the augmented representation for
|
use or not of the augmented representation
|
||||||
training data (i.e. whether bias features
|
for training data (i.e. whether bias
|
||||||
are activated or not).
|
features are activated or not,
|
||||||
:param validateData: Boolean parameter which indicates if the
|
default: False).
|
||||||
algorithm should validate data before training.
|
:param validateData: Boolean parameter which indicates if
|
||||||
(default: True)
|
the algorithm should validate data
|
||||||
|
before training. (default: True)
|
||||||
"""
|
"""
|
||||||
def train(rdd, i):
|
def train(rdd, i):
|
||||||
return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations),
|
return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations),
|
||||||
|
@ -267,12 +286,15 @@ class LogisticRegressionWithLBFGS(object):
|
||||||
"""
|
"""
|
||||||
Train a logistic regression model on the given data.
|
Train a logistic regression model on the given data.
|
||||||
|
|
||||||
:param data: The training data, an RDD of LabeledPoint.
|
:param data: The training data, an RDD of
|
||||||
:param iterations: The number of iterations (default: 100).
|
LabeledPoint.
|
||||||
|
:param iterations: The number of iterations
|
||||||
|
(default: 100).
|
||||||
:param initialWeights: The initial weights (default: None).
|
:param initialWeights: The initial weights (default: None).
|
||||||
:param regParam: The regularizer parameter (default: 0.01).
|
:param regParam: The regularizer parameter
|
||||||
:param regType: The type of regularizer used for training
|
(default: 0.01).
|
||||||
our model.
|
:param regType: The type of regularizer used for
|
||||||
|
training our model.
|
||||||
|
|
||||||
:Allowed values:
|
:Allowed values:
|
||||||
- "l1" for using L1 regularization
|
- "l1" for using L1 regularization
|
||||||
|
@ -281,19 +303,21 @@ class LogisticRegressionWithLBFGS(object):
|
||||||
|
|
||||||
(default: "l2")
|
(default: "l2")
|
||||||
|
|
||||||
:param intercept: Boolean parameter which indicates the use
|
:param intercept: Boolean parameter which indicates the
|
||||||
or not of the augmented representation for
|
use or not of the augmented representation
|
||||||
training data (i.e. whether bias features
|
for training data (i.e. whether bias
|
||||||
are activated or not).
|
features are activated or not,
|
||||||
:param corrections: The number of corrections used in the LBFGS
|
default: False).
|
||||||
update (default: 10).
|
:param corrections: The number of corrections used in the
|
||||||
:param tolerance: The convergence tolerance of iterations for
|
LBFGS update (default: 10).
|
||||||
L-BFGS (default: 1e-4).
|
:param tolerance: The convergence tolerance of iterations
|
||||||
|
for L-BFGS (default: 1e-4).
|
||||||
:param validateData: Boolean parameter which indicates if the
|
:param validateData: Boolean parameter which indicates if the
|
||||||
algorithm should validate data before training.
|
algorithm should validate data before
|
||||||
(default: True)
|
training. (default: True)
|
||||||
:param numClasses: The number of classes (i.e., outcomes) a label can take
|
:param numClasses: The number of classes (i.e., outcomes) a
|
||||||
in Multinomial Logistic Regression (default: 2).
|
label can take in Multinomial Logistic
|
||||||
|
Regression (default: 2).
|
||||||
|
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... LabeledPoint(0.0, [0.0, 1.0]),
|
... LabeledPoint(0.0, [0.0, 1.0]),
|
||||||
|
@ -323,7 +347,11 @@ class LogisticRegressionWithLBFGS(object):
|
||||||
|
|
||||||
class SVMModel(LinearClassificationModel):
|
class SVMModel(LinearClassificationModel):
|
||||||
|
|
||||||
"""A support vector machine.
|
"""
|
||||||
|
Model for Support Vector Machines (SVMs).
|
||||||
|
|
||||||
|
:param weights: Weights computed for every feature.
|
||||||
|
:param intercept: Intercept computed for this model.
|
||||||
|
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... LabeledPoint(0.0, [0.0]),
|
... LabeledPoint(0.0, [0.0]),
|
||||||
|
@ -370,8 +398,8 @@ class SVMModel(LinearClassificationModel):
|
||||||
|
|
||||||
def predict(self, x):
|
def predict(self, x):
|
||||||
"""
|
"""
|
||||||
Predict values for a single data point or an RDD of points using
|
Predict values for a single data point or an RDD of points
|
||||||
the model trained.
|
using the model trained.
|
||||||
"""
|
"""
|
||||||
if isinstance(x, RDD):
|
if isinstance(x, RDD):
|
||||||
return x.map(lambda v: self.predict(v))
|
return x.map(lambda v: self.predict(v))
|
||||||
|
@ -409,16 +437,19 @@ class SVMWithSGD(object):
|
||||||
"""
|
"""
|
||||||
Train a support vector machine on the given data.
|
Train a support vector machine on the given data.
|
||||||
|
|
||||||
:param data: The training data, an RDD of LabeledPoint.
|
:param data: The training data, an RDD of
|
||||||
:param iterations: The number of iterations (default: 100).
|
LabeledPoint.
|
||||||
|
:param iterations: The number of iterations
|
||||||
|
(default: 100).
|
||||||
:param step: The step parameter used in SGD
|
:param step: The step parameter used in SGD
|
||||||
(default: 1.0).
|
(default: 1.0).
|
||||||
:param regParam: The regularizer parameter (default: 0.01).
|
:param regParam: The regularizer parameter
|
||||||
:param miniBatchFraction: Fraction of data to be used for each SGD
|
(default: 0.01).
|
||||||
iteration.
|
:param miniBatchFraction: Fraction of data to be used for each
|
||||||
|
SGD iteration (default: 1.0).
|
||||||
:param initialWeights: The initial weights (default: None).
|
:param initialWeights: The initial weights (default: None).
|
||||||
:param regType: The type of regularizer used for training
|
:param regType: The type of regularizer used for
|
||||||
our model.
|
training our model.
|
||||||
|
|
||||||
:Allowed values:
|
:Allowed values:
|
||||||
- "l1" for using L1 regularization
|
- "l1" for using L1 regularization
|
||||||
|
@ -427,13 +458,14 @@ class SVMWithSGD(object):
|
||||||
|
|
||||||
(default: "l2")
|
(default: "l2")
|
||||||
|
|
||||||
:param intercept: Boolean parameter which indicates the use
|
:param intercept: Boolean parameter which indicates the
|
||||||
or not of the augmented representation for
|
use or not of the augmented representation
|
||||||
training data (i.e. whether bias features
|
for training data (i.e. whether bias
|
||||||
are activated or not).
|
features are activated or not,
|
||||||
:param validateData: Boolean parameter which indicates if the
|
default: False).
|
||||||
algorithm should validate data before training.
|
:param validateData: Boolean parameter which indicates if
|
||||||
(default: True)
|
the algorithm should validate data
|
||||||
|
before training. (default: True)
|
||||||
"""
|
"""
|
||||||
def train(rdd, i):
|
def train(rdd, i):
|
||||||
return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step),
|
return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step),
|
||||||
|
@ -449,9 +481,11 @@ class NaiveBayesModel(Saveable, Loader):
|
||||||
"""
|
"""
|
||||||
Model for Naive Bayes classifiers.
|
Model for Naive Bayes classifiers.
|
||||||
|
|
||||||
Contains two parameters:
|
:param labels: list of labels.
|
||||||
- pi: vector of logs of class priors (dimension C)
|
:param pi: log of class priors, whose dimension is C,
|
||||||
- theta: matrix of logs of class conditional probabilities (CxD)
|
number of labels.
|
||||||
|
:param theta: log of class conditional probabilities, whose
|
||||||
|
dimension is C-by-D, where D is number of features.
|
||||||
|
|
||||||
>>> data = [
|
>>> data = [
|
||||||
... LabeledPoint(0.0, [0.0, 0.0]),
|
... LabeledPoint(0.0, [0.0, 0.0]),
|
||||||
|
@ -493,7 +527,10 @@ class NaiveBayesModel(Saveable, Loader):
|
||||||
self.theta = theta
|
self.theta = theta
|
||||||
|
|
||||||
def predict(self, x):
|
def predict(self, x):
|
||||||
"""Return the most likely class for a data vector or an RDD of vectors"""
|
"""
|
||||||
|
Return the most likely class for a data vector
|
||||||
|
or an RDD of vectors
|
||||||
|
"""
|
||||||
if isinstance(x, RDD):
|
if isinstance(x, RDD):
|
||||||
return x.map(lambda v: self.predict(v))
|
return x.map(lambda v: self.predict(v))
|
||||||
x = _convert_to_vector(x)
|
x = _convert_to_vector(x)
|
||||||
|
@ -523,16 +560,18 @@ class NaiveBayes(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def train(cls, data, lambda_=1.0):
|
def train(cls, data, lambda_=1.0):
|
||||||
"""
|
"""
|
||||||
Train a Naive Bayes model given an RDD of (label, features) vectors.
|
Train a Naive Bayes model given an RDD of (label, features)
|
||||||
|
vectors.
|
||||||
|
|
||||||
This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
|
This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which
|
||||||
handle all kinds of discrete data. For example, by converting
|
can handle all kinds of discrete data. For example, by
|
||||||
documents into TF-IDF vectors, it can be used for document
|
converting documents into TF-IDF vectors, it can be used for
|
||||||
classification. By making every vector a 0-1 vector, it can also be
|
document classification. By making every vector a 0-1 vector,
|
||||||
used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
|
it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
|
||||||
|
The input feature values must be nonnegative.
|
||||||
|
|
||||||
:param data: RDD of LabeledPoint.
|
:param data: RDD of LabeledPoint.
|
||||||
:param lambda_: The smoothing parameter
|
:param lambda_: The smoothing parameter (default: 1.0).
|
||||||
"""
|
"""
|
||||||
first = data.first()
|
first = data.first()
|
||||||
if not isinstance(first, LabeledPoint):
|
if not isinstance(first, LabeledPoint):
|
||||||
|
|
|
@ -33,7 +33,7 @@ __all__ = ['LabeledPoint', 'LinearModel',
|
||||||
class LabeledPoint(object):
|
class LabeledPoint(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
The features and labels of a data point.
|
Class that represents the features and labels of a data point.
|
||||||
|
|
||||||
:param label: Label for this data point.
|
:param label: Label for this data point.
|
||||||
:param features: Vector of features for this point (NumPy array,
|
:param features: Vector of features for this point (NumPy array,
|
||||||
|
@ -59,7 +59,12 @@ class LabeledPoint(object):
|
||||||
|
|
||||||
class LinearModel(object):
|
class LinearModel(object):
|
||||||
|
|
||||||
"""A linear model that has a vector of coefficients and an intercept."""
|
"""
|
||||||
|
A linear model that has a vector of coefficients and an intercept.
|
||||||
|
|
||||||
|
:param weights: Weights computed for every feature.
|
||||||
|
:param intercept: Intercept computed for this model.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, weights, intercept):
|
def __init__(self, weights, intercept):
|
||||||
self._coeff = _convert_to_vector(weights)
|
self._coeff = _convert_to_vector(weights)
|
||||||
|
@ -193,18 +198,28 @@ class LinearRegressionWithSGD(object):
|
||||||
initialWeights=None, regParam=0.0, regType=None, intercept=False,
|
initialWeights=None, regParam=0.0, regType=None, intercept=False,
|
||||||
validateData=True):
|
validateData=True):
|
||||||
"""
|
"""
|
||||||
Train a linear regression model on the given data.
|
Train a linear regression model using Stochastic Gradient
|
||||||
|
Descent (SGD).
|
||||||
|
This solves the least squares regression formulation
|
||||||
|
f(weights) = 1/n ||A weights-y||^2^
|
||||||
|
(which is the mean squared error).
|
||||||
|
Here the data matrix has n rows, and the input RDD holds the
|
||||||
|
set of rows of A, each with its corresponding right hand side
|
||||||
|
label y. See also the documentation for the precise formulation.
|
||||||
|
|
||||||
:param data: The training data.
|
:param data: The training data, an RDD of
|
||||||
:param iterations: The number of iterations (default: 100).
|
LabeledPoint.
|
||||||
|
:param iterations: The number of iterations
|
||||||
|
(default: 100).
|
||||||
:param step: The step parameter used in SGD
|
:param step: The step parameter used in SGD
|
||||||
(default: 1.0).
|
(default: 1.0).
|
||||||
:param miniBatchFraction: Fraction of data to be used for each SGD
|
:param miniBatchFraction: Fraction of data to be used for each
|
||||||
iteration.
|
SGD iteration (default: 1.0).
|
||||||
:param initialWeights: The initial weights (default: None).
|
:param initialWeights: The initial weights (default: None).
|
||||||
:param regParam: The regularizer parameter (default: 0.0).
|
:param regParam: The regularizer parameter
|
||||||
:param regType: The type of regularizer used for training
|
(default: 0.0).
|
||||||
our model.
|
:param regType: The type of regularizer used for
|
||||||
|
training our model.
|
||||||
|
|
||||||
:Allowed values:
|
:Allowed values:
|
||||||
- "l1" for using L1 regularization (lasso),
|
- "l1" for using L1 regularization (lasso),
|
||||||
|
@ -213,13 +228,14 @@ class LinearRegressionWithSGD(object):
|
||||||
|
|
||||||
(default: None)
|
(default: None)
|
||||||
|
|
||||||
:param intercept: Boolean parameter which indicates the use
|
:param intercept: Boolean parameter which indicates the
|
||||||
or not of the augmented representation for
|
use or not of the augmented representation
|
||||||
training data (i.e. whether bias features
|
for training data (i.e. whether bias
|
||||||
are activated or not). (default: False)
|
features are activated or not,
|
||||||
:param validateData: Boolean parameter which indicates if the
|
default: False).
|
||||||
algorithm should validate data before training.
|
:param validateData: Boolean parameter which indicates if
|
||||||
(default: True)
|
the algorithm should validate data
|
||||||
|
before training. (default: True)
|
||||||
"""
|
"""
|
||||||
def train(rdd, i):
|
def train(rdd, i):
|
||||||
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
|
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
|
||||||
|
@ -232,8 +248,8 @@ class LinearRegressionWithSGD(object):
|
||||||
@inherit_doc
|
@inherit_doc
|
||||||
class LassoModel(LinearRegressionModelBase):
|
class LassoModel(LinearRegressionModelBase):
|
||||||
|
|
||||||
"""A linear regression model derived from a least-squares fit with an
|
"""A linear regression model derived from a least-squares fit with
|
||||||
l_1 penalty term.
|
an l_1 penalty term.
|
||||||
|
|
||||||
>>> from pyspark.mllib.regression import LabeledPoint
|
>>> from pyspark.mllib.regression import LabeledPoint
|
||||||
>>> data = [
|
>>> data = [
|
||||||
|
@ -304,7 +320,36 @@ class LassoWithSGD(object):
|
||||||
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
|
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
|
||||||
miniBatchFraction=1.0, initialWeights=None, intercept=False,
|
miniBatchFraction=1.0, initialWeights=None, intercept=False,
|
||||||
validateData=True):
|
validateData=True):
|
||||||
"""Train a Lasso regression model on the given data."""
|
"""
|
||||||
|
Train a regression model with L1-regularization using
|
||||||
|
Stochastic Gradient Descent.
|
||||||
|
This solves the l1-regularized least squares regression
|
||||||
|
formulation
|
||||||
|
f(weights) = 1/2n ||A weights-y||^2^ + regParam ||weights||_1
|
||||||
|
Here the data matrix has n rows, and the input RDD holds the
|
||||||
|
set of rows of A, each with its corresponding right hand side
|
||||||
|
label y. See also the documentation for the precise formulation.
|
||||||
|
|
||||||
|
:param data: The training data, an RDD of
|
||||||
|
LabeledPoint.
|
||||||
|
:param iterations: The number of iterations
|
||||||
|
(default: 100).
|
||||||
|
:param step: The step parameter used in SGD
|
||||||
|
(default: 1.0).
|
||||||
|
:param regParam: The regularizer parameter
|
||||||
|
(default: 0.01).
|
||||||
|
:param miniBatchFraction: Fraction of data to be used for each
|
||||||
|
SGD iteration (default: 1.0).
|
||||||
|
:param initialWeights: The initial weights (default: None).
|
||||||
|
:param intercept: Boolean parameter which indicates the
|
||||||
|
use or not of the augmented representation
|
||||||
|
for training data (i.e. whether bias
|
||||||
|
features are activated or not,
|
||||||
|
default: False).
|
||||||
|
:param validateData: Boolean parameter which indicates if
|
||||||
|
the algorithm should validate data
|
||||||
|
before training. (default: True)
|
||||||
|
"""
|
||||||
def train(rdd, i):
|
def train(rdd, i):
|
||||||
return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
|
return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
|
||||||
float(regParam), float(miniBatchFraction), i, bool(intercept),
|
float(regParam), float(miniBatchFraction), i, bool(intercept),
|
||||||
|
@ -316,8 +361,8 @@ class LassoWithSGD(object):
|
||||||
@inherit_doc
|
@inherit_doc
|
||||||
class RidgeRegressionModel(LinearRegressionModelBase):
|
class RidgeRegressionModel(LinearRegressionModelBase):
|
||||||
|
|
||||||
"""A linear regression model derived from a least-squares fit with an
|
"""A linear regression model derived from a least-squares fit with
|
||||||
l_2 penalty term.
|
an l_2 penalty term.
|
||||||
|
|
||||||
>>> from pyspark.mllib.regression import LabeledPoint
|
>>> from pyspark.mllib.regression import LabeledPoint
|
||||||
>>> data = [
|
>>> data = [
|
||||||
|
@ -389,7 +434,36 @@ class RidgeRegressionWithSGD(object):
|
||||||
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
|
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
|
||||||
miniBatchFraction=1.0, initialWeights=None, intercept=False,
|
miniBatchFraction=1.0, initialWeights=None, intercept=False,
|
||||||
validateData=True):
|
validateData=True):
|
||||||
"""Train a ridge regression model on the given data."""
|
"""
|
||||||
|
Train a regression model with L2-regularization using
|
||||||
|
Stochastic Gradient Descent.
|
||||||
|
This solves the l2-regularized least squares regression
|
||||||
|
formulation
|
||||||
|
f(weights) = 1/2n ||A weights-y||^2^ + regParam/2 ||weights||^2^
|
||||||
|
Here the data matrix has n rows, and the input RDD holds the
|
||||||
|
set of rows of A, each with its corresponding right hand side
|
||||||
|
label y. See also the documentation for the precise formulation.
|
||||||
|
|
||||||
|
:param data: The training data, an RDD of
|
||||||
|
LabeledPoint.
|
||||||
|
:param iterations: The number of iterations
|
||||||
|
(default: 100).
|
||||||
|
:param step: The step parameter used in SGD
|
||||||
|
(default: 1.0).
|
||||||
|
:param regParam: The regularizer parameter
|
||||||
|
(default: 0.01).
|
||||||
|
:param miniBatchFraction: Fraction of data to be used for each
|
||||||
|
SGD iteration (default: 1.0).
|
||||||
|
:param initialWeights: The initial weights (default: None).
|
||||||
|
:param intercept: Boolean parameter which indicates the
|
||||||
|
use or not of the augmented representation
|
||||||
|
for training data (i.e. whether bias
|
||||||
|
features are activated or not,
|
||||||
|
default: False).
|
||||||
|
:param validateData: Boolean parameter which indicates if
|
||||||
|
the algorithm should validate data
|
||||||
|
before training. (default: True)
|
||||||
|
"""
|
||||||
def train(rdd, i):
|
def train(rdd, i):
|
||||||
return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
|
return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
|
||||||
float(regParam), float(miniBatchFraction), i, bool(intercept),
|
float(regParam), float(miniBatchFraction), i, bool(intercept),
|
||||||
|
@ -400,7 +474,15 @@ class RidgeRegressionWithSGD(object):
|
||||||
|
|
||||||
class IsotonicRegressionModel(Saveable, Loader):
|
class IsotonicRegressionModel(Saveable, Loader):
|
||||||
|
|
||||||
"""Regression model for isotonic regression.
|
"""
|
||||||
|
Regression model for isotonic regression.
|
||||||
|
|
||||||
|
:param boundaries: Array of boundaries for which predictions are
|
||||||
|
known. Boundaries must be sorted in increasing order.
|
||||||
|
:param predictions: Array of predictions associated to the
|
||||||
|
boundaries at the same index. Results of isotonic
|
||||||
|
regression and therefore monotone.
|
||||||
|
:param isotonic: indicates whether this is isotonic or antitonic.
|
||||||
|
|
||||||
>>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
|
>>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
|
||||||
>>> irm = IsotonicRegression.train(sc.parallelize(data))
|
>>> irm = IsotonicRegression.train(sc.parallelize(data))
|
||||||
|
@ -430,6 +512,25 @@ class IsotonicRegressionModel(Saveable, Loader):
|
||||||
self.isotonic = isotonic
|
self.isotonic = isotonic
|
||||||
|
|
||||||
def predict(self, x):
|
def predict(self, x):
|
||||||
|
"""
|
||||||
|
Predict labels for provided features.
|
||||||
|
Using a piecewise linear function.
|
||||||
|
1) If x exactly matches a boundary then associated prediction
|
||||||
|
is returned. In case there are multiple predictions with the
|
||||||
|
same boundary then one of them is returned. Which one is
|
||||||
|
undefined (same as java.util.Arrays.binarySearch).
|
||||||
|
2) If x is lower or higher than all boundaries then first or
|
||||||
|
last prediction is returned respectively. In case there are
|
||||||
|
multiple predictions with the same boundary then the lowest
|
||||||
|
or highest is returned respectively.
|
||||||
|
3) If x falls between two values in boundary array then
|
||||||
|
prediction is treated as piecewise linear function and
|
||||||
|
interpolated value is returned. In case there are multiple
|
||||||
|
values with the same boundary then the same rules as in 2)
|
||||||
|
are used.
|
||||||
|
|
||||||
|
:param x: Feature or RDD of Features to be labeled.
|
||||||
|
"""
|
||||||
if isinstance(x, RDD):
|
if isinstance(x, RDD):
|
||||||
return x.map(lambda v: self.predict(v))
|
return x.map(lambda v: self.predict(v))
|
||||||
return np.interp(x, self.boundaries, self.predictions)
|
return np.interp(x, self.boundaries, self.predictions)
|
||||||
|
@ -451,15 +552,15 @@ class IsotonicRegressionModel(Saveable, Loader):
|
||||||
|
|
||||||
|
|
||||||
class IsotonicRegression(object):
|
class IsotonicRegression(object):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def train(cls, data, isotonic=True):
|
||||||
"""
|
"""
|
||||||
Run IsotonicRegression algorithm to obtain isotonic regression model.
|
Train an isotonic regression model on the given data.
|
||||||
|
|
||||||
:param data: RDD of (label, feature, weight) tuples.
|
:param data: RDD of (label, feature, weight) tuples.
|
||||||
:param isotonic: Whether this is isotonic or antitonic.
|
:param isotonic: Whether this is isotonic or antitonic.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
|
||||||
def train(cls, data, isotonic=True):
|
|
||||||
"""Train a isotonic regression model on the given data."""
|
|
||||||
boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
|
boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
|
||||||
data.map(_convert_to_vector), bool(isotonic))
|
data.map(_convert_to_vector), bool(isotonic))
|
||||||
return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)
|
return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)
|
||||||
|
|
Loading…
Reference in a new issue