2015-01-28 20:14:23 -05:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
2020-12-03 19:35:50 -05:00
|
|
|
import os
|
2016-04-15 15:58:38 -04:00
|
|
|
import operator
|
2020-08-30 22:23:31 -04:00
|
|
|
import sys
|
|
|
|
import uuid
|
2020-07-13 22:22:44 -04:00
|
|
|
import warnings
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
from abc import ABCMeta, abstractmethod, abstractproperty
|
2017-09-12 13:02:27 -04:00
|
|
|
from multiprocessing.pool import ThreadPool
|
2015-11-02 19:12:04 -05:00
|
|
|
|
2020-08-30 22:23:31 -04:00
|
|
|
from pyspark import keyword_only, since, SparkContext
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
from pyspark.ml import Estimator, Predictor, PredictionModel, Model
|
2020-08-30 22:23:31 -04:00
|
|
|
from pyspark.ml.param.shared import HasRawPredictionCol, HasProbabilityCol, HasThresholds, \
|
|
|
|
HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, \
|
2020-11-12 06:14:07 -05:00
|
|
|
HasAggregationDepth, HasThreshold, HasBlockSize, HasMaxBlockSizeInMB, Param, Params, \
|
|
|
|
TypeConverters, HasElasticNetParam, HasSeed, HasStepSize, HasSolver, HasParallelism
|
2019-10-12 10:13:50 -04:00
|
|
|
from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
|
|
|
|
_TreeEnsembleModel, _RandomForestParams, _GBTParams, \
|
2020-07-13 22:22:44 -04:00
|
|
|
_HasVarianceImpurity, _TreeClassifierParams
|
2019-12-30 23:56:19 -05:00
|
|
|
from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
from pyspark.ml.base import _PredictorParams
|
2020-12-03 19:35:50 -05:00
|
|
|
from pyspark.ml.util import DefaultParamsReader, DefaultParamsWriter, \
|
|
|
|
JavaMLReadable, JavaMLReader, JavaMLWritable, JavaMLWriter, \
|
|
|
|
MLReader, MLReadable, MLWriter, MLWritable, HasTrainingSummary
|
2020-07-13 22:22:44 -04:00
|
|
|
from pyspark.ml.wrapper import JavaParams, \
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
JavaPredictor, JavaPredictionModel, JavaWrapper
|
[SPARK-33592] Fix: Pyspark ML Validator params in estimatorParamMaps may be lost after saving and reloading
### What changes were proposed in this pull request?
Fix: Pyspark ML Validator params in estimatorParamMaps may be lost after saving and reloading
When saving validator estimatorParamMaps, will check all nested stages in tuned estimator to get correct param parent.
Two typical cases to manually test:
~~~python
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
paramGrid = ParamGridBuilder() \
.addGrid(hashingTF.numFeatures, [10, 100]) \
.addGrid(lr.maxIter, [100, 200]) \
.build()
tvs = TrainValidationSplit(estimator=pipeline,
estimatorParamMaps=paramGrid,
evaluator=MulticlassClassificationEvaluator())
tvs.save(tvsPath)
loadedTvs = TrainValidationSplit.load(tvsPath)
# check `loadedTvs.getEstimatorParamMaps()` restored correctly.
~~~
~~~python
lr = LogisticRegression()
ova = OneVsRest(classifier=lr)
grid = ParamGridBuilder().addGrid(lr.maxIter, [100, 200]).build()
evaluator = MulticlassClassificationEvaluator()
tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
tvs.save(tvsPath)
loadedTvs = TrainValidationSplit.load(tvsPath)
# check `loadedTvs.getEstimatorParamMaps()` restored correctly.
~~~
### Why are the changes needed?
Bug fix.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Unit test.
Closes #30539 from WeichenXu123/fix_tuning_param_maps_io.
Authored-by: Weichen Xu <weichen.xu@databricks.com>
Signed-off-by: Ruifeng Zheng <ruifengz@foxmail.com>
2020-11-30 20:36:42 -05:00
|
|
|
from pyspark.ml.common import inherit_doc
|
2021-04-21 04:29:10 -04:00
|
|
|
from pyspark.ml.linalg import Vectors, VectorUDT
|
2016-04-06 15:07:47 -04:00
|
|
|
from pyspark.sql import DataFrame
|
2016-04-15 15:58:38 -04:00
|
|
|
from pyspark.sql.functions import udf, when
|
|
|
|
from pyspark.sql.types import ArrayType, DoubleType
|
|
|
|
from pyspark.storagelevel import StorageLevel
|
2015-01-28 20:14:23 -05:00
|
|
|
|
2017-01-27 19:03:53 -05:00
|
|
|
# Public API of this module: estimators, their fitted models, and the
# associated training/evaluation summary classes.
__all__ = [
    'LinearSVC', 'LinearSVCModel',
    'LinearSVCSummary', 'LinearSVCTrainingSummary',
    'LogisticRegression', 'LogisticRegressionModel',
    'LogisticRegressionSummary', 'LogisticRegressionTrainingSummary',
    'BinaryLogisticRegressionSummary', 'BinaryLogisticRegressionTrainingSummary',
    'DecisionTreeClassifier', 'DecisionTreeClassificationModel',
    'GBTClassifier', 'GBTClassificationModel',
    'RandomForestClassifier', 'RandomForestClassificationModel',
    'RandomForestClassificationSummary', 'RandomForestClassificationTrainingSummary',
    'BinaryRandomForestClassificationSummary',
    'BinaryRandomForestClassificationTrainingSummary',
    'NaiveBayes', 'NaiveBayesModel',
    'MultilayerPerceptronClassifier', 'MultilayerPerceptronClassificationModel',
    'MultilayerPerceptronClassificationSummary',
    'MultilayerPerceptronClassificationTrainingSummary',
    'OneVsRest', 'OneVsRestModel',
    'FMClassifier', 'FMClassificationModel', 'FMClassificationSummary',
    'FMClassificationTrainingSummary',
]
|
2015-01-28 20:14:23 -05:00
|
|
|
|
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _ClassifierParams(HasRawPredictionCol, _PredictorParams):
    """
    Shared params for classification tasks: the common predictor params
    (label, features, prediction columns) plus ``rawPredictionCol``.

    .. versionadded:: 3.0.0
    """
    pass
|
|
|
|
|
|
|
|
|
2016-08-22 06:21:22 -04:00
|
|
|
@inherit_doc
|
2020-09-16 07:22:11 -04:00
|
|
|
class Classifier(Predictor, _ClassifierParams, metaclass=ABCMeta):
|
2016-08-22 06:21:22 -04:00
|
|
|
"""
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
Classifier for classification tasks.
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
Classes are indexed {0, 1, ..., numClasses - 1}.
|
|
|
|
"""
|
|
|
|
|
|
|
|
@since("3.0.0")
|
|
|
|
def setRawPredictionCol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`rawPredictionCol`.
|
|
|
|
"""
|
|
|
|
return self._set(rawPredictionCol=value)
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class ClassificationModel(PredictionModel, _ClassifierParams, metaclass=ABCMeta):
    """
    Model produced by a ``Classifier``.

    Classes are indexed {0, 1, ..., numClasses - 1}.
    """

    @since("3.0.0")
    def setRawPredictionCol(self, value):
        """
        Sets the value of :py:attr:`rawPredictionCol`.
        """
        return self._set(rawPredictionCol=value)

    # NOTE: ``@property`` + ``@abstractmethod`` replaces the deprecated
    # ``abc.abstractproperty`` (deprecated since Python 3.3); the stacked
    # form is behaviorally equivalent and is the documented modern idiom.
    @property
    @abstractmethod
    @since("2.1.0")
    def numClasses(self):
        """
        Number of classes (values which the label can take).
        """
        # Abstract: concrete models must override with the real class count.
        raise NotImplementedError()

    @abstractmethod
    @since("3.0.0")
    def predictRaw(self, value):
        """
        Raw prediction for each possible label.
        """
        # Abstract: subclasses compute the raw (pre-threshold) scores.
        raise NotImplementedError()
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extension points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally, "private" `Java*` classes in `ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _ProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, _ClassifierParams):
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
"""
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
Params for :py:class:`ProbabilisticClassifier` and
|
|
|
|
:py:class:`ProbabilisticClassificationModel`.
|
2019-10-14 11:52:23 -04:00
|
|
|
|
|
|
|
.. versionadded:: 3.0.0
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
2017-01-27 19:03:53 -05:00
|
|
|
@inherit_doc
class ProbabilisticClassifier(Classifier, _ProbabilisticClassifierParams,
                              metaclass=ABCMeta):
    """
    Probabilistic Classifier for classification tasks.
    """

    # Both setters simply delegate to ``Params._set``; no extra validation
    # happens here beyond what the param machinery itself performs.

    @since("3.0.0")
    def setThresholds(self, value):
        """
        Sets the value of :py:attr:`thresholds`.
        """
        return self._set(thresholds=value)

    @since("3.0.0")
    def setProbabilityCol(self, value):
        """
        Sets the value of :py:attr:`probabilityCol`.
        """
        return self._set(probabilityCol=value)
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class ProbabilisticClassificationModel(ClassificationModel,
                                       _ProbabilisticClassifierParams,
                                       metaclass=ABCMeta):
    """
    Model produced by a ``ProbabilisticClassifier``.
    """

    @since("3.0.0")
    def setThresholds(self, value):
        """
        Sets the value of :py:attr:`thresholds`.
        """
        return self._set(thresholds=value)

    @since("3.0.0")
    def setProbabilityCol(self, value):
        """
        Sets the value of :py:attr:`probabilityCol`.
        """
        return self._set(probabilityCol=value)

    # Abstract: concrete models (JVM-backed or pure Python) must supply the
    # actual per-class probability computation.
    @abstractmethod
    @since("3.0.0")
    def predictProbability(self, value):
        """
        Predict the probability of each class given the features.
        """
        raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class _JavaClassifier(Classifier, JavaPredictor, metaclass=ABCMeta):
    """
    Java Classifier for classification tasks.
    Classes are indexed {0, 1, ..., numClasses - 1}.
    """

    @since("3.0.0")
    def setRawPredictionCol(self, value):
        """
        Sets the value of :py:attr:`rawPredictionCol`.
        """
        # Delegates to the shared param machinery; the JVM side picks the
        # value up when the estimator is transferred.
        return self._set(rawPredictionCol=value)
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class _JavaClassificationModel(ClassificationModel, JavaPredictionModel):
    """
    Java Model produced by a ``Classifier``.
    Classes are indexed {0, 1, ..., numClasses - 1}.
    To be mixed in with :class:`pyspark.ml.JavaModel`
    """

    # Both members below forward straight to the wrapped JVM model via
    # ``_call_java``; no computation happens on the Python side.

    @since("3.0.0")
    def predictRaw(self, value):
        """
        Raw prediction for each possible label.
        """
        return self._call_java("predictRaw", value)

    @property
    @since("2.1.0")
    def numClasses(self):
        """
        Number of classes (values which the label can take).
        """
        return self._call_java("numClasses")
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class _JavaProbabilisticClassifier(ProbabilisticClassifier, _JavaClassifier,
                                   metaclass=ABCMeta):
    """
    Abstract Java-backed probabilistic classifier for classification tasks.

    Combines the public :py:class:`ProbabilisticClassifier` interface with the
    JVM-delegating :py:class:`_JavaClassifier` machinery; adds no members of
    its own.
    """
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class _JavaProbabilisticClassificationModel(ProbabilisticClassificationModel,
                                            _JavaClassificationModel):
    """
    Java-backed model produced by a ``ProbabilisticClassifier``.
    """

    @since("3.0.0")
    def predictProbability(self, value):
        """
        Predict the probability of each class given the features,
        delegating to the wrapped Java model.
        """
        return self._call_java("predictProbability", value)
|
|
|
|
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala:
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
|
2020-06-20 09:43:28 -04:00
|
|
|
@inherit_doc
class _ClassificationSummary(JavaWrapper):
    """
    Abstraction over multiclass classification results for a given model.

    Every accessor simply forwards to the corresponding method of the
    underlying Java summary object.

    .. versionadded:: 3.1.0
    """

    @property
    @since("3.1.0")
    def predictions(self):
        """
        DataFrame produced by the model's `transform` method.
        """
        return self._call_java("predictions")

    @property
    @since("3.1.0")
    def predictionCol(self):
        """
        Name of the field in "predictions" holding the predicted class of
        each instance.
        """
        return self._call_java("predictionCol")

    @property
    @since("3.1.0")
    def labelCol(self):
        """
        Name of the field in "predictions" holding the true label of each
        instance.
        """
        return self._call_java("labelCol")

    @property
    @since("3.1.0")
    def weightCol(self):
        """
        Name of the field in "predictions" holding the weight of each
        instance.
        """
        return self._call_java("weightCol")

    @property
    def labels(self):
        """
        The sequence of labels in ascending order. This order matches the
        order used in metrics which are specified as arrays over labels,
        e.g., truePositiveRateByLabel.

        .. versionadded:: 3.1.0

        Notes
        -----
        In most cases this will be the values {0.0, 1.0, ..., numClasses-1}.
        However, if the training set is missing a label, then all of the
        arrays over labels (e.g., from truePositiveRateByLabel) will have
        length numClasses-1 instead of the expected numClasses.
        """
        return self._call_java("labels")

    @property
    @since("3.1.0")
    def truePositiveRateByLabel(self):
        """
        True positive rate for each label (category).
        """
        return self._call_java("truePositiveRateByLabel")

    @property
    @since("3.1.0")
    def falsePositiveRateByLabel(self):
        """
        False positive rate for each label (category).
        """
        return self._call_java("falsePositiveRateByLabel")

    @property
    @since("3.1.0")
    def precisionByLabel(self):
        """
        Precision for each label (category).
        """
        return self._call_java("precisionByLabel")

    @property
    @since("3.1.0")
    def recallByLabel(self):
        """
        Recall for each label (category).
        """
        return self._call_java("recallByLabel")

    @since("3.1.0")
    def fMeasureByLabel(self, beta=1.0):
        """
        F-measure for each label (category), computed for the given `beta`.
        """
        return self._call_java("fMeasureByLabel", beta)

    @property
    @since("3.1.0")
    def accuracy(self):
        """
        Accuracy: the fraction of instances that are classified correctly
        (number of correct predictions divided by the total number of
        instances).
        """
        return self._call_java("accuracy")

    @property
    @since("3.1.0")
    def weightedTruePositiveRate(self):
        """
        Weighted true positive rate.
        (equals to precision, recall and f-measure)
        """
        return self._call_java("weightedTruePositiveRate")

    @property
    @since("3.1.0")
    def weightedFalsePositiveRate(self):
        """
        Weighted false positive rate.
        """
        return self._call_java("weightedFalsePositiveRate")

    @property
    @since("3.1.0")
    def weightedRecall(self):
        """
        Weighted averaged recall.
        (equals to precision, recall and f-measure)
        """
        return self._call_java("weightedRecall")

    @property
    @since("3.1.0")
    def weightedPrecision(self):
        """
        Weighted averaged precision.
        """
        return self._call_java("weightedPrecision")

    @since("3.1.0")
    def weightedFMeasure(self, beta=1.0):
        """
        Weighted averaged f-measure, computed for the given `beta`.
        """
        return self._call_java("weightedFMeasure", beta)
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class _TrainingSummary(JavaWrapper):
    """
    Abstraction over training results, forwarding to the underlying Java
    summary object.

    .. versionadded:: 3.1.0
    """

    @property
    @since("3.1.0")
    def objectiveHistory(self):
        """
        Objective function (scaled loss + regularization) at each iteration.
        Contains one element more than the number of iterations: the
        initial state is included.
        """
        return self._call_java("objectiveHistory")

    @property
    @since("3.1.0")
    def totalIterations(self):
        """
        Number of training iterations until termination.
        """
        return self._call_java("totalIterations")
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class _BinaryClassificationSummary(_ClassificationSummary):
    """
    Binary classification results for a given model, forwarding to the
    underlying Java summary object.

    .. versionadded:: 3.1.0
    """

    @property
    @since("3.1.0")
    def scoreCol(self):
        """
        Name of the field in "predictions" holding the probability or the
        raw prediction of each class as a vector.
        """
        return self._call_java("scoreCol")

    @property
    def roc(self):
        """
        The receiver operating characteristic (ROC) curve: a DataFrame with
        two fields (FPR, TPR), with (0.0, 0.0) prepended and (1.0, 1.0)
        appended to it.

        .. versionadded:: 3.1.0

        Notes
        -----
        `Wikipedia reference <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
        """
        return self._call_java("roc")

    @property
    @since("3.1.0")
    def areaUnderROC(self):
        """
        Area under the receiver operating characteristic (ROC) curve.
        """
        return self._call_java("areaUnderROC")

    @property
    @since("3.1.0")
    def pr(self):
        """
        The precision-recall curve: a DataFrame with two fields (recall,
        precision), with (0.0, 1.0) prepended to it.
        """
        return self._call_java("pr")

    @property
    @since("3.1.0")
    def fMeasureByThreshold(self):
        """
        DataFrame with two fields (threshold, F-Measure) forming a curve
        with beta = 1.0.
        """
        return self._call_java("fMeasureByThreshold")

    @property
    @since("3.1.0")
    def precisionByThreshold(self):
        """
        DataFrame with two fields (threshold, precision) forming a curve.
        Every probability obtained in transforming the dataset is used as a
        threshold when calculating the precision.
        """
        return self._call_java("precisionByThreshold")

    @property
    @since("3.1.0")
    def recallByThreshold(self):
        """
        DataFrame with two fields (threshold, recall) forming a curve.
        Every probability obtained in transforming the dataset is used as a
        threshold when calculating the recall.
        """
        return self._call_java("recallByThreshold")
|
|
|
|
|
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitIntercept, HasTol,
                       HasStandardization, HasWeightCol, HasAggregationDepth, HasThreshold,
                       HasMaxBlockSizeInMB):
    """
    Params shared by :py:class:`LinearSVC` and :py:class:`LinearSVCModel`.

    .. versionadded:: 3.0.0
    """

    # Decision threshold applied to the linear model's raw prediction.
    threshold = Param(
        Params._dummy(), "threshold",
        "The threshold in binary classification applied to the linear model"
        " prediction. This threshold can be any real number, where Inf will make"
        " all predictions 0.0 and -Inf will make all predictions 1.0.",
        typeConverter=TypeConverters.toFloat)

    def __init__(self, *args):
        super().__init__(*args)
        # Defaults mirror the Scala-side LinearSVC implementation.
        self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True,
                         standardization=True, threshold=0.0, aggregationDepth=2,
                         maxBlockSizeInMB=0.0)
|
2020-07-16 14:12:29 -04:00
|
|
|
|
2019-10-18 05:26:54 -04:00
|
|
|
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class LinearSVC(_JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadable):
|
2017-01-27 19:03:53 -05:00
|
|
|
"""
|
|
|
|
This binary classifier optimizes the Hinge Loss using the OWLQN optimizer.
|
2017-05-16 00:21:54 -04:00
|
|
|
Only supports L2 regularization currently.
|
2017-01-27 19:03:53 -05:00
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 2.2.0
|
|
|
|
|
|
|
|
Notes
|
|
|
|
-----
|
|
|
|
`Linear SVM Classifier <https://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM>`_
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> from pyspark.sql import Row
|
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
|
|
|
>>> df = sc.parallelize([
|
|
|
|
... Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
|
|
|
|
... Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()
|
2019-10-27 23:36:10 -04:00
|
|
|
>>> svm = LinearSVC()
|
|
|
|
>>> svm.getMaxIter()
|
|
|
|
100
|
|
|
|
>>> svm.setMaxIter(5)
|
|
|
|
LinearSVC...
|
|
|
|
>>> svm.getMaxIter()
|
|
|
|
5
|
|
|
|
>>> svm.getRegParam()
|
|
|
|
0.0
|
|
|
|
>>> svm.setRegParam(0.01)
|
|
|
|
LinearSVC...
|
|
|
|
>>> svm.getRegParam()
|
|
|
|
0.01
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> model = svm.fit(df)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.setPredictionCol("newPrediction")
|
[SPARK-29867][ML][PYTHON] Add __repr__ in Python ML Models
### What changes were proposed in this pull request?
Add ```__repr__``` in Python ML Models
### Why are the changes needed?
In Python ML Models, some of them have ```__repr__```, others don't. In the doctest, when calling Model.setXXX, some of the Models print out the xxxModel... correctly, some of them can't because of lacking the ```__repr__``` method. For example:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixture...
```
After the change, the above code will become the following:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixtureModel...
```
### Does this PR introduce any user-facing change?
Yes.
### How was this patch tested?
doctest
Closes #26489 from huaxingao/spark-29876.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-11-16 00:44:39 -05:00
|
|
|
LinearSVCModel...
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.getPredictionCol()
|
|
|
|
'newPrediction'
|
2019-10-18 05:26:54 -04:00
|
|
|
>>> model.setThreshold(0.5)
|
[SPARK-29867][ML][PYTHON] Add __repr__ in Python ML Models
### What changes were proposed in this pull request?
Add ```__repr__``` in Python ML Models
### Why are the changes needed?
In Python ML Models, some of them have ```__repr__```, others don't. In the doctest, when calling Model.setXXX, some of the Models print out the xxxModel... correctly, some of them can't because of lacking the ```__repr__``` method. For example:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixture...
```
After the change, the above code will become the following:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixtureModel...
```
### Does this PR introduce any user-facing change?
Yes.
### How was this patch tested?
doctest
Closes #26489 from huaxingao/spark-29876.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-11-16 00:44:39 -05:00
|
|
|
LinearSVCModel...
|
2019-10-18 05:26:54 -04:00
|
|
|
>>> model.getThreshold()
|
|
|
|
0.5
|
2020-11-12 06:14:07 -05:00
|
|
|
>>> model.getMaxBlockSizeInMB()
|
|
|
|
0.0
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> model.coefficients
|
2021-04-25 01:16:46 -04:00
|
|
|
DenseVector([0.0, -1.0319, -0.5159])
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> model.intercept
|
2021-04-25 01:16:46 -04:00
|
|
|
2.579645978780695
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> model.numClasses
|
|
|
|
2
|
|
|
|
>>> model.numFeatures
|
|
|
|
3
|
|
|
|
>>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, -1.0, -1.0))]).toDF()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.predict(test0.head().features)
|
|
|
|
1.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(test0.head().features)
|
2021-04-25 01:16:46 -04:00
|
|
|
DenseVector([-4.1274, 4.1274])
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> result = model.transform(test0).head()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> result.newPrediction
|
2017-01-27 19:03:53 -05:00
|
|
|
1.0
|
|
|
|
>>> result.rawPrediction
|
2021-04-25 01:16:46 -04:00
|
|
|
DenseVector([-4.1274, 4.1274])
|
2017-01-27 19:03:53 -05:00
|
|
|
>>> svm_path = temp_path + "/svm"
|
|
|
|
>>> svm.save(svm_path)
|
|
|
|
>>> svm2 = LinearSVC.load(svm_path)
|
|
|
|
>>> svm2.getMaxIter()
|
|
|
|
5
|
|
|
|
>>> model_path = temp_path + "/svm_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = LinearSVCModel.load(model_path)
|
|
|
|
>>> model.coefficients[0] == model2.coefficients[0]
|
|
|
|
True
|
|
|
|
>>> model.intercept == model2.intercept
|
|
|
|
True
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2017-01-27 19:03:53 -05:00
|
|
|
"""
|
|
|
|
|
|
|
|
@keyword_only
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
             maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction",
             fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
             aggregationDepth=2, maxBlockSizeInMB=0.0):
    """
    __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
             maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \
             fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
             aggregationDepth=2, maxBlockSizeInMB=0.0):
    """
    super(LinearSVC, self).__init__()
    # Create the JVM-side estimator that this Python wrapper delegates to.
    self._java_obj = self._new_java_obj(
        "org.apache.spark.ml.classification.LinearSVC", self.uid)
    # ``keyword_only`` records the caller's explicit keyword arguments in
    # ``_input_kwargs``; forward only those so unset params keep their defaults.
    explicit_params = self._input_kwargs
    self.setParams(**explicit_params)
|
|
|
|
|
|
|
|
@keyword_only
@since("2.2.0")
def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
              maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction",
              fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
              aggregationDepth=2, maxBlockSizeInMB=0.0):
    """
    setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
              maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \
              fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
              aggregationDepth=2, maxBlockSizeInMB=0.0):
    Sets params for Linear SVM Classifier.
    """
    # Only the keyword arguments the caller actually supplied (captured by
    # ``keyword_only``) are applied, leaving the rest at their current values.
    supplied = self._input_kwargs
    return self._set(**supplied)
|
|
|
|
|
|
|
|
def _create_model(self, java_model):
    # Wrap the fitted JVM model object in its Python companion class.
    return LinearSVCModel(java_model)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
@since("2.2.0")
def setMaxIter(self, value):
    """
    Sets the value of :py:attr:`maxIter`.
    """
    return self._set(maxIter=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setRegParam(self, value):
    """
    Sets the value of :py:attr:`regParam`.
    """
    return self._set(regParam=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setTol(self, value):
    """
    Sets the value of :py:attr:`tol`.
    """
    return self._set(tol=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setFitIntercept(self, value):
    """
    Sets the value of :py:attr:`fitIntercept`.
    """
    return self._set(fitIntercept=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setStandardization(self, value):
    """
    Sets the value of :py:attr:`standardization`.
    """
    return self._set(standardization=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setThreshold(self, value):
    """
    Sets the value of :py:attr:`threshold`.
    """
    return self._set(threshold=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setWeightCol(self, value):
    """
    Sets the value of :py:attr:`weightCol`.
    """
    return self._set(weightCol=value)
|
|
|
|
|
|
|
|
@since("2.2.0")
def setAggregationDepth(self, value):
    """
    Sets the value of :py:attr:`aggregationDepth`.
    """
    return self._set(aggregationDepth=value)
|
|
|
|
|
[SPARK-30642][ML][PYSPARK] LinearSVC blockify input vectors
### What changes were proposed in this pull request?
1, add new param `blockSize`;
2, add a new class InstanceBlock;
3, **if `blockSize==1`, keep original behavior; if `blockSize>1`, stack input vectors to blocks (like ALS/MLP);**
4, if `blockSize>1`, standardize the input outside of optimization procedure;
### Why are the changes needed?
1, reduce RAM to persist training dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster on dataset `epsilon`)
### Does this PR introduce any user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28349 from zhengruifeng/blockify_svc_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-05 22:06:23 -04:00
|
|
|
@since("3.1.0")
def setMaxBlockSizeInMB(self, value):
    """
    Sets the value of :py:attr:`maxBlockSizeInMB`.
    """
    return self._set(maxBlockSizeInMB=value)
|
[SPARK-30642][ML][PYSPARK] LinearSVC blockify input vectors
### What changes were proposed in this pull request?
1, add new param `blockSize`;
2, add a new class InstanceBlock;
3, **if `blockSize==1`, keep original behavior; if `blockSize>1`, stack input vectors to blocks (like ALS/MLP);**
4, if `blockSize>1`, standardize the input outside of optimization procedure;
### Why are the changes needed?
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster on dataset `epsilon`)
### Does this PR introduce any user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28349 from zhengruifeng/blockify_svc_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-05 22:06:23 -04:00
|
|
|
|
2017-01-27 19:03:53 -05:00
|
|
|
|
2020-06-26 13:57:30 -04:00
|
|
|
class LinearSVCModel(_JavaClassificationModel, _LinearSVCParams, JavaMLWritable, JavaMLReadable,
                     HasTrainingSummary):
    """
    Model fitted by LinearSVC.

    .. versionadded:: 2.2.0
    """

    @since("3.0.0")
    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        """
        return self._set(threshold=value)

    @property
    @since("2.2.0")
    def coefficients(self):
        """
        Model coefficients of Linear SVM Classifier.
        """
        return self._call_java("coefficients")

    @property
    @since("2.2.0")
    def intercept(self):
        """
        Model intercept of Linear SVM Classifier.
        """
        return self._call_java("intercept")

    # Exposed as a property so that ``model.summary`` is attribute-style
    # access, consistent with ``coefficients``/``intercept`` above and with
    # how the inherited summary is itself read as an attribute
    # (``super(...).summary``) in the body below.
    @property
    @since("3.1.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set. An exception is thrown if `trainingSummary is None`.
        """
        if self.hasSummary:
            # Wrap the Java-side summary handle in the Python training-summary
            # abstraction.
            return LinearSVCTrainingSummary(super(LinearSVCModel, self).summary)
        else:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.

        Returns
        -------
        :py:class:`LinearSVCSummary`
            Evaluation metrics computed on ``dataset``.

        Raises
        ------
        TypeError
            If ``dataset`` is not a :py:class:`pyspark.sql.DataFrame`.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_lsvc_summary = self._call_java("evaluate", dataset)
        return LinearSVCSummary(java_lsvc_summary)
|
|
|
|
|
|
|
|
|
|
|
|
class LinearSVCSummary(_BinaryClassificationSummary):
    """
    Abstraction for LinearSVC Results for a given model.

    All metrics are provided by the parent binary-classification summary;
    this subclass only fixes the concrete public type.

    .. versionadded:: 3.1.0
    """
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class LinearSVCTrainingSummary(LinearSVCSummary, _TrainingSummary):
    """
    Abstraction for LinearSVC Training results.

    Combines the evaluation metrics of :py:class:`LinearSVCSummary` with the
    training-time information contributed by the training-summary mixin.

    .. versionadded:: 3.1.0
    """
|
|
|
|
|
2017-01-27 19:03:53 -05:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _LogisticRegressionParams(_ProbabilisticClassifierParams, HasRegParam,
                                HasElasticNetParam, HasMaxIter, HasFitIntercept, HasTol,
                                HasStandardization, HasWeightCol, HasAggregationDepth,
                                HasThreshold, HasMaxBlockSizeInMB):
    """
    Params for :py:class:`LogisticRegression` and :py:class:`LogisticRegressionModel`.

    .. versionadded:: 3.0.0
    """

    threshold = Param(Params._dummy(), "threshold",
                      "Threshold in binary classification prediction, in range [0, 1]." +
                      " If threshold and thresholds are both set, they must match." +
                      " e.g. if threshold is p, then thresholds must be equal to [1-p, p].",
                      typeConverter=TypeConverters.toFloat)

    family = Param(Params._dummy(), "family",
                   "The name of family which is a description of the label distribution to " +
                   "be used in the model. Supported options: auto, binomial, multinomial",
                   typeConverter=TypeConverters.toString)

    lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients",
                                      "The lower bounds on coefficients if fitting under bound "
                                      "constrained optimization. The bound matrix must be "
                                      "compatible with the shape "
                                      "(1, number of features) for binomial regression, or "
                                      "(number of classes, number of features) "
                                      "for multinomial regression.",
                                      typeConverter=TypeConverters.toMatrix)

    upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients",
                                      "The upper bounds on coefficients if fitting under bound "
                                      "constrained optimization. The bound matrix must be "
                                      "compatible with the shape "
                                      "(1, number of features) for binomial regression, or "
                                      "(number of classes, number of features) "
                                      "for multinomial regression.",
                                      typeConverter=TypeConverters.toMatrix)

    # NOTE: description fixed ("be equal", "number of classes") — the previous
    # text was missing the inter-fragment spaces and read "...must be" + "equal
    # with 1..." + "lasses for multinomial regression".
    lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts",
                                    "The lower bounds on intercepts if fitting under bound "
                                    "constrained optimization. The bounds vector size must be "
                                    "equal with 1 for binomial regression, or the number of "
                                    "classes for multinomial regression.",
                                    typeConverter=TypeConverters.toVector)

    upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts",
                                    "The upper bounds on intercepts if fitting under bound "
                                    "constrained optimization. The bound vector size must be "
                                    "equal with 1 for binomial regression, or the number of "
                                    "classes for multinomial regression.",
                                    typeConverter=TypeConverters.toVector)

    def __init__(self, *args):
        super(_LogisticRegressionParams, self).__init__(*args)
        self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto",
                         maxBlockSizeInMB=0.0)

    @since("1.4.0")
    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        Clears value of :py:attr:`thresholds` if it has been set.
        """
        self._set(threshold=value)
        self.clear(self.thresholds)
        return self

    @since("1.4.0")
    def getThreshold(self):
        """
        Get threshold for binary classification.

        If :py:attr:`thresholds` is set with length 2 (i.e., binary classification),
        this returns the equivalent threshold:
        :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`.
        Otherwise, returns :py:attr:`threshold` if set or its default value if unset.

        Raises
        ------
        ValueError
            If :py:attr:`thresholds` is set with a length other than 2.
        """
        self._checkThresholdConsistency()
        if self.isSet(self.thresholds):
            ts = self.getOrDefault(self.thresholds)
            if len(ts) != 2:
                # ``ts`` holds floats; they must be stringified before joining
                # (a bare ",".join(ts) would raise TypeError instead of the
                # intended ValueError).
                raise ValueError("Logistic Regression getThreshold only applies to" +
                                 " binary classification, but thresholds has length != 2." +
                                 " thresholds: " + ",".join(map(str, ts)))
            # thresholds = [1-p, p]  =>  p = 1 / (1 + t0/t1)
            return 1.0/(1.0 + ts[0]/ts[1])
        else:
            return self.getOrDefault(self.threshold)

    @since("1.5.0")
    def setThresholds(self, value):
        """
        Sets the value of :py:attr:`thresholds`.
        Clears value of :py:attr:`threshold` if it has been set.
        """
        self._set(thresholds=value)
        self.clear(self.threshold)
        return self

    @since("1.5.0")
    def getThresholds(self):
        """
        If :py:attr:`thresholds` is set, return its value.
        Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary
        classification: (1-threshold, threshold).
        If neither are set, throw an error.
        """
        self._checkThresholdConsistency()
        if not self.isSet(self.thresholds) and self.isSet(self.threshold):
            t = self.getOrDefault(self.threshold)
            return [1.0-t, t]
        else:
            return self.getOrDefault(self.thresholds)

    def _checkThresholdConsistency(self):
        # When both params are set they must describe the same decision
        # boundary; otherwise getThreshold/getThresholds would be ambiguous.
        if self.isSet(self.threshold) and self.isSet(self.thresholds):
            ts = self.getOrDefault(self.thresholds)
            if len(ts) != 2:
                raise ValueError("Logistic Regression getThreshold only applies to" +
                                 " binary classification, but thresholds has length != 2." +
                                 " thresholds: {0}".format(str(ts)))
            t = 1.0/(1.0 + ts[0]/ts[1])
            t2 = self.getOrDefault(self.threshold)
            # Allow for floating-point round-off between the two encodings.
            if abs(t2 - t) >= 1E-5:
                raise ValueError("Logistic Regression getThreshold found inconsistent values for" +
                                 " threshold (%g) and thresholds (equivalent to %g)" % (t2, t))

    @since("2.1.0")
    def getFamily(self):
        """
        Gets the value of :py:attr:`family` or its default value.
        """
        return self.getOrDefault(self.family)

    @since("2.3.0")
    def getLowerBoundsOnCoefficients(self):
        """
        Gets the value of :py:attr:`lowerBoundsOnCoefficients`
        """
        return self.getOrDefault(self.lowerBoundsOnCoefficients)

    @since("2.3.0")
    def getUpperBoundsOnCoefficients(self):
        """
        Gets the value of :py:attr:`upperBoundsOnCoefficients`
        """
        return self.getOrDefault(self.upperBoundsOnCoefficients)

    @since("2.3.0")
    def getLowerBoundsOnIntercepts(self):
        """
        Gets the value of :py:attr:`lowerBoundsOnIntercepts`
        """
        return self.getOrDefault(self.lowerBoundsOnIntercepts)

    @since("2.3.0")
    def getUpperBoundsOnIntercepts(self):
        """
        Gets the value of :py:attr:`upperBoundsOnIntercepts`
        """
        return self.getOrDefault(self.upperBoundsOnIntercepts)
|
|
|
|
|
|
|
|
|
2015-01-28 20:14:23 -05:00
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class LogisticRegression(_JavaProbabilisticClassifier, _LogisticRegressionParams, JavaMLWritable,
|
2019-10-18 05:26:54 -04:00
|
|
|
JavaMLReadable):
|
2015-01-28 20:14:23 -05:00
|
|
|
"""
|
|
|
|
Logistic regression.
|
2016-09-27 03:00:21 -04:00
|
|
|
This class supports multinomial logistic (softmax) and binomial logistic regression.
|
2015-01-28 20:14:23 -05:00
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 1.3.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2015-01-28 20:14:23 -05:00
|
|
|
>>> from pyspark.sql import Row
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> bdf = sc.parallelize([
|
2017-04-26 09:34:18 -04:00
|
|
|
... Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
|
|
|
|
... Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
|
|
|
|
... Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
|
|
|
|
... Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF()
|
2019-10-27 23:36:10 -04:00
|
|
|
>>> blor = LogisticRegression(weightCol="weight")
|
|
|
|
>>> blor.getRegParam()
|
|
|
|
0.0
|
|
|
|
>>> blor.setRegParam(0.01)
|
|
|
|
LogisticRegression...
|
|
|
|
>>> blor.getRegParam()
|
|
|
|
0.01
|
|
|
|
>>> blor.setMaxIter(10)
|
|
|
|
LogisticRegression...
|
|
|
|
>>> blor.getMaxIter()
|
|
|
|
10
|
|
|
|
>>> blor.clear(blor.maxIter)
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel = blor.fit(bdf)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> blorModel.setFeaturesCol("features")
|
|
|
|
LogisticRegressionModel...
|
|
|
|
>>> blorModel.setProbabilityCol("newProbability")
|
|
|
|
LogisticRegressionModel...
|
|
|
|
>>> blorModel.getProbabilityCol()
|
|
|
|
'newProbability'
|
2020-11-18 10:02:31 -05:00
|
|
|
>>> blorModel.getMaxBlockSizeInMB()
|
|
|
|
0.0
|
2019-10-18 05:26:54 -04:00
|
|
|
>>> blorModel.setThreshold(0.1)
|
|
|
|
LogisticRegressionModel...
|
|
|
|
>>> blorModel.getThreshold()
|
|
|
|
0.1
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel.coefficients
|
2017-04-26 09:34:18 -04:00
|
|
|
DenseVector([-1.080..., -0.646...])
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel.intercept
|
2017-04-26 09:34:18 -04:00
|
|
|
3.112...
|
2020-05-31 11:24:20 -04:00
|
|
|
>>> blorModel.evaluate(bdf).accuracy == blorModel.summary.accuracy
|
|
|
|
True
|
2017-04-26 09:34:18 -04:00
|
|
|
>>> data_path = "data/mllib/sample_multiclass_classification_data.txt"
|
|
|
|
>>> mdf = spark.read.format("libsvm").load(data_path)
|
|
|
|
>>> mlor = LogisticRegression(regParam=0.1, elasticNetParam=1.0, family="multinomial")
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> mlorModel = mlor.fit(mdf)
|
2017-04-25 13:10:41 -04:00
|
|
|
>>> mlorModel.coefficientMatrix
|
2017-04-26 09:34:18 -04:00
|
|
|
SparseMatrix(3, 4, [0, 1, 2, 3], [3, 2, 1], [1.87..., -2.75..., -0.50...], 1)
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> mlorModel.interceptVector
|
2017-04-26 09:34:18 -04:00
|
|
|
DenseVector([0.04..., -0.42..., 0.37...])
|
|
|
|
>>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 1.0))]).toDF()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> blorModel.predict(test0.head().features)
|
|
|
|
1.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> blorModel.predictRaw(test0.head().features)
|
|
|
|
DenseVector([-3.54..., 3.54...])
|
|
|
|
>>> blorModel.predictProbability(test0.head().features)
|
|
|
|
DenseVector([0.028, 0.972])
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> result = blorModel.transform(test0).head()
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result.prediction
|
2017-04-26 09:34:18 -04:00
|
|
|
1.0
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> result.newProbability
|
2017-04-26 09:34:18 -04:00
|
|
|
DenseVector([0.02..., 0.97...])
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result.rawPrediction
|
2017-04-26 09:34:18 -04:00
|
|
|
DenseVector([-3.54..., 3.54...])
|
|
|
|
>>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel.transform(test1).head().prediction
|
2015-01-28 20:14:23 -05:00
|
|
|
1.0
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blor.setParams("vector")
|
2015-02-15 23:29:26 -05:00
|
|
|
Traceback (most recent call last):
|
|
|
|
...
|
|
|
|
TypeError: Method setParams forces keyword arguments.
|
2016-03-16 17:21:42 -04:00
|
|
|
>>> lr_path = temp_path + "/lr"
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blor.save(lr_path)
|
2016-03-16 17:21:42 -04:00
|
|
|
>>> lr2 = LogisticRegression.load(lr_path)
|
2017-04-26 09:34:18 -04:00
|
|
|
>>> lr2.getRegParam()
|
|
|
|
0.01
|
2016-03-16 17:21:42 -04:00
|
|
|
>>> model_path = temp_path + "/lr_model"
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel.save(model_path)
|
2016-03-16 17:21:42 -04:00
|
|
|
>>> model2 = LogisticRegressionModel.load(model_path)
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel.coefficients[0] == model2.coefficients[0]
|
2016-03-16 17:21:42 -04:00
|
|
|
True
|
2016-09-27 03:00:21 -04:00
|
|
|
>>> blorModel.intercept == model2.intercept
|
2016-03-16 17:21:42 -04:00
|
|
|
True
|
2018-06-28 15:40:39 -04:00
|
|
|
>>> model2
|
2019-11-11 14:03:26 -05:00
|
|
|
LogisticRegressionModel: uid=..., numClasses=2, numFeatures=2
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> blorModel.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2015-01-28 20:14:23 -05:00
|
|
|
"""
|
2015-05-18 15:02:18 -04:00
|
|
|
|
2015-02-15 23:29:26 -05:00
|
|
|
@keyword_only
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completely, i.e.:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2016-03-04 11:25:41 -05:00
|
|
|
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
|
2015-09-11 11:50:35 -04:00
|
|
|
threshold=0.5, thresholds=None, probabilityCol="probability",
|
2016-08-25 05:26:33 -04:00
|
|
|
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
|
2017-08-02 06:10:26 -04:00
|
|
|
aggregationDepth=2, family="auto",
|
|
|
|
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None,
|
2020-11-18 10:02:31 -05:00
|
|
|
maxBlockSizeInMB=0.0):
|
2017-08-02 06:10:26 -04:00
|
|
|
|
2015-02-15 23:29:26 -05:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
__init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2016-03-04 11:25:41 -05:00
|
|
|
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
|
2015-09-11 11:50:35 -04:00
|
|
|
threshold=0.5, thresholds=None, probabilityCol="probability", \
|
2016-08-25 05:26:33 -04:00
|
|
|
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
|
2017-08-02 06:10:26 -04:00
|
|
|
aggregationDepth=2, family="auto", \
|
|
|
|
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, \
|
2020-11-18 10:02:31 -05:00
|
|
|
maxBlockSizeInMB=0.0):
|
2015-08-12 17:27:13 -04:00
|
|
|
If the threshold and thresholds Params are both set, they must be equivalent.
|
2015-02-15 23:29:26 -05:00
|
|
|
"""
|
|
|
|
super(LogisticRegression, self).__init__()
|
2015-05-18 15:02:18 -04:00
|
|
|
self._java_obj = self._new_java_obj(
|
|
|
|
"org.apache.spark.ml.classification.LogisticRegression", self.uid)
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-02-15 23:29:26 -05:00
|
|
|
self.setParams(**kwargs)
|
2015-08-12 17:27:13 -04:00
|
|
|
self._checkThresholdConsistency()
|
2015-02-15 23:29:26 -05:00
|
|
|
|
|
|
|
@keyword_only
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.3.0")
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2016-03-04 11:25:41 -05:00
|
|
|
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
|
2015-09-11 11:50:35 -04:00
|
|
|
threshold=0.5, thresholds=None, probabilityCol="probability",
|
2016-08-25 05:26:33 -04:00
|
|
|
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
|
2017-08-02 06:10:26 -04:00
|
|
|
aggregationDepth=2, family="auto",
|
|
|
|
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None,
|
2020-11-18 10:02:31 -05:00
|
|
|
maxBlockSizeInMB=0.0):
|
2015-02-15 23:29:26 -05:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2016-03-04 11:25:41 -05:00
|
|
|
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
|
2015-09-11 11:50:35 -04:00
|
|
|
threshold=0.5, thresholds=None, probabilityCol="probability", \
|
2016-08-25 05:26:33 -04:00
|
|
|
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
|
2017-08-02 06:10:26 -04:00
|
|
|
aggregationDepth=2, family="auto", \
|
|
|
|
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, \
|
2020-11-18 10:02:31 -05:00
|
|
|
maxBlockSizeInMB=0.0):
|
2015-02-15 23:29:26 -05:00
|
|
|
Sets params for logistic regression.
|
2015-08-12 17:27:13 -04:00
|
|
|
If the threshold and thresholds Params are both set, they must be equivalent.
|
2015-02-15 23:29:26 -05:00
|
|
|
"""
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-08-12 17:27:13 -04:00
|
|
|
self._set(**kwargs)
|
|
|
|
self._checkThresholdConsistency()
|
|
|
|
return self
|
2015-02-15 23:29:26 -05:00
|
|
|
|
2015-01-28 20:14:23 -05:00
|
|
|
def _create_model(self, java_model):
|
|
|
|
return LogisticRegressionModel(java_model)
|
|
|
|
|
2016-09-27 03:00:21 -04:00
|
|
|
@since("2.1.0")
|
|
|
|
def setFamily(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`family`.
|
|
|
|
"""
|
|
|
|
return self._set(family=value)
|
|
|
|
|
2017-08-02 06:10:26 -04:00
|
|
|
@since("2.3.0")
|
|
|
|
def setLowerBoundsOnCoefficients(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`lowerBoundsOnCoefficients`
|
|
|
|
"""
|
|
|
|
return self._set(lowerBoundsOnCoefficients=value)
|
|
|
|
|
|
|
|
@since("2.3.0")
|
|
|
|
def setUpperBoundsOnCoefficients(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`upperBoundsOnCoefficients`
|
|
|
|
"""
|
|
|
|
return self._set(upperBoundsOnCoefficients=value)
|
|
|
|
|
|
|
|
@since("2.3.0")
|
|
|
|
def setLowerBoundsOnIntercepts(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`lowerBoundsOnIntercepts`
|
|
|
|
"""
|
|
|
|
return self._set(lowerBoundsOnIntercepts=value)
|
|
|
|
|
|
|
|
@since("2.3.0")
|
|
|
|
def setUpperBoundsOnIntercepts(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`upperBoundsOnIntercepts`
|
|
|
|
"""
|
|
|
|
return self._set(upperBoundsOnIntercepts=value)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
def setMaxIter(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxIter`.
|
|
|
|
"""
|
|
|
|
return self._set(maxIter=value)
|
|
|
|
|
|
|
|
def setRegParam(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`regParam`.
|
|
|
|
"""
|
|
|
|
return self._set(regParam=value)
|
|
|
|
|
|
|
|
def setTol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`tol`.
|
|
|
|
"""
|
|
|
|
return self._set(tol=value)
|
|
|
|
|
|
|
|
def setElasticNetParam(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`elasticNetParam`.
|
|
|
|
"""
|
|
|
|
return self._set(elasticNetParam=value)
|
|
|
|
|
|
|
|
def setFitIntercept(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`fitIntercept`.
|
|
|
|
"""
|
|
|
|
return self._set(fitIntercept=value)
|
|
|
|
|
|
|
|
def setStandardization(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`standardization`.
|
|
|
|
"""
|
|
|
|
return self._set(standardization=value)
|
|
|
|
|
|
|
|
def setWeightCol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`weightCol`.
|
|
|
|
"""
|
|
|
|
return self._set(weightCol=value)
|
|
|
|
|
|
|
|
def setAggregationDepth(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`aggregationDepth`.
|
|
|
|
"""
|
|
|
|
return self._set(aggregationDepth=value)
|
|
|
|
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist training dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
@since("3.1.0")
|
2020-11-18 10:02:31 -05:00
|
|
|
def setMaxBlockSizeInMB(self, value):
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
"""
|
2020-11-18 10:02:31 -05:00
|
|
|
Sets the value of :py:attr:`maxBlockSizeInMB`.
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
"""
|
2020-11-18 10:02:31 -05:00
|
|
|
return self._set(maxBlockSizeInMB=value)
|
[SPARK-30659][ML][PYSPARK] LogisticRegression blockify input vectors
### What changes were proposed in this pull request?
1, reorg the `fit` method in LR to several blocks (`createModel`, `createBounds`, `createOptimizer`, `createInitCoefWithInterceptMatrix`);
2, add new param blockSize;
3, if blockSize==1, keep original behavior, code path `trainOnRows`;
4, if blockSize>1, standardize and stack input vectors to blocks (like ALS/MLP), code path `trainOnBlocks`
### Why are the changes needed?
On dense dataset `epsilon_normalized.t`:
1, reduce RAM to persist traing dataset; (save about 40% RAM)
2, use Level-2 BLAS routines; (4x ~ 5x faster)
### Does this PR introduce _any_ user-facing change?
Yes, a new param is added
### How was this patch tested?
existing and added testsuites
Closes #28458 from zhengruifeng/blockify_lor_II.
Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-05-06 22:07:24 -04:00
|
|
|
|
2017-08-02 06:10:26 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class LogisticRegressionModel(_JavaProbabilisticClassificationModel, _LogisticRegressionParams,
                              JavaMLWritable, JavaMLReadable, HasTrainingSummary):
    """
    Model fitted by LogisticRegression.

    .. versionadded:: 1.3.0
    """

    @property
    @since("2.0.0")
    def coefficients(self):
        """
        Model coefficients of binomial logistic regression.
        An exception is thrown in the case of multinomial logistic regression.
        """
        # Delegate to the JVM-side model.
        return self._call_java("coefficients")

    @property
    @since("1.4.0")
    def intercept(self):
        """
        Model intercept of binomial logistic regression.
        An exception is thrown in the case of multinomial logistic regression.
        """
        return self._call_java("intercept")

    @property
    @since("2.1.0")
    def coefficientMatrix(self):
        """
        Model coefficients.
        """
        return self._call_java("coefficientMatrix")

    @property
    @since("2.1.0")
    def interceptVector(self):
        """
        Model intercept.
        """
        return self._call_java("interceptVector")

    @property
    @since("2.0.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set. An exception is thrown if `trainingSummary is None`.
        """
        # Guard clause: fail fast when no summary was recorded during training.
        if not self.hasSummary:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)
        # Binomial models get the richer binary summary wrapper.
        if self.numClasses <= 2:
            return BinaryLogisticRegressionTrainingSummary(super(LogisticRegressionModel,
                                                                 self).summary)
        return LogisticRegressionTrainingSummary(super(LogisticRegressionModel, self).summary)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_blr_summary = self._call_java("evaluate", dataset)
        # Pick the summary wrapper matching the model's class count.
        summary_cls = (BinaryLogisticRegressionSummary if self.numClasses <= 2
                       else LogisticRegressionSummary)
        return summary_cls(java_blr_summary)
class LogisticRegressionSummary(_ClassificationSummary):
    """
    Abstraction for Logistic Regression Results for a given model.

    .. versionadded:: 2.0.0
    """

    @property
    @since("2.0.0")
    def probabilityCol(self):
        """
        Field in "predictions" which gives the probability
        of each class as a vector.
        """
        # Delegates to the underlying Java summary object.
        return self._call_java("probabilityCol")

    @property
    @since("2.0.0")
    def featuresCol(self):
        """
        Field in "predictions" which gives the features of each instance
        as a vector.
        """
        # Delegates to the underlying Java summary object.
        return self._call_java("featuresCol")
@inherit_doc
class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary):
    """
    Abstraction for multinomial Logistic Regression Training results.

    .. versionadded:: 2.0.0
    """
    # All functionality is inherited from LogisticRegressionSummary and
    # _TrainingSummary; this class exists only to combine the two mixins.
    pass
@inherit_doc
class BinaryLogisticRegressionSummary(_BinaryClassificationSummary,
                                      LogisticRegressionSummary):
    """
    Binary Logistic regression results for a given model.

    .. versionadded:: 2.0.0
    """
    # All functionality is inherited from _BinaryClassificationSummary and
    # LogisticRegressionSummary; this class exists only to combine the two.
    pass
@inherit_doc
class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary,
                                              LogisticRegressionTrainingSummary):
    """
    Binary Logistic regression training results for a given model.

    .. versionadded:: 2.0.0
    """
    # All functionality is inherited from BinaryLogisticRegressionSummary and
    # LogisticRegressionTrainingSummary; this class exists only to combine them.
    pass
2015-01-28 20:14:23 -05:00
|
|
|
|
2019-10-12 10:13:50 -04:00
|
|
|
@inherit_doc
class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams):
    """
    Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`.
    """

    def __init__(self, *args):
        super(_DecisionTreeClassifierParams, self).__init__(*args)
        # Default param values; these mirror the Scala-side defaults of
        # org.apache.spark.ml.classification.DecisionTreeClassifier.
        defaults = dict(
            maxDepth=5,
            maxBins=32,
            minInstancesPerNode=1,
            minInfoGain=0.0,
            maxMemoryInMB=256,
            cacheNodeIds=False,
            checkpointInterval=10,
            impurity="gini",
            leafCol="",
            minWeightFractionPerNode=0.0,
        )
        self._setDefault(**defaults)
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class DecisionTreeClassifier(_JavaProbabilisticClassifier, _DecisionTreeClassifierParams,
|
2019-10-12 10:13:50 -04:00
|
|
|
JavaMLWritable, JavaMLReadable):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
2016-05-09 04:11:17 -04:00
|
|
|
`Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
|
2015-05-13 18:13:09 -04:00
|
|
|
learning algorithm for classification.
|
|
|
|
It supports both binary and multiclass labels, as well as both continuous and categorical
|
|
|
|
features.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 1.4.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> from pyspark.ml.feature import StringIndexer
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([
|
2015-05-13 18:13:09 -04:00
|
|
|
... (1.0, Vectors.dense(1.0)),
|
|
|
|
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
|
|
|
|
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
|
|
|
|
>>> si_model = stringIndexer.fit(df)
|
|
|
|
>>> td = si_model.transform(df)
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed", leafCol="leafId")
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> model = dt.fit(td)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.getLabelCol()
|
|
|
|
'indexed'
|
|
|
|
>>> model.setFeaturesCol("features")
|
|
|
|
DecisionTreeClassificationModel...
|
2015-07-07 11:58:08 -04:00
|
|
|
>>> model.numNodes
|
|
|
|
3
|
|
|
|
>>> model.depth
|
|
|
|
1
|
2016-03-11 02:54:23 -05:00
|
|
|
>>> model.featureImportances
|
|
|
|
SparseVector(1, {0: 1.0})
|
2016-08-22 06:21:22 -04:00
|
|
|
>>> model.numFeatures
|
|
|
|
1
|
|
|
|
>>> model.numClasses
|
|
|
|
2
|
2016-06-02 18:55:14 -04:00
|
|
|
>>> print(model.toDebugString)
|
2019-11-11 14:03:26 -05:00
|
|
|
DecisionTreeClassificationModel...depth=1, numNodes=3...
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.predict(test0.head().features)
|
|
|
|
0.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(test0.head().features)
|
|
|
|
DenseVector([1.0, 0.0])
|
|
|
|
>>> model.predictProbability(test0.head().features)
|
|
|
|
DenseVector([1.0, 0.0])
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result = model.transform(test0).head()
|
|
|
|
>>> result.prediction
|
2015-05-13 18:13:09 -04:00
|
|
|
0.0
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result.probability
|
|
|
|
DenseVector([1.0, 0.0])
|
|
|
|
>>> result.rawPrediction
|
|
|
|
DenseVector([1.0, 0.0])
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> result.leafId
|
|
|
|
0.0
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> model.transform(test1).head().prediction
|
|
|
|
1.0
|
2016-03-24 22:20:49 -04:00
|
|
|
>>> dtc_path = temp_path + "/dtc"
|
|
|
|
>>> dt.save(dtc_path)
|
|
|
|
>>> dt2 = DecisionTreeClassifier.load(dtc_path)
|
|
|
|
>>> dt2.getMaxDepth()
|
|
|
|
2
|
|
|
|
>>> model_path = temp_path + "/dtc_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = DecisionTreeClassificationModel.load(model_path)
|
|
|
|
>>> model.featureImportances == model2.featureImportances
|
|
|
|
True
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2019-02-27 22:11:30 -05:00
|
|
|
>>> df3 = spark.createDataFrame([
|
|
|
|
... (1.0, 0.2, Vectors.dense(1.0)),
|
|
|
|
... (1.0, 0.8, Vectors.dense(1.0)),
|
|
|
|
... (0.0, 1.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])
|
|
|
|
>>> si3 = StringIndexer(inputCol="label", outputCol="indexed")
|
|
|
|
>>> si_model3 = si3.fit(df3)
|
|
|
|
>>> td3 = si_model3.transform(df3)
|
|
|
|
>>> dt3 = DecisionTreeClassifier(maxDepth=2, weightCol="weight", labelCol="indexed")
|
|
|
|
>>> model3 = dt3.fit(td3)
|
|
|
|
>>> print(model3.toDebugString)
|
2019-11-11 14:03:26 -05:00
|
|
|
DecisionTreeClassificationModel...depth=1, numNodes=3...
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
    @keyword_only
    def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction",
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
                 seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 probabilityCol="probability", rawPredictionCol="rawPrediction", \
                 maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                 maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
                 seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0)
        """
        super(DecisionTreeClassifier, self).__init__()
        # Create the JVM-side estimator this Python wrapper delegates to.
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid)
        # @keyword_only captured the caller's keyword arguments on the instance;
        # forward them through setParams so Param values are set uniformly.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
|
|
|
|
|
|
|
|
@keyword_only
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.4.0")
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2015-08-03 01:19:27 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction",
|
2015-05-13 18:13:09 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
|
2019-10-12 10:13:50 -04:00
|
|
|
impurity="gini", seed=None, weightCol=None, leafCol="",
|
|
|
|
minWeightFractionPerNode=0.0):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2015-08-03 01:19:27 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", \
|
2015-05-14 21:16:22 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
|
2016-01-06 13:52:25 -05:00
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
|
2019-10-12 10:13:50 -04:00
|
|
|
seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0)
|
2015-05-13 18:13:09 -04:00
|
|
|
Sets params for the DecisionTreeClassifier.
|
|
|
|
"""
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-05-13 18:13:09 -04:00
|
|
|
return self._set(**kwargs)
|
|
|
|
|
|
|
|
def _create_model(self, java_model):
|
|
|
|
return DecisionTreeClassificationModel(java_model)
|
|
|
|
|
2019-07-20 11:44:33 -04:00
|
|
|
def setMaxDepth(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxDepth`.
|
|
|
|
"""
|
|
|
|
return self._set(maxDepth=value)
|
|
|
|
|
|
|
|
def setMaxBins(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxBins`.
|
|
|
|
"""
|
|
|
|
return self._set(maxBins=value)
|
|
|
|
|
|
|
|
def setMinInstancesPerNode(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minInstancesPerNode`.
|
|
|
|
"""
|
|
|
|
return self._set(minInstancesPerNode=value)
|
|
|
|
|
2019-10-12 10:13:50 -04:00
|
|
|
@since("3.0.0")
|
|
|
|
def setMinWeightFractionPerNode(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minWeightFractionPerNode`.
|
|
|
|
"""
|
|
|
|
return self._set(minWeightFractionPerNode=value)
|
|
|
|
|
2019-07-20 11:44:33 -04:00
|
|
|
def setMinInfoGain(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minInfoGain`.
|
|
|
|
"""
|
|
|
|
return self._set(minInfoGain=value)
|
|
|
|
|
|
|
|
def setMaxMemoryInMB(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxMemoryInMB`.
|
|
|
|
"""
|
|
|
|
return self._set(maxMemoryInMB=value)
|
|
|
|
|
|
|
|
def setCacheNodeIds(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`cacheNodeIds`.
|
|
|
|
"""
|
|
|
|
return self._set(cacheNodeIds=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setImpurity(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`impurity`.
|
|
|
|
"""
|
|
|
|
return self._set(impurity=value)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
@since("1.4.0")
|
|
|
|
def setCheckpointInterval(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`checkpointInterval`.
|
|
|
|
"""
|
|
|
|
return self._set(checkpointInterval=value)
|
|
|
|
|
|
|
|
def setSeed(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`seed`.
|
|
|
|
"""
|
|
|
|
return self._set(seed=value)
|
|
|
|
|
|
|
|
@since("3.0.0")
|
|
|
|
def setWeightCol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`weightCol`.
|
|
|
|
"""
|
|
|
|
return self._set(weightCol=value)
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
|
2015-07-07 11:58:08 -04:00
|
|
|
@inherit_doc
class DecisionTreeClassificationModel(_DecisionTreeModel, _JavaProbabilisticClassificationModel,
                                      _DecisionTreeClassifierParams, JavaMLWritable,
                                      JavaMLReadable):
    """
    Model fitted by DecisionTreeClassifier.

    .. versionadded:: 1.4.0
    """

    @property
    def featureImportances(self):
        """
        Estimate of the importance of each feature.

        This generalizes the idea of "Gini" importance to other losses,
        following the explanation of Gini importance from "Random Forests" documentation
        by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.

        This feature importance is calculated as follows:
          - importance(feature j) = sum (over nodes which split on feature j) of the gain,
            where gain is scaled by the number of instances passing through node
          - Normalize importances for tree to sum to 1.

        .. versionadded:: 2.0.0

        Notes
        -----
        Feature importance for single decision trees can have high variance due to
        correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
        to determine feature importance instead.
        """
        # The computation is delegated to the underlying JVM model object.
        return self._call_java("featureImportances")
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
|
|
|
|
@inherit_doc
class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams):
    """
    Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`.
    """

    def __init__(self, *args):
        super(_RandomForestClassifierParams, self).__init__(*args)
        # Default values mirror the Scala RandomForestClassifier defaults.
        defaults = {
            "maxDepth": 5,
            "maxBins": 32,
            "minInstancesPerNode": 1,
            "minInfoGain": 0.0,
            "maxMemoryInMB": 256,
            "cacheNodeIds": False,
            "checkpointInterval": 10,
            "impurity": "gini",
            "numTrees": 20,
            "featureSubsetStrategy": "auto",
            "subsamplingRate": 1.0,
            "leafCol": "",
            "minWeightFractionPerNode": 0.0,
            "bootstrap": True,
        }
        self._setDefault(**defaults)
|
2019-10-12 10:13:50 -04:00
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class RandomForestClassifier(_JavaProbabilisticClassifier, _RandomForestClassifierParams,
|
2016-04-08 13:39:12 -04:00
|
|
|
JavaMLWritable, JavaMLReadable):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
2016-05-09 04:11:17 -04:00
|
|
|
`Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
|
2015-05-13 18:13:09 -04:00
|
|
|
learning algorithm for classification.
|
|
|
|
It supports both binary and multiclass labels, as well as both continuous and categorical
|
|
|
|
features.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 1.4.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2015-08-04 17:54:26 -04:00
|
|
|
>>> import numpy
|
2015-07-07 11:58:08 -04:00
|
|
|
>>> from numpy import allclose
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> from pyspark.ml.feature import StringIndexer
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([
|
2015-05-13 18:13:09 -04:00
|
|
|
... (1.0, Vectors.dense(1.0)),
|
|
|
|
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
|
|
|
|
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
|
|
|
|
>>> si_model = stringIndexer.fit(df)
|
|
|
|
>>> td = si_model.transform(df)
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42,
|
|
|
|
... leafCol="leafId")
|
2020-01-14 09:25:51 -05:00
|
|
|
>>> rf.getMinWeightFractionPerNode()
|
|
|
|
0.0
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> model = rf.fit(td)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.getLabelCol()
|
|
|
|
'indexed'
|
|
|
|
>>> model.setFeaturesCol("features")
|
|
|
|
RandomForestClassificationModel...
|
|
|
|
>>> model.setRawPredictionCol("newRawPrediction")
|
|
|
|
RandomForestClassificationModel...
|
2020-01-23 03:44:13 -05:00
|
|
|
>>> model.getBootstrap()
|
|
|
|
True
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.getRawPredictionCol()
|
|
|
|
'newRawPrediction'
|
2016-03-11 02:54:23 -05:00
|
|
|
>>> model.featureImportances
|
|
|
|
SparseVector(1, {0: 1.0})
|
2015-07-29 21:18:29 -04:00
|
|
|
>>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
|
2015-07-07 11:58:08 -04:00
|
|
|
True
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.predict(test0.head().features)
|
|
|
|
0.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(test0.head().features)
|
|
|
|
DenseVector([2.0, 0.0])
|
|
|
|
>>> model.predictProbability(test0.head().features)
|
|
|
|
DenseVector([1.0, 0.0])
|
2015-08-04 17:54:26 -04:00
|
|
|
>>> result = model.transform(test0).head()
|
|
|
|
>>> result.prediction
|
2015-05-13 18:13:09 -04:00
|
|
|
0.0
|
2015-08-04 17:54:26 -04:00
|
|
|
>>> numpy.argmax(result.probability)
|
|
|
|
0
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> numpy.argmax(result.newRawPrediction)
|
2015-08-04 17:54:26 -04:00
|
|
|
0
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> result.leafId
|
|
|
|
DenseVector([0.0, 0.0, 0.0])
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> model.transform(test1).head().prediction
|
|
|
|
1.0
|
2016-06-02 18:55:14 -04:00
|
|
|
>>> model.trees
|
2019-11-11 14:03:26 -05:00
|
|
|
[DecisionTreeClassificationModel...depth=..., DecisionTreeClassificationModel...]
|
2016-04-08 13:39:12 -04:00
|
|
|
>>> rfc_path = temp_path + "/rfc"
|
|
|
|
>>> rf.save(rfc_path)
|
|
|
|
>>> rf2 = RandomForestClassifier.load(rfc_path)
|
|
|
|
>>> rf2.getNumTrees()
|
|
|
|
3
|
|
|
|
>>> model_path = temp_path + "/rfc_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = RandomForestClassificationModel.load(model_path)
|
|
|
|
>>> model.featureImportances == model2.featureImportances
|
|
|
|
True
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
@keyword_only
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completely, i.e.:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2015-08-04 17:54:26 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction",
|
2015-05-13 18:13:09 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
|
2019-08-23 18:18:35 -04:00
|
|
|
numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0,
|
2020-01-23 03:44:13 -05:00
|
|
|
leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
__init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2015-08-04 17:54:26 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", \
|
2015-05-14 21:16:22 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
|
2019-08-23 18:18:35 -04:00
|
|
|
numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, \
|
2020-01-23 03:44:13 -05:00
|
|
|
leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True)
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
|
|
|
super(RandomForestClassifier, self).__init__()
|
2015-05-18 15:02:18 -04:00
|
|
|
self._java_obj = self._new_java_obj(
|
|
|
|
"org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-05-13 18:13:09 -04:00
|
|
|
self.setParams(**kwargs)
|
|
|
|
|
|
|
|
    @keyword_only
    @since("1.4.0")
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  probabilityCol="probability", rawPredictionCol="rawPrediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
                  impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0,
                  leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  probabilityCol="probability", rawPredictionCol="rawPrediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
                  impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0, \
                  leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True)
        Sets params for random forest classification.
        """
        # @keyword_only captured the caller's keyword arguments in _input_kwargs;
        # only explicitly-passed params are set, so defaults are not overwritten.
        kwargs = self._input_kwargs
        return self._set(**kwargs)
|
|
|
|
|
|
|
|
    def _create_model(self, java_model):
        """
        Wrap the fitted JVM model returned by fit() in the Python-side
        :py:class:`RandomForestClassificationModel`.
        """
        return RandomForestClassificationModel(java_model)
|
|
|
|
|
2019-07-20 11:44:33 -04:00
|
|
|
def setMaxDepth(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxDepth`.
|
|
|
|
"""
|
|
|
|
return self._set(maxDepth=value)
|
|
|
|
|
|
|
|
def setMaxBins(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxBins`.
|
|
|
|
"""
|
|
|
|
return self._set(maxBins=value)
|
|
|
|
|
|
|
|
def setMinInstancesPerNode(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minInstancesPerNode`.
|
|
|
|
"""
|
|
|
|
return self._set(minInstancesPerNode=value)
|
|
|
|
|
|
|
|
def setMinInfoGain(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minInfoGain`.
|
|
|
|
"""
|
|
|
|
return self._set(minInfoGain=value)
|
|
|
|
|
|
|
|
def setMaxMemoryInMB(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxMemoryInMB`.
|
|
|
|
"""
|
|
|
|
return self._set(maxMemoryInMB=value)
|
|
|
|
|
|
|
|
def setCacheNodeIds(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`cacheNodeIds`.
|
|
|
|
"""
|
|
|
|
return self._set(cacheNodeIds=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setImpurity(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`impurity`.
|
|
|
|
"""
|
|
|
|
return self._set(impurity=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setNumTrees(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`numTrees`.
|
|
|
|
"""
|
|
|
|
return self._set(numTrees=value)
|
|
|
|
|
2020-01-23 03:44:13 -05:00
|
|
|
@since("3.0.0")
|
|
|
|
def setBootstrap(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`bootstrap`.
|
|
|
|
"""
|
|
|
|
return self._set(bootstrap=value)
|
|
|
|
|
2019-07-20 11:44:33 -04:00
|
|
|
@since("1.4.0")
|
|
|
|
def setSubsamplingRate(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`subsamplingRate`.
|
|
|
|
"""
|
|
|
|
return self._set(subsamplingRate=value)
|
|
|
|
|
2018-05-30 14:04:09 -04:00
|
|
|
@since("2.4.0")
|
|
|
|
def setFeatureSubsetStrategy(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`featureSubsetStrategy`.
|
|
|
|
"""
|
|
|
|
return self._set(featureSubsetStrategy=value)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
def setSeed(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`seed`.
|
|
|
|
"""
|
|
|
|
return self._set(seed=value)
|
|
|
|
|
|
|
|
def setCheckpointInterval(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`checkpointInterval`.
|
|
|
|
"""
|
|
|
|
return self._set(checkpointInterval=value)
|
|
|
|
|
2020-01-14 09:25:51 -05:00
|
|
|
@since("3.0.0")
|
|
|
|
def setWeightCol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`weightCol`.
|
|
|
|
"""
|
|
|
|
return self._set(weightCol=value)
|
|
|
|
|
|
|
|
@since("3.0.0")
|
|
|
|
def setMinWeightFractionPerNode(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minWeightFractionPerNode`.
|
|
|
|
"""
|
|
|
|
return self._set(minWeightFractionPerNode=value)
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class RandomForestClassificationModel(_TreeEnsembleModel, _JavaProbabilisticClassificationModel,
                                      _RandomForestClassifierParams, JavaMLWritable,
                                      JavaMLReadable, HasTrainingSummary):
    """
    Model fitted by RandomForestClassifier.

    .. versionadded:: 1.4.0
    """

    @property
    def featureImportances(self):
        """
        Estimate of the importance of each feature.

        Each feature's importance is the average of its importance across all trees in the ensemble
        The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
        (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
        and follows the implementation from scikit-learn.

        .. versionadded:: 2.0.0

        See Also
        --------
        DecisionTreeClassificationModel.featureImportances
        """
        # Delegates to the JVM model, which computes the normalized importances.
        return self._call_java("featureImportances")

    @property
    @since("2.0.0")
    def trees(self):
        """Trees in this ensemble. Warning: These have null parent Estimators."""
        # Wrap each JVM tree model in its Python counterpart.
        return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))]

    @property
    @since("3.1.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set. An exception is thrown if `trainingSummary is None`.
        """
        if self.hasSummary:
            # Binary problems get the richer binary summary (ROC/AUC etc.).
            if self.numClasses <= 2:
                return BinaryRandomForestClassificationTrainingSummary(
                    super(RandomForestClassificationModel, self).summary)
            else:
                return RandomForestClassificationTrainingSummary(
                    super(RandomForestClassificationModel, self).summary)
        else:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        # The JVM model computes the metrics; pick the summary wrapper by class count.
        java_rf_summary = self._call_java("evaluate", dataset)
        if self.numClasses <= 2:
            return BinaryRandomForestClassificationSummary(java_rf_summary)
        else:
            return RandomForestClassificationSummary(java_rf_summary)
|
|
|
|
|
|
|
|
|
|
|
|
class RandomForestClassificationSummary(_ClassificationSummary):
    """
    Abstraction for RandomForestClassification Results for a given model.

    .. versionadded:: 3.1.0
    """
    # All metrics come from _ClassificationSummary; this subclass only fixes the type.
    pass
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class RandomForestClassificationTrainingSummary(RandomForestClassificationSummary,
                                                _TrainingSummary):
    """
    Abstraction for RandomForestClassification training results.

    .. versionadded:: 3.1.0
    """
    # Adds training-time information (from _TrainingSummary) on top of the metrics.
    pass
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class BinaryRandomForestClassificationSummary(_BinaryClassificationSummary):
    """
    BinaryRandomForestClassification results for a given model.

    .. versionadded:: 3.1.0
    """
    # Binary-only metrics (e.g. ROC) come from _BinaryClassificationSummary.
    pass
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class BinaryRandomForestClassificationTrainingSummary(BinaryRandomForestClassificationSummary,
                                                      RandomForestClassificationTrainingSummary):
    """
    BinaryRandomForestClassification training results for a given model.

    .. versionadded:: 3.1.0
    """
    # Combines binary metrics with training-time information; no extra state here.
    pass
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
|
2019-10-14 11:52:23 -04:00
|
|
|
class _GBTClassifierParams(_GBTParams, _HasVarianceImpurity):
    """
    Params for :py:class:`GBTClassifier` and :py:class:`GBTClassifierModel`.

    .. versionadded:: 3.0.0
    """

    # Loss functions currently supported by GBT classification.
    supportedLossTypes = ["logistic"]

    lossType = Param(Params._dummy(), "lossType",
                     "Loss function which GBT tries to minimize (case-insensitive). " +
                     "Supported options: " + ", ".join(supportedLossTypes),
                     typeConverter=TypeConverters.toString)

    def __init__(self, *args):
        super(_GBTClassifierParams, self).__init__(*args)
        # Defaults mirror the Scala GBTClassifier defaults.
        self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                         maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                         lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0,
                         impurity="variance", featureSubsetStrategy="all", validationTol=0.01,
                         leafCol="", minWeightFractionPerNode=0.0)

    @since("1.4.0")
    def getLossType(self):
        """
        Gets the value of lossType or its default value.
        """
        return self.getOrDefault(self.lossType)
|
|
|
|
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class GBTClassifier(_JavaProbabilisticClassifier, _GBTClassifierParams,
|
2019-10-12 10:13:50 -04:00
|
|
|
JavaMLWritable, JavaMLReadable):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
2016-05-09 04:11:17 -04:00
|
|
|
`Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
|
2015-05-13 18:13:09 -04:00
|
|
|
learning algorithm for classification.
|
|
|
|
It supports binary labels, as well as both continuous and categorical features.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 1.4.0
|
|
|
|
|
|
|
|
Notes
|
|
|
|
-----
|
|
|
|
Multiclass labels are not currently supported.
|
|
|
|
|
2016-05-09 04:11:17 -04:00
|
|
|
The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
Gradient Boosting vs. TreeBoost:
|
|
|
|
|
2016-05-09 04:11:17 -04:00
|
|
|
- This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
|
|
|
|
- Both algorithms learn tree ensembles by minimizing loss functions.
|
|
|
|
- TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
|
2020-11-09 19:33:48 -05:00
|
|
|
based on the loss function, whereas the original gradient boosting method does not.
|
2016-05-09 04:11:17 -04:00
|
|
|
- We expect to implement TreeBoost in the future:
|
2020-11-09 19:33:48 -05:00
|
|
|
`SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_
|
2016-11-22 06:40:18 -05:00
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
Examples
|
|
|
|
--------
|
2015-07-07 11:58:08 -04:00
|
|
|
>>> from numpy import allclose
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> from pyspark.ml.feature import StringIndexer
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([
|
2015-05-13 18:13:09 -04:00
|
|
|
... (1.0, Vectors.dense(1.0)),
|
|
|
|
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
|
|
|
|
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
|
|
|
|
>>> si_model = stringIndexer.fit(df)
|
|
|
|
>>> td = si_model.transform(df)
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42,
|
|
|
|
... leafCol="leafId")
|
2019-10-27 23:36:10 -04:00
|
|
|
>>> gbt.setMaxIter(5)
|
|
|
|
GBTClassifier...
|
2019-12-09 14:39:33 -05:00
|
|
|
>>> gbt.setMinWeightFractionPerNode(0.049)
|
|
|
|
GBTClassifier...
|
2019-10-27 23:36:10 -04:00
|
|
|
>>> gbt.getMaxIter()
|
|
|
|
5
|
2018-05-30 14:04:09 -04:00
|
|
|
>>> gbt.getFeatureSubsetStrategy()
|
|
|
|
'all'
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> model = gbt.fit(td)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.getLabelCol()
|
|
|
|
'indexed'
|
|
|
|
>>> model.setFeaturesCol("features")
|
|
|
|
GBTClassificationModel...
|
|
|
|
>>> model.setThresholds([0.3, 0.7])
|
|
|
|
GBTClassificationModel...
|
|
|
|
>>> model.getThresholds()
|
|
|
|
[0.3, 0.7]
|
2016-03-31 16:00:10 -04:00
|
|
|
>>> model.featureImportances
|
|
|
|
SparseVector(1, {0: 1.0})
|
2015-07-07 11:58:08 -04:00
|
|
|
>>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])
|
|
|
|
True
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.predict(test0.head().features)
|
|
|
|
0.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(test0.head().features)
|
|
|
|
DenseVector([1.1697, -1.1697])
|
|
|
|
>>> model.predictProbability(test0.head().features)
|
|
|
|
DenseVector([0.9121, 0.0879])
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> result = model.transform(test0).head()
|
|
|
|
>>> result.prediction
|
2015-05-13 18:13:09 -04:00
|
|
|
0.0
|
2019-08-23 18:18:35 -04:00
|
|
|
>>> result.leafId
|
|
|
|
DenseVector([0.0, 0.0, 0.0, 0.0, 0.0])
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
|
2015-05-13 18:13:09 -04:00
|
|
|
>>> model.transform(test1).head().prediction
|
|
|
|
1.0
|
2016-06-02 18:55:14 -04:00
|
|
|
>>> model.totalNumNodes
|
|
|
|
15
|
|
|
|
>>> print(model.toDebugString)
|
2019-11-11 14:03:26 -05:00
|
|
|
GBTClassificationModel...numTrees=5...
|
2016-04-15 00:36:03 -04:00
|
|
|
>>> gbtc_path = temp_path + "gbtc"
|
|
|
|
>>> gbt.save(gbtc_path)
|
|
|
|
>>> gbt2 = GBTClassifier.load(gbtc_path)
|
|
|
|
>>> gbt2.getMaxDepth()
|
|
|
|
2
|
|
|
|
>>> model_path = temp_path + "gbtc_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = GBTClassificationModel.load(model_path)
|
|
|
|
>>> model.featureImportances == model2.featureImportances
|
|
|
|
True
|
|
|
|
>>> model.treeWeights == model2.treeWeights
|
|
|
|
True
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2016-06-20 19:28:11 -04:00
|
|
|
>>> model.trees
|
2019-11-11 14:03:26 -05:00
|
|
|
[DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...]
|
2018-05-15 17:16:31 -04:00
|
|
|
>>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0),)],
|
|
|
|
... ["indexed", "features"])
|
|
|
|
>>> model.evaluateEachIteration(validation)
|
|
|
|
[0.25..., 0.23..., 0.21..., 0.19..., 0.18...]
|
2018-05-30 14:04:09 -04:00
|
|
|
>>> model.numClasses
|
|
|
|
2
|
2018-12-07 16:53:35 -05:00
|
|
|
>>> gbt = gbt.setValidationIndicatorCol("validationIndicator")
|
|
|
|
>>> gbt.getValidationIndicatorCol()
|
|
|
|
'validationIndicator'
|
|
|
|
>>> gbt.getValidationTol()
|
|
|
|
0.01
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
@keyword_only
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2015-05-13 18:13:09 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic",
|
2018-12-07 16:53:35 -05:00
|
|
|
maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, impurity="variance",
|
2019-08-23 18:18:35 -04:00
|
|
|
featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None,
|
2019-12-09 14:39:33 -05:00
|
|
|
leafCol="", minWeightFractionPerNode=0.0, weightCol=None):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
__init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2015-05-14 21:16:22 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
|
2018-05-30 14:04:09 -04:00
|
|
|
lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \
|
2018-12-07 16:53:35 -05:00
|
|
|
impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \
|
2019-12-09 14:39:33 -05:00
|
|
|
validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, \
|
|
|
|
weightCol=None)
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
|
|
|
super(GBTClassifier, self).__init__()
|
2015-05-18 15:02:18 -04:00
|
|
|
self._java_obj = self._new_java_obj(
|
|
|
|
"org.apache.spark.ml.classification.GBTClassifier", self.uid)
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-05-13 18:13:09 -04:00
|
|
|
self.setParams(**kwargs)
|
|
|
|
|
|
|
|
@keyword_only
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.4.0")
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2015-05-13 18:13:09 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
|
2018-05-30 14:04:09 -04:00
|
|
|
lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
|
2018-12-07 16:53:35 -05:00
|
|
|
impurity="variance", featureSubsetStrategy="all", validationTol=0.01,
|
2019-12-09 14:39:33 -05:00
|
|
|
validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0,
|
|
|
|
weightCol=None):
|
2015-05-13 18:13:09 -04:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2015-05-14 21:16:22 -04:00
|
|
|
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
|
|
|
|
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
|
2018-05-30 14:04:09 -04:00
|
|
|
lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \
|
2018-12-07 16:53:35 -05:00
|
|
|
impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \
|
2019-12-09 14:39:33 -05:00
|
|
|
validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, \
|
|
|
|
weightCol=None)
|
2015-05-13 18:13:09 -04:00
|
|
|
Sets params for Gradient Boosted Tree Classification.
|
|
|
|
"""
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-05-13 18:13:09 -04:00
|
|
|
return self._set(**kwargs)
|
|
|
|
|
|
|
|
def _create_model(self, java_model):
|
|
|
|
return GBTClassificationModel(java_model)
|
|
|
|
|
2019-07-20 11:44:33 -04:00
|
|
|
def setMaxDepth(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxDepth`.
|
|
|
|
"""
|
|
|
|
return self._set(maxDepth=value)
|
|
|
|
|
|
|
|
def setMaxBins(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxBins`.
|
|
|
|
"""
|
|
|
|
return self._set(maxBins=value)
|
|
|
|
|
|
|
|
def setMinInstancesPerNode(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minInstancesPerNode`.
|
|
|
|
"""
|
|
|
|
return self._set(minInstancesPerNode=value)
|
|
|
|
|
|
|
|
def setMinInfoGain(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minInfoGain`.
|
|
|
|
"""
|
|
|
|
return self._set(minInfoGain=value)
|
|
|
|
|
|
|
|
def setMaxMemoryInMB(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxMemoryInMB`.
|
|
|
|
"""
|
|
|
|
return self._set(maxMemoryInMB=value)
|
|
|
|
|
|
|
|
def setCacheNodeIds(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`cacheNodeIds`.
|
|
|
|
"""
|
|
|
|
return self._set(cacheNodeIds=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setImpurity(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`impurity`.
|
|
|
|
"""
|
|
|
|
return self._set(impurity=value)
|
|
|
|
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.4.0")
|
2015-05-13 18:13:09 -04:00
|
|
|
def setLossType(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`lossType`.
|
|
|
|
"""
|
2016-05-03 10:46:13 -04:00
|
|
|
return self._set(lossType=value)
|
2015-05-13 18:13:09 -04:00
|
|
|
|
2019-07-20 11:44:33 -04:00
|
|
|
@since("1.4.0")
|
|
|
|
def setSubsamplingRate(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`subsamplingRate`.
|
|
|
|
"""
|
|
|
|
return self._set(subsamplingRate=value)
|
|
|
|
|
2018-05-30 14:04:09 -04:00
|
|
|
@since("2.4.0")
|
|
|
|
def setFeatureSubsetStrategy(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`featureSubsetStrategy`.
|
|
|
|
"""
|
|
|
|
return self._set(featureSubsetStrategy=value)
|
|
|
|
|
2018-12-07 16:53:35 -05:00
|
|
|
@since("3.0.0")
|
|
|
|
def setValidationIndicatorCol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`validationIndicatorCol`.
|
|
|
|
"""
|
|
|
|
return self._set(validationIndicatorCol=value)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
@since("1.4.0")
|
|
|
|
def setMaxIter(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxIter`.
|
|
|
|
"""
|
|
|
|
return self._set(maxIter=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setCheckpointInterval(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`checkpointInterval`.
|
|
|
|
"""
|
|
|
|
return self._set(checkpointInterval=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setSeed(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`seed`.
|
|
|
|
"""
|
|
|
|
return self._set(seed=value)
|
|
|
|
|
|
|
|
@since("1.4.0")
|
|
|
|
def setStepSize(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`stepSize`.
|
|
|
|
"""
|
|
|
|
return self._set(stepSize=value)
|
|
|
|
|
2019-12-09 14:39:33 -05:00
|
|
|
@since("3.0.0")
|
|
|
|
def setWeightCol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`weightCol`.
|
|
|
|
"""
|
|
|
|
return self._set(weightCol=value)
|
|
|
|
|
|
|
|
@since("3.0.0")
|
|
|
|
def setMinWeightFractionPerNode(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`minWeightFractionPerNode`.
|
|
|
|
"""
|
|
|
|
return self._set(minWeightFractionPerNode=value)
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class GBTClassificationModel(_TreeEnsembleModel, _JavaProbabilisticClassificationModel,
                             _GBTClassifierParams, JavaMLWritable, JavaMLReadable):
    """
    Model fitted by GBTClassifier.

    .. versionadded:: 1.4.0
    """

    @property
    def featureImportances(self):
        """
        Estimate of the importance of each feature.

        Each feature's importance is the average of its importance across all
        trees in the ensemble. The importance vector is normalized to sum to 1.
        This method is suggested by Hastie et al. (Hastie, Tibshirani, Friedman.
        "The Elements of Statistical Learning, 2nd Edition." 2001.) and follows
        the implementation from scikit-learn.

        .. versionadded:: 2.0.0

        See Also
        --------
        DecisionTreeClassificationModel.featureImportances
        """
        importances = self._call_java("featureImportances")
        return importances

    @property
    @since("2.0.0")
    def trees(self):
        """Trees in this ensemble. Warning: These have null parent Estimators."""
        java_trees = self._call_java("trees")
        return [DecisionTreeRegressionModel(java_tree) for java_tree in java_trees]

    def evaluateEachIteration(self, dataset):
        """
        Method to compute error or loss for every iteration of gradient boosting.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        return self._call_java("evaluateEachIteration", dataset)
|
|
|
|
|
2015-05-13 18:13:09 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _NaiveBayesParams(_PredictorParams, HasWeightCol):
    """
    Params for :py:class:`NaiveBayes` and :py:class:`NaiveBayesModel`.

    .. versionadded:: 3.0.0
    """

    # Additive (Laplace/Lidstone) smoothing parameter.
    smoothing = Param(Params._dummy(), "smoothing",
                      "The smoothing parameter, should be >= 0, "
                      "default is 1.0",
                      typeConverter=TypeConverters.toFloat)
    # Which Naive Bayes variant to fit.
    modelType = Param(Params._dummy(), "modelType",
                      "The model type which is a string "
                      "(case-sensitive). Supported options: multinomial (default), bernoulli "
                      "and gaussian.",
                      typeConverter=TypeConverters.toString)

    def __init__(self, *args):
        super(_NaiveBayesParams, self).__init__(*args)
        self._setDefault(smoothing=1.0, modelType="multinomial")

    @since("1.5.0")
    def getSmoothing(self):
        """Gets the value of smoothing or its default value."""
        return self.getOrDefault(self.smoothing)

    @since("1.5.0")
    def getModelType(self):
        """Gets the value of modelType or its default value."""
        return self.getOrDefault(self.modelType)
|
|
|
|
|
|
|
|
|
2015-07-31 02:03:48 -04:00
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class NaiveBayes(_JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds, HasWeightCol,
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
JavaMLWritable, JavaMLReadable):
|
2015-07-31 02:03:48 -04:00
|
|
|
"""
|
|
|
|
Naive Bayes Classifiers.
|
2020-11-09 19:33:48 -05:00
|
|
|
It supports both Multinomial and Bernoulli NB. `Multinomial NB \
|
2016-05-09 04:11:17 -04:00
|
|
|
<http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html>`_
|
2015-08-12 16:24:18 -04:00
|
|
|
can handle finitely supported discrete data. For example, by converting documents into
|
|
|
|
TF-IDF vectors, it can be used for document classification. By making every vector a
|
2020-11-09 19:33:48 -05:00
|
|
|
binary (0/1) data, it can also be used as `Bernoulli NB \
|
2016-05-09 04:11:17 -04:00
|
|
|
<http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html>`_.
|
2020-11-09 19:33:48 -05:00
|
|
|
|
2019-11-17 21:05:42 -05:00
|
|
|
The input feature values for Multinomial NB and Bernoulli NB must be nonnegative.
|
2019-11-21 05:22:05 -05:00
|
|
|
Since 3.0.0, it supports Complement NB which is an adaptation of the Multinomial NB.
|
|
|
|
Specifically, Complement NB uses statistics from the complement of each class to compute
|
|
|
|
the model's coefficients. The inventors of Complement NB show empirically that the parameter
|
|
|
|
estimates for CNB are more stable than those for Multinomial NB. Like Multinomial NB, the
|
|
|
|
input feature values for Complement NB must be nonnegative.
|
2020-11-09 19:33:48 -05:00
|
|
|
Since 3.0.0, it also supports `Gaussian NB \
|
2019-11-17 21:05:42 -05:00
|
|
|
<https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes>`_.
|
|
|
|
which can handle continuous data.
|
2015-07-31 02:03:48 -04:00
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 1.5.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2015-07-31 02:03:48 -04:00
|
|
|
>>> from pyspark.sql import Row
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([
|
2016-10-12 22:52:57 -04:00
|
|
|
... Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
|
|
|
|
... Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
|
|
|
|
... Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])
|
|
|
|
>>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
|
2015-07-31 02:03:48 -04:00
|
|
|
>>> model = nb.fit(df)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.setFeaturesCol("features")
|
[SPARK-29867][ML][PYTHON] Add __repr__ in Python ML Models
### What changes were proposed in this pull request?
Add ```__repr__``` in Python ML Models
### Why are the changes needed?
In Python ML Models, some of them have ```__repr__```, others don't. In the doctest, when calling Model.setXXX, some of the Models print out the xxxModel... correctly, some of them can't because of lacking the ```__repr__``` method. For example:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixture...
```
After the change, the above code will become the following:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixtureModel...
```
### Does this PR introduce any user-facing change?
Yes.
### How was this patch tested?
doctest
Closes #26489 from huaxingao/spark-29876.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-11-16 00:44:39 -05:00
|
|
|
NaiveBayesModel...
|
2019-10-18 05:26:54 -04:00
|
|
|
>>> model.getSmoothing()
|
|
|
|
1.0
|
2015-07-31 02:03:48 -04:00
|
|
|
>>> model.pi
|
2016-10-12 22:52:57 -04:00
|
|
|
DenseVector([-0.81..., -0.58...])
|
2015-07-31 02:03:48 -04:00
|
|
|
>>> model.theta
|
2016-10-12 22:52:57 -04:00
|
|
|
DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)
|
2019-12-02 21:02:23 -05:00
|
|
|
>>> model.sigma
|
|
|
|
DenseMatrix(0, 0, [...], ...)
|
2015-07-31 02:03:48 -04:00
|
|
|
>>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.predict(test0.head().features)
|
|
|
|
1.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(test0.head().features)
|
|
|
|
DenseVector([-1.72..., -0.99...])
|
|
|
|
>>> model.predictProbability(test0.head().features)
|
|
|
|
DenseVector([0.32..., 0.67...])
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result = model.transform(test0).head()
|
|
|
|
>>> result.prediction
|
2015-07-31 02:03:48 -04:00
|
|
|
1.0
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result.probability
|
2016-10-12 22:52:57 -04:00
|
|
|
DenseVector([0.32..., 0.67...])
|
2015-08-03 01:19:27 -04:00
|
|
|
>>> result.rawPrediction
|
2016-10-12 22:52:57 -04:00
|
|
|
DenseVector([-1.72..., -0.99...])
|
2015-07-31 02:03:48 -04:00
|
|
|
>>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
|
|
|
|
>>> model.transform(test1).head().prediction
|
|
|
|
1.0
|
2016-03-16 17:21:42 -04:00
|
|
|
>>> nb_path = temp_path + "/nb"
|
|
|
|
>>> nb.save(nb_path)
|
|
|
|
>>> nb2 = NaiveBayes.load(nb_path)
|
|
|
|
>>> nb2.getSmoothing()
|
|
|
|
1.0
|
|
|
|
>>> model_path = temp_path + "/nb_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = NaiveBayesModel.load(model_path)
|
|
|
|
>>> model.pi == model2.pi
|
|
|
|
True
|
|
|
|
>>> model.theta == model2.theta
|
|
|
|
True
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2016-05-13 02:39:59 -04:00
|
|
|
>>> nb = nb.setThresholds([0.01, 10.00])
|
|
|
|
>>> model3 = nb.fit(df)
|
|
|
|
>>> result = model3.transform(test0).head()
|
|
|
|
>>> result.prediction
|
|
|
|
0.0
|
2019-11-17 21:05:42 -05:00
|
|
|
>>> nb3 = NaiveBayes().setModelType("gaussian")
|
|
|
|
>>> model4 = nb3.fit(df)
|
|
|
|
>>> model4.getModelType()
|
|
|
|
'gaussian'
|
|
|
|
>>> model4.sigma
|
|
|
|
DenseMatrix(2, 2, [0.0, 0.25, 0.0, 0.0], 1)
|
2019-11-21 05:22:05 -05:00
|
|
|
>>> nb5 = NaiveBayes(smoothing=1.0, modelType="complement", weightCol="weight")
|
|
|
|
>>> model5 = nb5.fit(df)
|
|
|
|
>>> model5.getModelType()
|
|
|
|
'complement'
|
|
|
|
>>> model5.theta
|
|
|
|
DenseMatrix(2, 2, [...], 1)
|
2019-12-02 21:02:23 -05:00
|
|
|
>>> model5.sigma
|
|
|
|
DenseMatrix(0, 0, [...], ...)
|
2015-07-31 02:03:48 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
@keyword_only
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2015-08-03 01:19:27 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,
|
2016-10-12 22:52:57 -04:00
|
|
|
modelType="multinomial", thresholds=None, weightCol=None):
|
2015-07-31 02:03:48 -04:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
__init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2015-08-03 01:19:27 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, \
|
2016-10-12 22:52:57 -04:00
|
|
|
modelType="multinomial", thresholds=None, weightCol=None)
|
2015-07-31 02:03:48 -04:00
|
|
|
"""
|
|
|
|
super(NaiveBayes, self).__init__()
|
|
|
|
self._java_obj = self._new_java_obj(
|
|
|
|
"org.apache.spark.ml.classification.NaiveBayes", self.uid)
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-07-31 02:03:48 -04:00
|
|
|
self.setParams(**kwargs)
|
|
|
|
|
|
|
|
@keyword_only
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.5.0")
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2015-08-03 01:19:27 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0,
|
2016-10-12 22:52:57 -04:00
|
|
|
modelType="multinomial", thresholds=None, weightCol=None):
|
2015-07-31 02:03:48 -04:00
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completel, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2015-08-03 01:19:27 -04:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, \
|
2016-10-12 22:52:57 -04:00
|
|
|
modelType="multinomial", thresholds=None, weightCol=None)
|
2015-07-31 02:03:48 -04:00
|
|
|
Sets params for Naive Bayes.
|
|
|
|
"""
|
2017-03-03 19:43:45 -05:00
|
|
|
kwargs = self._input_kwargs
|
2015-07-31 02:03:48 -04:00
|
|
|
return self._set(**kwargs)
|
|
|
|
|
|
|
|
def _create_model(self, java_model):
    """
    Wrap the fitted Java model object in a :py:class:`NaiveBayesModel`.
    """
    return NaiveBayesModel(java_model)
|
|
|
|
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.5.0")
|
2015-07-31 02:03:48 -04:00
|
|
|
def setSmoothing(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`smoothing`.
|
|
|
|
"""
|
2016-05-03 10:46:13 -04:00
|
|
|
return self._set(smoothing=value)
|
2015-07-31 02:03:48 -04:00
|
|
|
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.5.0")
|
2015-07-31 02:03:48 -04:00
|
|
|
def setModelType(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`modelType`.
|
|
|
|
"""
|
2016-05-03 10:46:13 -04:00
|
|
|
return self._set(modelType=value)
|
2015-07-31 02:03:48 -04:00
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
def setWeightCol(self, value):
    """
    Sets the value of :py:attr:`weightCol`.

    Parameters
    ----------
    value : str
        The name of the instance-weight column.
    """
    return self._set(weightCol=value)
|
|
|
|
|
2015-07-31 02:03:48 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class NaiveBayesModel(_JavaProbabilisticClassificationModel, _NaiveBayesParams, JavaMLWritable,
                      JavaMLReadable):
    """
    Model fitted by NaiveBayes.

    .. versionadded:: 1.5.0
    """

    @property
    @since("2.0.0")
    def pi(self):
        """
        log of class priors.
        """
        # Delegates to the companion Java model object.
        return self._call_java("pi")

    @property
    @since("2.0.0")
    def theta(self):
        """
        log of class conditional probabilities.
        """
        # Delegates to the companion Java model object.
        return self._call_java("theta")

    @property
    @since("3.0.0")
    def sigma(self):
        """
        variance of each feature.
        """
        # Delegates to the companion Java model object.
        # NOTE(review): presumably only populated for the Gaussian model
        # type — confirm against the Scala implementation.
        return self._call_java("sigma")
|
|
|
|
|
2015-07-31 02:03:48 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _MultilayerPerceptronParams(_ProbabilisticClassifierParams, HasSeed, HasMaxIter,
                                  HasTol, HasStepSize, HasSolver, HasBlockSize):
    """
    Params for :py:class:`MultilayerPerceptronClassifier`.

    .. versionadded:: 3.0.0
    """

    layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                   "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
                   "neurons and output layer of 10 neurons.",
                   typeConverter=TypeConverters.toListInt)
    # Overrides HasSolver's doc: only l-bfgs and gd are supported here.
    solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " +
                   "options: l-bfgs, gd.", typeConverter=TypeConverters.toString)
    initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.",
                           typeConverter=TypeConverters.toVector)

    def __init__(self, *args):
        super(_MultilayerPerceptronParams, self).__init__(*args)
        # Defaults are set here (on the shared params mixin) so that both the
        # estimator and the fitted model see the same defaults, matching Scala
        # (solver defaults to "l-bfgs", not HasSolver's "auto").
        self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs")

    @since("1.6.0")
    def getLayers(self):
        """
        Gets the value of layers or its default value.
        """
        return self.getOrDefault(self.layers)

    @since("2.0.0")
    def getInitialWeights(self):
        """
        Gets the value of initialWeights or its default value.
        """
        return self.getOrDefault(self.initialWeights)
|
|
|
|
|
|
|
|
|
2015-09-11 11:52:28 -04:00
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class MultilayerPerceptronClassifier(_JavaProbabilisticClassifier, _MultilayerPerceptronParams,
|
2019-10-18 05:26:54 -04:00
|
|
|
JavaMLWritable, JavaMLReadable):
|
2015-09-11 11:52:28 -04:00
|
|
|
"""
|
|
|
|
Classifier trainer based on the Multilayer Perceptron.
|
|
|
|
Each layer has sigmoid activation function, output layer has softmax.
|
|
|
|
Number of inputs has to be equal to the size of feature vectors.
|
|
|
|
Number of outputs has to be equal to the total number of labels.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 1.6.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([
|
2015-09-11 11:52:28 -04:00
|
|
|
... (0.0, Vectors.dense([0.0, 0.0])),
|
|
|
|
... (1.0, Vectors.dense([0.0, 1.0])),
|
|
|
|
... (1.0, Vectors.dense([1.0, 0.0])),
|
|
|
|
... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])
|
2020-02-09 00:14:30 -05:00
|
|
|
>>> mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
|
2019-10-27 23:36:10 -04:00
|
|
|
>>> mlp.setMaxIter(100)
|
|
|
|
MultilayerPerceptronClassifier...
|
|
|
|
>>> mlp.getMaxIter()
|
|
|
|
100
|
2020-02-09 00:14:30 -05:00
|
|
|
>>> mlp.getBlockSize()
|
|
|
|
128
|
|
|
|
>>> mlp.setBlockSize(1)
|
|
|
|
MultilayerPerceptronClassifier...
|
|
|
|
>>> mlp.getBlockSize()
|
|
|
|
1
|
2015-09-11 11:52:28 -04:00
|
|
|
>>> model = mlp.fit(df)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.setFeaturesCol("features")
|
[SPARK-29867][ML][PYTHON] Add __repr__ in Python ML Models
### What changes were proposed in this pull request?
Add ```__repr__``` in Python ML Models
### Why are the changes needed?
In Python ML Models, some of them have ```__repr__```, others don't. In the doctest, when calling Model.setXXX, some of the Models print out the xxxModel... correctly, some of them can't because of lacking the ```__repr__``` method. For example:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixture...
```
After the change, the above code will become the following:
```
>>> gm = GaussianMixture(k=3, tol=0.0001, seed=10)
>>> model = gm.fit(df)
>>> model.setPredictionCol("newPrediction")
GaussianMixtureModel...
```
### Does this PR introduce any user-facing change?
Yes.
### How was this patch tested?
doctest
Closes #26489 from huaxingao/spark-29876.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-11-16 00:44:39 -05:00
|
|
|
MultilayerPerceptronClassificationModel...
|
2020-01-03 13:01:11 -05:00
|
|
|
>>> model.getMaxIter()
|
|
|
|
100
|
|
|
|
>>> model.getLayers()
|
2016-06-03 18:56:17 -04:00
|
|
|
[2, 2, 2]
|
2015-09-11 11:52:28 -04:00
|
|
|
>>> model.weights.size
|
2016-06-03 18:56:17 -04:00
|
|
|
12
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> testDF = spark.createDataFrame([
|
2015-09-11 11:52:28 -04:00
|
|
|
... (Vectors.dense([1.0, 0.0]),),
|
|
|
|
... (Vectors.dense([0.0, 0.0]),)], ["features"])
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.predict(testDF.head().features)
|
|
|
|
1.0
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(testDF.head().features)
|
|
|
|
DenseVector([-16.208, 16.344])
|
|
|
|
>>> model.predictProbability(testDF.head().features)
|
|
|
|
DenseVector([0.0, 1.0])
|
2017-08-23 00:16:34 -04:00
|
|
|
>>> model.transform(testDF).select("features", "prediction").show()
|
2015-09-11 11:52:28 -04:00
|
|
|
+---------+----------+
|
|
|
|
| features|prediction|
|
|
|
|
+---------+----------+
|
|
|
|
|[1.0,0.0]| 1.0|
|
|
|
|
|[0.0,0.0]| 0.0|
|
|
|
|
+---------+----------+
|
|
|
|
...
|
2016-03-30 18:47:01 -04:00
|
|
|
>>> mlp_path = temp_path + "/mlp"
|
|
|
|
>>> mlp.save(mlp_path)
|
|
|
|
>>> mlp2 = MultilayerPerceptronClassifier.load(mlp_path)
|
|
|
|
>>> mlp2.getBlockSize()
|
|
|
|
1
|
|
|
|
>>> model_path = temp_path + "/mlp_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = MultilayerPerceptronClassificationModel.load(model_path)
|
2020-01-03 13:01:11 -05:00
|
|
|
>>> model.getLayers() == model2.getLayers()
|
2016-03-30 18:47:01 -04:00
|
|
|
True
|
|
|
|
>>> model.weights == model2.weights
|
|
|
|
True
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(testDF).take(1) == model2.transform(testDF).take(1)
|
|
|
|
True
|
2016-06-03 18:56:17 -04:00
|
|
|
>>> mlp2 = mlp2.setInitialWeights(list(range(0, 12)))
|
|
|
|
>>> model3 = mlp2.fit(df)
|
|
|
|
>>> model3.weights != model2.weights
|
|
|
|
True
|
2020-01-03 13:01:11 -05:00
|
|
|
>>> model3.getLayers() == model.getLayers()
|
2016-06-03 18:56:17 -04:00
|
|
|
True
|
2015-09-11 11:52:28 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
@keyword_only
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completely, i.e.:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
    def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                 maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,
                 solver="l-bfgs", initialWeights=None, probabilityCol="probability",
                 rawPredictionCol="rawPrediction"):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \
                 solver="l-bfgs", initialWeights=None, probabilityCol="probability", \
                 rawPredictionCol="rawPrediction")
        """
        # Initialize the Params/wrapper machinery, then create the companion JVM
        # estimator that backs this Python class.
        super(MultilayerPerceptronClassifier, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
        # _input_kwargs is populated by the keyword_only mechanism with exactly the
        # keyword arguments the caller supplied; forward them to setParams so that
        # only explicitly-passed params are set.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
|
|
|
|
|
|
|
|
    @keyword_only
    @since("1.6.0")
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03,
                  solver="l-bfgs", initialWeights=None, probabilityCol="probability",
                  rawPredictionCol="rawPrediction"):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \
                  solver="l-bfgs", initialWeights=None, probabilityCol="probability", \
                  rawPredictionCol="rawPrediction"):
        Sets params for MultilayerPerceptronClassifier.
        """
        # keyword_only captures the caller's explicit keyword arguments in
        # _input_kwargs; _set applies only those, leaving other params untouched.
        kwargs = self._input_kwargs
        return self._set(**kwargs)
|
2015-09-11 11:52:28 -04:00
|
|
|
|
|
|
|
    def _create_model(self, java_model):
        # Wrap the fitted JVM model object in its Python companion model class.
        return MultilayerPerceptronClassificationModel(java_model)
|
|
|
|
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.6.0")
|
2015-09-11 11:52:28 -04:00
|
|
|
def setLayers(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`layers`.
|
|
|
|
"""
|
2016-05-03 10:46:13 -04:00
|
|
|
return self._set(layers=value)
|
2015-09-11 11:52:28 -04:00
|
|
|
|
2015-11-09 16:16:04 -05:00
|
|
|
@since("1.6.0")
|
2015-09-11 11:52:28 -04:00
|
|
|
def setBlockSize(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`blockSize`.
|
|
|
|
"""
|
2016-05-03 10:46:13 -04:00
|
|
|
return self._set(blockSize=value)
|
2015-09-11 11:52:28 -04:00
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
@since("2.0.0")
|
|
|
|
def setInitialWeights(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`initialWeights`.
|
|
|
|
"""
|
|
|
|
return self._set(initialWeights=value)
|
|
|
|
|
|
|
|
def setMaxIter(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`maxIter`.
|
|
|
|
"""
|
|
|
|
return self._set(maxIter=value)
|
|
|
|
|
|
|
|
def setSeed(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`seed`.
|
|
|
|
"""
|
|
|
|
return self._set(seed=value)
|
|
|
|
|
|
|
|
def setTol(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`tol`.
|
|
|
|
"""
|
|
|
|
return self._set(tol=value)
|
|
|
|
|
2016-06-03 18:56:17 -04:00
|
|
|
@since("2.0.0")
|
|
|
|
def setStepSize(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`stepSize`.
|
|
|
|
"""
|
|
|
|
return self._set(stepSize=value)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
def setSolver(self, value):
|
2016-06-03 18:56:17 -04:00
|
|
|
"""
|
2019-10-27 23:36:10 -04:00
|
|
|
Sets the value of :py:attr:`solver`.
|
2016-06-03 18:56:17 -04:00
|
|
|
"""
|
2019-10-27 23:36:10 -04:00
|
|
|
return self._set(solver=value)
|
2016-06-03 18:56:17 -04:00
|
|
|
|
2015-09-11 11:52:28 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class MultilayerPerceptronClassificationModel(_JavaProbabilisticClassificationModel,
                                              _MultilayerPerceptronParams, JavaMLWritable,
                                              JavaMLReadable, HasTrainingSummary):
    """
    Model fitted by MultilayerPerceptronClassifier.

    .. versionadded:: 1.6.0
    """

    @property
    @since("2.0.0")
    def weights(self):
        """
        the weights of layers.
        """
        # Delegates to the backing JVM model object.
        return self._call_java("weights")

    # NOTE(review): `weights` above is exposed via @property, but `summary` here is
    # not — confirm whether @property was intended for API consistency.
    @since("3.1.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set. An exception is thrown if `trainingSummary is None`.
        """
        if self.hasSummary:
            # Wrap the generic training summary (from HasTrainingSummary) in the
            # MLP-specific training-summary class.
            return MultilayerPerceptronClassificationTrainingSummary(
                super(MultilayerPerceptronClassificationModel, self).summary)
        else:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        # Fail fast with a clear message rather than an opaque JVM error.
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        # Evaluation itself happens on the JVM side; wrap the returned summary.
        java_mlp_summary = self._call_java("evaluate", dataset)
        return MultilayerPerceptronClassificationSummary(java_mlp_summary)
|
|
|
|
|
|
|
|
|
|
|
|
class MultilayerPerceptronClassificationSummary(_ClassificationSummary):
    """
    Abstraction for MultilayerPerceptronClassifier Results for a given model.

    .. versionadded:: 3.1.0
    """
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class MultilayerPerceptronClassificationTrainingSummary(MultilayerPerceptronClassificationSummary,
                                                        _TrainingSummary):
    """
    Abstraction for MultilayerPerceptronClassifier Training results.

    .. versionadded:: 3.1.0
    """
|
|
|
|
|
2015-09-11 11:52:28 -04:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClasssificationModel` `ProbabilisticClassifier`, `ProbabilisticClasssificationModel`, `Regressor`, `RegrssionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class _OneVsRestParams(_ClassifierParams, HasWeightCol):
    """
    Params for :py:class:`OneVsRest` and :py:class:`OneVsRestModel`.
    """

    # The base binary classifier trained once per class in the one-vs-rest scheme.
    classifier = Param(Params._dummy(), "classifier", "base binary classifier")

    @since("2.0.0")
    def getClassifier(self):
        """
        Gets the value of classifier or its default value.
        """
        return self.getOrDefault(self.classifier)
|
|
|
|
|
|
|
|
|
2016-04-15 15:58:38 -04:00
|
|
|
@inherit_doc
|
2020-12-03 19:35:50 -05:00
|
|
|
class OneVsRest(Estimator, _OneVsRestParams, HasParallelism, MLReadable, MLWritable):
|
2016-04-15 15:58:38 -04:00
|
|
|
"""
|
|
|
|
Reduction of Multiclass Classification to Binary Classification.
|
|
|
|
Performs reduction using one against all strategy.
|
|
|
|
For a multiclass classification with k classes, train k models (one per class).
|
|
|
|
Each example is scored against all k models and the model with highest score
|
|
|
|
is picked to label the example.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 2.0.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2016-04-15 15:58:38 -04:00
|
|
|
>>> from pyspark.sql import Row
|
2016-05-17 15:51:07 -04:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
2017-04-26 09:34:18 -04:00
|
|
|
>>> data_path = "data/mllib/sample_multiclass_classification_data.txt"
|
|
|
|
>>> df = spark.read.format("libsvm").load(data_path)
|
|
|
|
>>> lr = LogisticRegression(regParam=0.01)
|
2016-04-15 15:58:38 -04:00
|
|
|
>>> ovr = OneVsRest(classifier=lr)
|
2019-03-02 10:09:28 -05:00
|
|
|
>>> ovr.getRawPredictionCol()
|
|
|
|
'rawPrediction'
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python have the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> ovr.setPredictionCol("newPrediction")
|
|
|
|
OneVsRest...
|
2016-04-15 15:58:38 -04:00
|
|
|
>>> model = ovr.fit(df)
|
2017-04-26 09:34:18 -04:00
|
|
|
>>> model.models[0].coefficients
|
|
|
|
DenseVector([0.5..., -1.0..., 3.4..., 4.2...])
|
|
|
|
>>> model.models[1].coefficients
|
|
|
|
DenseVector([-2.1..., 3.1..., -2.6..., -2.3...])
|
|
|
|
>>> model.models[2].coefficients
|
|
|
|
DenseVector([0.3..., -3.4..., 1.0..., -1.1...])
|
2016-04-15 15:58:38 -04:00
|
|
|
>>> [x.intercept for x in model.models]
|
2017-04-26 09:34:18 -04:00
|
|
|
[-2.7..., -2.5..., -1.3...]
|
|
|
|
>>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0, 1.0, 1.0))]).toDF()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.transform(test0).head().newPrediction
|
2016-04-15 15:58:38 -04:00
|
|
|
0.0
|
2017-04-26 09:34:18 -04:00
|
|
|
>>> test1 = sc.parallelize([Row(features=Vectors.sparse(4, [0], [1.0]))]).toDF()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.transform(test1).head().newPrediction
|
2016-04-15 15:58:38 -04:00
|
|
|
2.0
|
2017-04-26 09:34:18 -04:00
|
|
|
>>> test2 = sc.parallelize([Row(features=Vectors.dense(0.5, 0.4, 0.3, 0.2))]).toDF()
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model.transform(test2).head().newPrediction
|
2017-04-26 09:34:18 -04:00
|
|
|
0.0
|
2017-01-31 18:42:36 -05:00
|
|
|
>>> model_path = temp_path + "/ovr_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = OneVsRestModel.load(model_path)
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
>>> model2.transform(test0).head().newPrediction
|
2017-04-26 09:34:18 -04:00
|
|
|
0.0
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2019-03-02 10:09:28 -05:00
|
|
|
>>> model.transform(test2).columns
|
[SPARK-28985][PYTHON][ML] Add common classes (JavaPredictor/JavaClassificationModel/JavaProbabilisticClassifier) in PYTHON
### What changes were proposed in this pull request?
Add some common classes in Python to make it have the same structure as Scala
1. Scala has ClassifierParams/Classifier/ClassificationModel:
```
trait ClassifierParams
extends PredictorParams with HasRawPredictionCol
abstract class Classifier
extends Predictor with ClassifierParams {
def setRawPredictionCol
}
abstract class ClassificationModel
extends PredictionModel with ClassifierParams {
def setRawPredictionCol
}
```
This PR makes Python has the following:
```
class JavaClassifierParams(HasRawPredictionCol, JavaPredictorParams):
pass
class JavaClassifier(JavaPredictor, JavaClassifierParams):
def setRawPredictionCol
class JavaClassificationModel(JavaPredictionModel, JavaClassifierParams):
def setRawPredictionCol
```
2. Scala has ProbabilisticClassifierParams/ProbabilisticClassifier/ProbabilisticClassificationModel:
```
trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol with HasThresholds
abstract class ProbabilisticClassifier
extends Classifier with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
abstract class ProbabilisticClassificationModel
extends ClassificationModel with ProbabilisticClassifierParams {
def setProbabilityCol
def setThresholds
}
```
This PR makes Python have the following:
```
class JavaProbabilisticClassifierParams(HasProbabilityCol, HasThresholds, JavaClassifierParams):
pass
class JavaProbabilisticClassifier(JavaClassifier, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
class JavaProbabilisticClassificationModel(JavaClassificationModel, JavaProbabilisticClassifierParams):
def setProbabilityCol
def setThresholds
```
3. Scala has PredictorParams/Predictor/PredictionModel:
```
trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol
abstract class Predictor
extends Estimator with PredictorParams {
def setLabelCol
def setFeaturesCol
def setPredictionCol
}
abstract class PredictionModel
extends Model with PredictorParams {
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
}
```
This PR makes Python have the following:
```
class JavaPredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
pass
class JavaPredictor(JavaEstimator, JavaPredictorParams):
def setLabelCol
def setFeaturesCol
def setPredictionCol
class JavaPredictionModel(JavaModel, JavaPredictorParams):
def setFeaturesCol
def setPredictionCol
def numFeatures
def predict
```
### Why are the changes needed?
Have parity between Python and Scala ML
### Does this PR introduce any user-facing change?
Yes. Add the following changes:
```
LinearSVCModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- predict
```
```
LogisticRegressionModel
DecisionTreeClassificationModel
RandomForestClassificationModel
GBTClassificationModel
NaiveBayesModel
MultilayerPerceptronClassificationModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- get/setRawPredictionCol
- get/setProbabilityCol
- predict
```
```
LinearRegressionModel
IsotonicRegressionModel
DecisionTreeRegressionModel
RandomForestRegressionModel
GBTRegressionModel
AFTSurvivalRegressionModel
GeneralizedLinearRegressionModel
- get/setFeatureCol
- get/setPredictionCol
- get/setLabelCol
- predict
```
### How was this patch tested?
Add a few doc tests.
Closes #25776 from huaxingao/spark-28985.
Authored-by: Huaxin Gao <huaxing@us.ibm.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
2019-09-19 09:17:25 -04:00
|
|
|
['features', 'rawPrediction', 'newPrediction']
|
2016-04-15 15:58:38 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
    @keyword_only
    def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                 rawPredictionCol="rawPrediction", classifier=None, weightCol=None, parallelism=1):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 rawPredictionCol="rawPrediction", classifier=None, weightCol=None, parallelism=1):
        """
        super(OneVsRest, self).__init__()
        # Default to sequential training of the per-class classifiers
        # (parallelism is the thread-pool bound used by _fit).
        self._setDefault(parallelism=1)
        # _input_kwargs is populated by the @keyword_only decorator with the
        # keyword arguments the caller actually passed; forward them to the
        # Param machinery unchanged.
        kwargs = self._input_kwargs
        self._set(**kwargs)
|
|
|
|
|
|
|
|
    @keyword_only
    @since("2.0.0")
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  rawPredictionCol="rawPrediction", classifier=None, weightCol=None, parallelism=1):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  rawPredictionCol="rawPrediction", classifier=None, weightCol=None, parallelism=1):
        Sets params for OneVsRest.
        """
        # Only the keyword arguments the caller explicitly supplied are set;
        # everything else keeps its current / default value.
        kwargs = self._input_kwargs
        return self._set(**kwargs)
|
|
|
|
|
2019-09-13 13:29:19 -04:00
|
|
|
    @since("2.0.0")
    def setClassifier(self, value):
        """
        Sets the value of :py:attr:`classifier`.
        """
        # The classifier set here is the binary base estimator that _fit
        # trains once per class (retrieved via getClassifier()).
        return self._set(classifier=value)
|
|
|
|
|
2019-10-27 23:36:10 -04:00
|
|
|
    def setLabelCol(self, value):
        """
        Sets the value of :py:attr:`labelCol`.
        """
        # Column read by _fit as the multiclass label; it is rewritten into a
        # binary 0/1 label per class during one-vs-rest training.
        return self._set(labelCol=value)
|
|
|
|
|
|
|
|
    def setFeaturesCol(self, value):
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        # Propagated by _fit to each per-class classifier via its paramMap.
        return self._set(featuresCol=value)
|
|
|
|
|
|
|
|
    def setPredictionCol(self, value):
        """
        Sets the value of :py:attr:`predictionCol`.
        """
        # Propagated by _fit to each per-class classifier via its paramMap.
        return self._set(predictionCol=value)
|
|
|
|
|
|
|
|
    def setRawPredictionCol(self, value):
        """
        Sets the value of :py:attr:`rawPredictionCol`.
        """
        return self._set(rawPredictionCol=value)
|
|
|
|
|
|
|
|
    def setWeightCol(self, value):
        """
        Sets the value of :py:attr:`weightCol`.
        """
        # Only honored when the nested classifier supports HasWeightCol;
        # otherwise _fit emits a warning and ignores it.
        return self._set(weightCol=value)
|
|
|
|
|
|
|
|
    def setParallelism(self, value):
        """
        Sets the value of :py:attr:`parallelism`.
        """
        # Upper bound on the thread pool size used by _fit to train the
        # per-class classifiers concurrently.
        return self._set(parallelism=value)
|
|
|
|
|
2016-04-15 15:58:38 -04:00
|
|
|
def _fit(self, dataset):
|
|
|
|
labelCol = self.getLabelCol()
|
|
|
|
featuresCol = self.getFeaturesCol()
|
|
|
|
predictionCol = self.getPredictionCol()
|
|
|
|
classifier = self.getClassifier()
|
|
|
|
|
|
|
|
numClasses = int(dataset.agg({labelCol: "max"}).head()["max("+labelCol+")"]) + 1
|
|
|
|
|
2017-07-27 22:10:35 -04:00
|
|
|
weightCol = None
|
|
|
|
if (self.isDefined(self.weightCol) and self.getWeightCol()):
|
|
|
|
if isinstance(classifier, HasWeightCol):
|
|
|
|
weightCol = self.getWeightCol()
|
|
|
|
else:
|
|
|
|
warnings.warn("weightCol is ignored, "
|
|
|
|
"as it is not supported by {} now.".format(classifier))
|
|
|
|
|
|
|
|
if weightCol:
|
|
|
|
multiclassLabeled = dataset.select(labelCol, featuresCol, weightCol)
|
|
|
|
else:
|
|
|
|
multiclassLabeled = dataset.select(labelCol, featuresCol)
|
2016-04-15 15:58:38 -04:00
|
|
|
|
|
|
|
# persist if underlying dataset is not persistent.
|
2017-09-14 02:09:44 -04:00
|
|
|
handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
|
2016-04-15 15:58:38 -04:00
|
|
|
if handlePersistence:
|
|
|
|
multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK)
|
|
|
|
|
|
|
|
def trainSingleClass(index):
|
|
|
|
binaryLabelCol = "mc2b$" + str(index)
|
|
|
|
trainingDataset = multiclassLabeled.withColumn(
|
|
|
|
binaryLabelCol,
|
|
|
|
when(multiclassLabeled[labelCol] == float(index), 1.0).otherwise(0.0))
|
|
|
|
paramMap = dict([(classifier.labelCol, binaryLabelCol),
|
|
|
|
(classifier.featuresCol, featuresCol),
|
|
|
|
(classifier.predictionCol, predictionCol)])
|
2017-07-27 22:10:35 -04:00
|
|
|
if weightCol:
|
|
|
|
paramMap[classifier.weightCol] = weightCol
|
2016-04-15 15:58:38 -04:00
|
|
|
return classifier.fit(trainingDataset, paramMap)
|
|
|
|
|
2017-09-12 13:02:27 -04:00
|
|
|
pool = ThreadPool(processes=min(self.getParallelism(), numClasses))
|
|
|
|
|
|
|
|
models = pool.map(trainSingleClass, range(numClasses))
|
2016-04-15 15:58:38 -04:00
|
|
|
|
|
|
|
if handlePersistence:
|
|
|
|
multiclassLabeled.unpersist()
|
|
|
|
|
|
|
|
return self._copyValues(OneVsRestModel(models=models))
|
|
|
|
|
|
|
|
    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This creates a deep copy of the embedded paramMap,
        and copies the embedded and extra parameters over.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        extra : dict, optional
            Extra parameters to copy to the new instance

        Returns
        -------
        :py:class:`OneVsRest`
            Copy of this instance
        """
        if extra is None:
            extra = dict()
        newOvr = Params.copy(self, extra)
        # Copy the nested classifier Estimator explicitly so the clone
        # carries its own copy rather than sharing the original object.
        if self.isSet(self.classifier):
            newOvr.setClassifier(self.getClassifier().copy(extra))
        return newOvr
|
|
|
|
|
2016-04-18 14:52:29 -04:00
|
|
|
    @classmethod
    def _from_java(cls, java_stage):
        """
        Given a Java OneVsRest, create and return a Python wrapper of it.
        Used for ML persistence.

        Parameters
        ----------
        java_stage : py4j.java_gateway.JavaObject
            The Java OneVsRest instance to mirror on the Python side.

        Returns
        -------
        :py:class:`OneVsRest`
        """
        featuresCol = java_stage.getFeaturesCol()
        labelCol = java_stage.getLabelCol()
        predictionCol = java_stage.getPredictionCol()
        rawPredictionCol = java_stage.getRawPredictionCol()
        classifier = JavaParams._from_java(java_stage.getClassifier())
        parallelism = java_stage.getParallelism()
        py_stage = cls(featuresCol=featuresCol, labelCol=labelCol, predictionCol=predictionCol,
                       rawPredictionCol=rawPredictionCol, classifier=classifier,
                       parallelism=parallelism)
        # weightCol is optional: only copy it over when it is defined on the
        # Java side, so an unset Param stays unset (SPARK-30504).
        if java_stage.isDefined(java_stage.getParam("weightCol")):
            py_stage.setWeightCol(java_stage.getWeightCol())
        # Keep the Python uid in sync with the Java stage for persistence.
        py_stage._resetUid(java_stage.uid())
        return py_stage
|
|
|
|
|
|
|
|
    def _to_java(self):
        """
        Transfer this instance to a Java OneVsRest. Used for ML persistence.

        Returns
        -------
        py4j.java_gateway.JavaObject
            Java object equivalent to this instance.
        """
        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRest",
                                             self.uid)
        _java_obj.setClassifier(self.getClassifier()._to_java())
        _java_obj.setParallelism(self.getParallelism())
        _java_obj.setFeaturesCol(self.getFeaturesCol())
        _java_obj.setLabelCol(self.getLabelCol())
        _java_obj.setPredictionCol(self.getPredictionCol())
        # Only propagate weightCol when it is defined and non-empty,
        # mirroring the check performed in _fit (SPARK-30504).
        if (self.isDefined(self.weightCol) and self.getWeightCol()):
            _java_obj.setWeightCol(self.getWeightCol())
        _java_obj.setRawPredictionCol(self.getRawPredictionCol())
        return _java_obj
|
2016-04-15 15:58:38 -04:00
|
|
|
|
2020-12-03 19:35:50 -05:00
|
|
|
@classmethod
def read(cls):
    """Return an MLReader instance for this class."""
    return OneVsRestReader(cls)
|
|
|
|
|
|
|
|
def write(self):
    """Return an MLWriter instance for this ML instance."""
    # A JVM-backed classifier can be persisted through the Java writer;
    # anything else must go through the pure-Python writer.
    classifier_is_java = isinstance(self.getClassifier(), JavaMLWritable)
    return JavaMLWriter(self) if classifier_is_java else OneVsRestWriter(self)
|
|
|
|
|
|
|
|
|
|
|
|
class _OneVsRestSharedReadWrite:
    """Persistence helpers shared by the OneVsRest / OneVsRestModel readers and writers."""

    @staticmethod
    def saveImpl(instance, sc, path, extraMetadata=None):
        """Save metadata (with the classifier param skipped) and the classifier stage itself."""
        params_to_skip = ['classifier']
        json_params = DefaultParamsWriter.extractJsonParams(instance, params_to_skip)
        DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap=json_params,
                                         extraMetadata=extraMetadata)
        instance.getClassifier().save(os.path.join(path, 'classifier'))

    @staticmethod
    def loadClassifier(path, sc):
        """Load the classifier sub-stage persisted under ``path``."""
        return DefaultParamsReader.loadParamsInstance(os.path.join(path, 'classifier'), sc)

    @staticmethod
    def validateParams(instance):
        """Raise ValueError when the instance holds any sub-stage that is not writable."""
        stages = [instance.getClassifier()]
        if isinstance(instance, OneVsRestModel):
            stages.extend(instance.models)

        for stage in stages:
            if not isinstance(stage, MLWritable):
                raise ValueError(f'OneVsRest write will fail because it contains {stage.uid} '
                                 f'which is not writable.')
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class OneVsRestReader(MLReader):
    """Reader that restores a OneVsRest estimator, whether Java- or Python-backed."""

    def __init__(self, cls):
        super(OneVsRestReader, self).__init__()
        self.cls = cls

    def load(self, path):
        """Load a OneVsRest instance from ``path``."""
        metadata = DefaultParamsReader.loadMetadata(path, self.sc)
        # Instances saved from the JVM side are delegated to the Java reader.
        if not DefaultParamsReader.isPythonParamsInstance(metadata):
            return JavaMLReader(self.cls).load(path)
        classifier = _OneVsRestSharedReadWrite.loadClassifier(path, self.sc)
        ova = OneVsRest(classifier=classifier)._resetUid(metadata['uid'])
        DefaultParamsReader.getAndSetParams(ova, metadata, skipParams=['classifier'])
        return ova
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class OneVsRestWriter(MLWriter):
    """Writer for a pure-Python OneVsRest estimator."""

    def __init__(self, instance):
        super(OneVsRestWriter, self).__init__()
        self.instance = instance

    def saveImpl(self, path):
        """Validate all sub-stages are writable, then persist metadata and classifier."""
        _OneVsRestSharedReadWrite.validateParams(self.instance)
        _OneVsRestSharedReadWrite.saveImpl(self.instance, self.sc, path)
|
2017-07-17 13:07:32 -04:00
|
|
|
|
2020-12-03 19:35:50 -05:00
|
|
|
|
|
|
|
class OneVsRestModel(Model, _OneVsRestParams, MLReadable, MLWritable):
    """
    Model fitted by OneVsRest.
    This stores the models resulting from training k binary classifiers: one for each class.
    Each example is scored against all k models, and the model with the highest score
    is picked to label the example.

    .. versionadded:: 2.0.0
    """

    def setFeaturesCol(self, value):
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        return self._set(featuresCol=value)

    def setPredictionCol(self, value):
        """
        Sets the value of :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)

    def setRawPredictionCol(self, value):
        """
        Sets the value of :py:attr:`rawPredictionCol`.
        """
        return self._set(rawPredictionCol=value)

    def __init__(self, models):
        super(OneVsRestModel, self).__init__()
        self.models = models
        # Pure-Python sub-models have no JVM counterpart, so skip building a Java instance.
        if not isinstance(models[0], JavaMLWritable):
            return
        # set java instance
        java_models = [model._to_java() for model in self.models]
        sc = SparkContext._active_spark_context
        java_models_array = JavaWrapper._new_java_array(java_models,
                                                        sc._gateway.jvm.org.apache.spark.ml
                                                        .classification.ClassificationModel)
        # TODO: need to set metadata
        metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata")
        self._java_obj = \
            JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRestModel",
                                     self.uid, metadata.empty(), java_models_array)

    def _transform(self, dataset):
        # determine the input columns: these need to be passed through
        origCols = dataset.columns

        # add an accumulator column to store predictions of all the models
        accColName = "mbc$acc" + str(uuid.uuid4())
        initUDF = udf(lambda _: [], ArrayType(DoubleType()))
        newDataset = dataset.withColumn(accColName, initUDF(dataset[origCols[0]]))

        # persist if underlying dataset is not persistent.
        handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
        if handlePersistence:
            newDataset.persist(StorageLevel.MEMORY_AND_DISK)

        # update the accumulator column with the result of prediction of models
        aggregatedDataset = newDataset
        # rawPredictionCol and the pass-through column list are loop-invariant;
        # hoist them out of the per-model loop.
        rawPredictionCol = self.getRawPredictionCol()
        columns = origCols + [rawPredictionCol, accColName]
        for model in self.models:
            # add temporary column to store intermediate scores and update
            tmpColName = "mbc$tmp" + str(uuid.uuid4())
            # append this model's positive-class score to the accumulator array
            updateUDF = udf(
                lambda predictions, prediction: predictions + [prediction.tolist()[1]],
                ArrayType(DoubleType()))
            transformedDataset = model.transform(aggregatedDataset).select(*columns)
            updatedDataset = transformedDataset.withColumn(
                tmpColName,
                updateUDF(transformedDataset[accColName], transformedDataset[rawPredictionCol]))
            newColumns = origCols + [tmpColName]

            # switch out the intermediate column with the accumulator column
            aggregatedDataset = updatedDataset\
                .select(*newColumns).withColumnRenamed(tmpColName, accColName)

        if handlePersistence:
            newDataset.unpersist()

        if rawPredictionCol:
            # assemble the accumulated per-class scores into a single raw-prediction vector
            rawPredictionUDF = udf(lambda predictions: Vectors.dense(list(predictions)),
                                   VectorUDT())
            aggregatedDataset = aggregatedDataset.withColumn(
                rawPredictionCol, rawPredictionUDF(aggregatedDataset[accColName]))

        if self.getPredictionCol():
            # output the index of the classifier with highest confidence as prediction
            labelUDF = udf(lambda predictions: float(max(enumerate(predictions),
                                                         key=operator.itemgetter(1))[0]),
                           DoubleType())
            aggregatedDataset = aggregatedDataset.withColumn(
                self.getPredictionCol(), labelUDF(aggregatedDataset[accColName]))
        return aggregatedDataset.drop(accColName)

    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This creates a deep copy of the embedded paramMap,
        and copies the embedded and extra parameters over.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        extra : dict, optional
            Extra parameters to copy to the new instance

        Returns
        -------
        :py:class:`OneVsRestModel`
            Copy of this instance
        """
        if extra is None:
            extra = dict()
        newModel = Params.copy(self, extra)
        newModel.models = [model.copy(extra) for model in self.models]
        return newModel

    @classmethod
    def _from_java(cls, java_stage):
        """
        Given a Java OneVsRestModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        featuresCol = java_stage.getFeaturesCol()
        labelCol = java_stage.getLabelCol()
        predictionCol = java_stage.getPredictionCol()
        classifier = JavaParams._from_java(java_stage.getClassifier())
        models = [JavaParams._from_java(model) for model in java_stage.models()]
        py_stage = cls(models=models).setPredictionCol(predictionCol)\
            .setFeaturesCol(featuresCol)
        py_stage._set(labelCol=labelCol)
        # weightCol is optional; only copy it over when the Java side has it set.
        if java_stage.isDefined(java_stage.getParam("weightCol")):
            py_stage._set(weightCol=java_stage.getWeightCol())
        py_stage._set(classifier=classifier)
        py_stage._resetUid(java_stage.uid())
        return py_stage

    def _to_java(self):
        """
        Transfer this instance to a Java OneVsRestModel. Used for ML persistence.

        Returns
        -------
        py4j.java_gateway.JavaObject
            Java object equivalent to this instance.
        """
        sc = SparkContext._active_spark_context
        java_models = [model._to_java() for model in self.models]
        java_models_array = JavaWrapper._new_java_array(
            java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel)
        metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata")
        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRestModel",
                                             self.uid, metadata.empty(), java_models_array)
        _java_obj.set("classifier", self.getClassifier()._to_java())
        _java_obj.set("featuresCol", self.getFeaturesCol())
        _java_obj.set("labelCol", self.getLabelCol())
        _java_obj.set("predictionCol", self.getPredictionCol())
        # weightCol is optional: only propagate it when explicitly set and non-empty.
        if self.isDefined(self.weightCol) and self.getWeightCol():
            _java_obj.set("weightCol", self.getWeightCol())
        return _java_obj

    @classmethod
    def read(cls):
        """Return an MLReader instance for this class."""
        return OneVsRestModelReader(cls)

    def write(self):
        """Return an MLWriter instance for this ML instance."""
        # The Java writer only works when the classifier and every sub-model
        # are all JVM-backed; otherwise fall back to the pure-Python writer.
        writable_by_java = all(
            isinstance(elem, JavaMLWritable)
            for elem in [self.getClassifier()] + self.models)
        if writable_by_java:
            return JavaMLWriter(self)
        else:
            return OneVsRestModelWriter(self)
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class OneVsRestModelReader(MLReader):
    """Reader that restores a OneVsRestModel, whether Java- or Python-backed."""

    def __init__(self, cls):
        super(OneVsRestModelReader, self).__init__()
        self.cls = cls

    def load(self, path):
        """Load a OneVsRestModel instance from ``path``."""
        metadata = DefaultParamsReader.loadMetadata(path, self.sc)
        # Instances saved from the JVM side are delegated to the Java reader.
        if not DefaultParamsReader.isPythonParamsInstance(metadata):
            return JavaMLReader(self.cls).load(path)
        classifier = _OneVsRestSharedReadWrite.loadClassifier(path, self.sc)
        numClasses = metadata['numClasses']
        # one persisted binary sub-model per class, stored under model_<idx>
        subModels = [
            DefaultParamsReader.loadParamsInstance(os.path.join(path, f'model_{idx}'), self.sc)
            for idx in range(numClasses)]
        ovaModel = OneVsRestModel(subModels)._resetUid(metadata['uid'])
        ovaModel.set(ovaModel.classifier, classifier)
        DefaultParamsReader.getAndSetParams(ovaModel, metadata, skipParams=['classifier'])
        return ovaModel
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class OneVsRestModelWriter(MLWriter):
    """Writer for a pure-Python OneVsRestModel."""

    def __init__(self, instance):
        super(OneVsRestModelWriter, self).__init__()
        self.instance = instance

    def saveImpl(self, path):
        """Persist metadata, the classifier, and each binary sub-model."""
        _OneVsRestSharedReadWrite.validateParams(self.instance)
        ova_model = self.instance
        # record the class count so the reader knows how many sub-models to load
        extra = {'numClasses': len(ova_model.models)}
        _OneVsRestSharedReadWrite.saveImpl(ova_model, self.sc, path, extraMetadata=extra)
        for idx, sub_model in enumerate(ova_model.models):
            sub_model.save(os.path.join(path, f'model_{idx}'))
|
|
|
|
|
2016-04-15 15:58:38 -04:00
|
|
|
|
2019-12-26 12:39:53 -05:00
|
|
|
@inherit_doc
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extensions points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class FMClassifier(_JavaProbabilisticClassifier, _FactorizationMachinesParams, JavaMLWritable,
|
2019-12-30 23:56:19 -05:00
|
|
|
JavaMLReadable):
|
2019-12-26 12:39:53 -05:00
|
|
|
"""
|
|
|
|
Factorization Machines learning algorithm for classification.
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
Solver supports:
|
2019-12-26 12:39:53 -05:00
|
|
|
|
|
|
|
* gd (normal mini-batch gradient descent)
|
|
|
|
* adamW (default)
|
|
|
|
|
2020-11-09 19:33:48 -05:00
|
|
|
.. versionadded:: 3.0.0
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
2019-12-26 12:39:53 -05:00
|
|
|
>>> from pyspark.ml.linalg import Vectors
|
|
|
|
>>> from pyspark.ml.classification import FMClassifier
|
|
|
|
>>> df = spark.createDataFrame([
|
|
|
|
... (1.0, Vectors.dense(1.0)),
|
|
|
|
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
|
2019-12-30 23:56:19 -05:00
|
|
|
>>> fm = FMClassifier(factorSize=2)
|
|
|
|
>>> fm.setSeed(11)
|
|
|
|
FMClassifier...
|
2019-12-26 12:39:53 -05:00
|
|
|
>>> model = fm.fit(df)
|
2019-12-30 23:56:19 -05:00
|
|
|
>>> model.getMaxIter()
|
|
|
|
100
|
2019-12-26 12:39:53 -05:00
|
|
|
>>> test0 = spark.createDataFrame([
|
|
|
|
... (Vectors.dense(-1.0),),
|
|
|
|
... (Vectors.dense(0.5),),
|
|
|
|
... (Vectors.dense(1.0),),
|
|
|
|
... (Vectors.dense(2.0),)], ["features"])
|
2020-01-03 12:42:56 -05:00
|
|
|
>>> model.predictRaw(test0.head().features)
|
|
|
|
DenseVector([22.13..., -22.13...])
|
|
|
|
>>> model.predictProbability(test0.head().features)
|
|
|
|
DenseVector([1.0, 0.0])
|
2019-12-26 12:39:53 -05:00
|
|
|
>>> model.transform(test0).select("features", "probability").show(10, False)
|
|
|
|
+--------+------------------------------------------+
|
|
|
|
|features|probability |
|
|
|
|
+--------+------------------------------------------+
|
|
|
|
|[-1.0] |[0.9999999997574736,2.425264676902229E-10]|
|
|
|
|
|[0.5] |[0.47627851732981163,0.5237214826701884] |
|
|
|
|
|[1.0] |[5.491554426243495E-4,0.9994508445573757] |
|
|
|
|
|[2.0] |[2.005766663870645E-10,0.9999999997994233]|
|
|
|
|
+--------+------------------------------------------+
|
|
|
|
...
|
|
|
|
>>> model.intercept
|
|
|
|
-7.316665276826291
|
|
|
|
>>> model.linear
|
|
|
|
DenseVector([14.8232])
|
|
|
|
>>> model.factors
|
|
|
|
DenseMatrix(1, 2, [0.0163, -0.0051], 1)
|
2020-08-03 11:50:34 -04:00
|
|
|
>>> model_path = temp_path + "/fm_model"
|
|
|
|
>>> model.save(model_path)
|
|
|
|
>>> model2 = FMClassificationModel.load(model_path)
|
|
|
|
>>> model2.intercept
|
|
|
|
-7.316665276826291
|
|
|
|
>>> model2.linear
|
|
|
|
DenseVector([14.8232])
|
|
|
|
>>> model2.factors
|
|
|
|
DenseMatrix(1, 2, [0.0163, -0.0051], 1)
|
|
|
|
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
|
|
|
|
True
|
2019-12-26 12:39:53 -05:00
|
|
|
"""
|
|
|
|
|
|
|
|
@keyword_only
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completely, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
|
2019-12-26 12:39:53 -05:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction",
|
|
|
|
factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0,
|
|
|
|
miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0,
|
|
|
|
tol=1e-6, solver="adamW", thresholds=None, seed=None):
|
|
|
|
"""
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completely, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
__init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
|
2019-12-26 12:39:53 -05:00
|
|
|
probabilityCol="probability", rawPredictionCol="rawPrediction", \
|
|
|
|
factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0, \
|
|
|
|
miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, \
|
|
|
|
tol=1e-6, solver="adamW", thresholds=None, seed=None)
|
|
|
|
"""
|
|
|
|
super(FMClassifier, self).__init__()
|
|
|
|
self._java_obj = self._new_java_obj(
|
|
|
|
"org.apache.spark.ml.classification.FMClassifier", self.uid)
|
|
|
|
kwargs = self._input_kwargs
|
|
|
|
self.setParams(**kwargs)
|
|
|
|
|
|
|
|
@keyword_only
|
|
|
|
@since("3.0.0")
|
[SPARK-32933][PYTHON] Use keyword-only syntax for keyword_only methods
### What changes were proposed in this pull request?
This PR adjusts signatures of methods decorated with `keyword_only` to indicate using [Python 3 keyword-only syntax](https://www.python.org/dev/peps/pep-3102/).
__Note__:
For the moment the goal is not to replace `keyword_only`. For justification see https://github.com/apache/spark/pull/29591#discussion_r489402579
### Why are the changes needed?
Right now it is not clear that `keyword_only` methods are indeed keyword only. This proposal addresses that.
In practice we could probably capture `locals` and drop `keyword_only` completely, i.e:
```python
keyword_only
def __init__(self, *, featuresCol="features"):
...
kwargs = self._input_kwargs
self.setParams(**kwargs)
```
could be replaced with
```python
def __init__(self, *, featuresCol="features"):
kwargs = locals()
del kwargs["self"]
...
self.setParams(**kwargs)
```
### Does this PR introduce _any_ user-facing change?
Docstrings and inspect tools will now indicate that `keyword_only` methods expect only keyword arguments.
For example with ` LinearSVC` will change from
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__(
self,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
)
Docstring: __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2):
File: /path/to/python/pyspark/ml/classification.py
Type: function
```
to
```
>>> from pyspark.ml.classification import LinearSVC
>>> ?LinearSVC.__init__
Signature:
LinearSVC.__init__ (
self,
*,
featuresCol='features',
labelCol='label',
predictionCol='prediction',
maxIter=100,
regParam=0.0,
tol=1e-06,
rawPredictionCol='rawPrediction',
fitIntercept=True,
standardization=True,
threshold=0.0,
weightCol=None,
aggregationDepth=2,
blockSize=1,
)
Docstring: __init__(self, \*, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, blockSize=1):
File: ~/Workspace/spark/python/pyspark/ml/classification.py
Type: function
```
### How was this patch tested?
Existing tests.
Closes #29799 from zero323/SPARK-32933.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2020-09-22 20:28:33 -04:00
|
|
|
    def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  probabilityCol="probability", rawPredictionCol="rawPrediction",
                  factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0,
                  miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0,
                  tol=1e-6, solver="adamW", thresholds=None, seed=None):
        """
        setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  probabilityCol="probability", rawPredictionCol="rawPrediction", \
                  factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0, \
                  miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, \
                  tol=1e-6, solver="adamW", thresholds=None, seed=None)
        Sets Params for FMClassifier.
        """
        # _input_kwargs is populated by the @keyword_only decorator and holds
        # only the keyword arguments the caller actually supplied, so params
        # the caller omitted keep their current/default values instead of
        # being reset by this call.
        kwargs = self._input_kwargs
        # Params._set validates and stores the values; it returns self so the
        # call can be chained.
        return self._set(**kwargs)
|
|
|
|
|
|
|
|
    def _create_model(self, java_model):
        """
        Wrap the fitted JVM model handle in the Python-side
        :py:class:`FMClassificationModel`.

        Called by the JavaEstimator fit machinery after the Scala-side
        training finishes; `java_model` is the py4j proxy to the JVM model.
        """
        return FMClassificationModel(java_model)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setFactorSize(self, value):
        """
        Sets the value of :py:attr:`factorSize`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(factorSize=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setFitLinear(self, value):
        """
        Sets the value of :py:attr:`fitLinear`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(fitLinear=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setMiniBatchFraction(self, value):
        """
        Sets the value of :py:attr:`miniBatchFraction`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(miniBatchFraction=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setInitStd(self, value):
        """
        Sets the value of :py:attr:`initStd`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(initStd=value)
|
|
|
|
|
2019-12-30 23:56:19 -05:00
|
|
|
    @since("3.0.0")
    def setMaxIter(self, value):
        """
        Sets the value of :py:attr:`maxIter`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(maxIter=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setStepSize(self, value):
        """
        Sets the value of :py:attr:`stepSize`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(stepSize=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setTol(self, value):
        """
        Sets the value of :py:attr:`tol`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(tol=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setSolver(self, value):
        """
        Sets the value of :py:attr:`solver`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(solver=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setSeed(self, value):
        """
        Sets the value of :py:attr:`seed`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(seed=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setFitIntercept(self, value):
        """
        Sets the value of :py:attr:`fitIntercept`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(fitIntercept=value)
|
|
|
|
|
|
|
|
    @since("3.0.0")
    def setRegParam(self, value):
        """
        Sets the value of :py:attr:`regParam`.
        """
        # Delegates to Params._set, which returns self for chaining.
        return self._set(regParam=value)
|
|
|
|
|
2019-12-26 12:39:53 -05:00
|
|
|
|
[SPARK-29212][ML][PYSPARK] Add common classes without using JVM backend
### What changes were proposed in this pull request?
Implement common base ML classes (`Predictor`, `PredictionModel`, `Classifier`, `ClassificationModel`, `ProbabilisticClassifier`, `ProbabilisticClassificationModel`, `Regressor`, `RegressionModel`) for non-Java backends.
Note
- `Predictor` and `JavaClassifier` should be abstract as `_fit` method is not implemented.
- `PredictionModel` should be abstract as `_transform` is not implemented.
### Why are the changes needed?
To provide extension points for non-JVM algorithms, as well as a public (as opposed to `Java*` variants, which are commonly described in docstrings as private) hierarchy which can be used to distinguish between different classes of predictors.
For longer discussion see [SPARK-29212](https://issues.apache.org/jira/browse/SPARK-29212) and / or https://github.com/apache/spark/pull/25776.
### Does this PR introduce any user-facing change?
It adds new base classes as listed above, but effective interfaces (method resolution order notwithstanding) stay the same.
Additionally "private" `Java*` classes in`ml.regression` and `ml.classification` have been renamed to follow PEP-8 conventions (added leading underscore).
It is for discussion if the same should be done to equivalent classes from `ml.wrapper`.
If we take `JavaClassifier` as an example, type hierarchy will change from
![old pyspark ml classification JavaClassifier](https://user-images.githubusercontent.com/1554276/72657093-5c0b0c80-39a0-11ea-9069-a897d75de483.png)
to
![new pyspark ml classification _JavaClassifier](https://user-images.githubusercontent.com/1554276/72657098-64fbde00-39a0-11ea-8f80-01187a5ea5a6.png)
Similarly the old model
![old pyspark ml classification JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657103-7513bd80-39a0-11ea-9ffc-59eb6ab61fde.png)
will become
![new pyspark ml classification _JavaClassificationModel](https://user-images.githubusercontent.com/1554276/72657110-80ff7f80-39a0-11ea-9f5c-fe408664e827.png)
### How was this patch tested?
Existing unit tests.
Closes #27245 from zero323/SPARK-29212.
Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
2020-03-03 23:20:02 -05:00
|
|
|
class FMClassificationModel(_JavaProbabilisticClassificationModel, _FactorizationMachinesParams,
                            JavaMLWritable, JavaMLReadable, HasTrainingSummary):
    """
    Model fitted by :class:`FMClassifier`.

    .. versionadded:: 3.0.0
    """

    @property
    @since("3.0.0")
    def intercept(self):
        """
        Model intercept.
        """
        return self._call_java("intercept")

    @property
    @since("3.0.0")
    def linear(self):
        """
        Model linear term.
        """
        return self._call_java("linear")

    @property
    @since("3.0.0")
    def factors(self):
        """
        Model factor term.
        """
        return self._call_java("factors")

    # NOTE(review): @property restored here so that `summary` matches the
    # access pattern of the sibling accessors above and of the
    # HasTrainingSummary mixin (`model.summary`, not `model.summary()`).
    @property
    @since("3.1.0")
    def summary(self):
        """
        Gets summary (accuracy/precision/recall, objective history, total iterations) of model
        trained on the training set. An exception is thrown if `trainingSummary is None`.
        """
        if self.hasSummary:
            # Wrap the JVM-backed summary from the parent class in the
            # FM-specific training-summary type.
            return FMClassificationTrainingSummary(super(FMClassificationModel, self).summary)
        else:
            raise RuntimeError("No training summary available for this %s" %
                               self.__class__.__name__)

    def evaluate(self, dataset):
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_fm_summary = self._call_java("evaluate", dataset)
        return FMClassificationSummary(java_fm_summary)
|
|
|
|
|
|
|
|
|
|
|
|
class FMClassificationSummary(_BinaryClassificationSummary):
    """
    Abstraction for FMClassifier Results for a given model.

    .. versionadded:: 3.1.0
    """
    # All metrics come from _BinaryClassificationSummary; this subclass
    # exists only to give FM evaluation results a distinct public type.
    pass
|
|
|
|
|
|
|
|
|
|
|
|
@inherit_doc
class FMClassificationTrainingSummary(FMClassificationSummary, _TrainingSummary):
    """
    Abstraction for FMClassifier Training results.

    .. versionadded:: 3.1.0
    """
    # Combines the evaluation metrics of FMClassificationSummary with the
    # training-time attributes (e.g. objective history) of _TrainingSummary;
    # no members of its own.
    pass
|
|
|
|
|
2019-12-26 12:39:53 -05:00
|
|
|
|
2015-01-28 20:14:23 -05:00
|
|
|
if __name__ == "__main__":
    import doctest
    import tempfile
    from shutil import rmtree

    import pyspark.ml.classification
    from pyspark.sql import SparkSession

    # Run this module's doctests against a fresh copy of its namespace so the
    # examples can use `sc`, `spark` and `temp_path` as globals.
    globs = pyspark.ml.classification.__dict__.copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    spark = (SparkSession.builder
             .master("local[2]")
             .appName("ml.classification tests")
             .getOrCreate())
    globs['sc'] = spark.sparkContext
    globs['spark'] = spark

    temp_path = tempfile.mkdtemp()
    globs['temp_path'] = temp_path
    try:
        failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        spark.stop()
    finally:
        # Best-effort cleanup of the scratch directory used by the doctests.
        try:
            rmtree(temp_path)
        except OSError:
            pass
    if failure_count:
        sys.exit(-1)
|