#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from pyspark import keyword_only, since, SparkContext
from pyspark.ml.base import Estimator, Model, Transformer
from pyspark.ml.param import Param, Params
from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader, \
    DefaultParamsReader, DefaultParamsWriter, MLWriter, MLReader, JavaMLWritable
from pyspark.ml.wrapper import JavaParams, JavaWrapper
from pyspark.ml.common import inherit_doc, _java2py, _py2java


@inherit_doc
class Pipeline(Estimator, MLReadable, MLWritable):
    """
    A simple pipeline, which acts as an estimator. A Pipeline consists
    of a sequence of stages, each of which is either an
    :py:class:`Estimator` or a :py:class:`Transformer`. When
    :py:meth:`Pipeline.fit` is called, the stages are executed in
    order. If a stage is an :py:class:`Estimator`, its
    :py:meth:`Estimator.fit` method will be called on the input
    dataset to fit a model. Then the model, which is a transformer,
    will be used to transform the dataset as the input to the next
    stage. If a stage is a :py:class:`Transformer`, its
    :py:meth:`Transformer.transform` method will be called to produce
    the dataset for the next stage. The fitted model from a
    :py:class:`Pipeline` is a :py:class:`PipelineModel`, which
    consists of fitted models and transformers, corresponding to the
    pipeline stages. If stages is an empty list, the pipeline acts as an
    identity transformer.

    .. versionadded:: 1.3.0
    """

    stages = Param(Params._dummy(), "stages", "a list of pipeline stages")

    @keyword_only
    def __init__(self, *, stages=None):
        """
        __init__(self, \\*, stages=None)
        """
        super(Pipeline, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @since("1.3.0")
    def setStages(self, value):
        """
        Set pipeline stages.

        :param value: a list of transformers or estimators
        :return: the pipeline instance
        """
        return self._set(stages=value)

    @since("1.3.0")
    def getStages(self):
        """
        Get pipeline stages.
        """
        return self.getOrDefault(self.stages)

    @keyword_only
    @since("1.3.0")
    def setParams(self, *, stages=None):
        """
        setParams(self, \\*, stages=None)
        Sets params for Pipeline.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _fit(self, dataset):
        stages = self.getStages()
        for stage in stages:
            if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)):
                raise TypeError(
                    "Cannot recognize a pipeline stage of type %s." % type(stage))
        # The index of the last estimator decides how far transforms must be
        # applied while fitting; stages after it are copied into the model as-is.
        indexOfLastEstimator = -1
        for i, stage in enumerate(stages):
            if isinstance(stage, Estimator):
                indexOfLastEstimator = i
        transformers = []
        for i, stage in enumerate(stages):
            if i <= indexOfLastEstimator:
                if isinstance(stage, Transformer):
                    transformers.append(stage)
                    dataset = stage.transform(dataset)
                else:  # must be an Estimator
                    model = stage.fit(dataset)
                    transformers.append(model)
                    if i < indexOfLastEstimator:
                        dataset = model.transform(dataset)
            else:
                transformers.append(stage)
        return PipelineModel(transformers)

    @since("1.4.0")
    def copy(self, extra=None):
        """
        Creates a copy of this instance.

        :param extra: extra parameters
        :returns: new instance
        """
        if extra is None:
            extra = dict()
        that = Params.copy(self, extra)
        stages = [stage.copy(extra) for stage in that.getStages()]
        return that.setStages(stages)

    @since("2.0.0")
    def write(self):
        """Returns an MLWriter instance for this ML instance."""
        allStagesAreJava = PipelineSharedReadWrite.checkStagesForJava(self.getStages())
        if allStagesAreJava:
            return JavaMLWriter(self)
        return PipelineWriter(self)

    @classmethod
    @since("2.0.0")
    def read(cls):
        """Returns an MLReader instance for this class."""
        return PipelineReader(cls)

    @classmethod
    def _from_java(cls, java_stage):
        """
        Given a Java Pipeline, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        # Create a new instance of this stage.
        py_stage = cls()
        # Load information from java_stage to the instance.
        py_stages = [JavaParams._from_java(s) for s in java_stage.getStages()]
        py_stage.setStages(py_stages)
        py_stage._resetUid(java_stage.uid())
        return py_stage

    def _to_java(self):
        """
        Transfer this instance to a Java Pipeline. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """
        gateway = SparkContext._gateway
        cls = SparkContext._jvm.org.apache.spark.ml.PipelineStage
        java_stages = gateway.new_array(cls, len(self.getStages()))
        for idx, stage in enumerate(self.getStages()):
            java_stages[idx] = stage._to_java()

        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.Pipeline", self.uid)
        _java_obj.setStages(java_stages)

        return _java_obj

    def _make_java_param_pair(self, param, value):
        """
        Makes a Java param pair.
        """
        sc = SparkContext._active_spark_context
        param = self._resolveParam(param)
        java_param = sc._jvm.org.apache.spark.ml.param.Param(param.parent, param.name, param.doc)
        if isinstance(value, Params) and hasattr(value, "_to_java"):
            # Convert a JavaEstimator/JavaTransformer object, or an Estimator/Transformer
            # object that implements the `_to_java` method (such as OneVsRest or Pipeline),
            # to a Java object. Used when an estimator has another estimator as a parameter.
            # This conversion is not in _py2java in common.py because importing Estimator
            # and Model there results in a circular import with inherit_doc.
            java_value = value._to_java()
        else:
            java_value = _py2java(sc, value)
        return java_param.w(java_value)

    def _transfer_param_map_to_java(self, pyParamMap):
        """
        Transforms a Python ParamMap into a Java ParamMap.
        """
        paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
        for param in self.params:
            if param in pyParamMap:
                pair = self._make_java_param_pair(param, pyParamMap[param])
                paramMap.put([pair])
        return paramMap

    def _transfer_param_map_from_java(self, javaParamMap):
        """
        Transforms a Java ParamMap into a Python ParamMap.
        """
        sc = SparkContext._active_spark_context
        paramMap = dict()
        for pair in javaParamMap.toList():
            param = pair.param()
            if self.hasParam(str(param.name())):
                java_obj = pair.value()
                if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(java_obj):
                    # Note: JavaParams._from_java supports both JavaEstimator/JavaTransformer
                    # classes and Estimator/Transformer classes that implement the
                    # `_from_java` static method (such as OneVsRest and Pipeline).
                    py_obj = JavaParams._from_java(java_obj)
                else:
                    py_obj = _java2py(sc, java_obj)
                paramMap[self.getParam(param.name())] = py_obj
        return paramMap
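

# Usage sketch: a minimal, hedged illustration of how Pipeline.fit() chains
# stages; transformers feed their output to the next stage, while estimators
# are fit and replaced by their models in the resulting PipelineModel. The
# helper name `_example_pipeline_fit` is illustrative and not part of the
# library API; it assumes an active SparkSession passed in as `spark`, and the
# data and column names are illustrative only.
def _example_pipeline_fit(spark):
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.feature import HashingTF, Tokenizer

    training = spark.createDataFrame(
        [(0, "a b c d e spark", 1.0), (1, "b d", 0.0)],
        ["id", "text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10)
    # fit() returns a PipelineModel holding [tokenizer, hashingTF, lr_model].
    model = Pipeline(stages=[tokenizer, hashingTF, lr]).fit(training)
    return model.transform(training)
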
@inherit_doc
class PipelineWriter(MLWriter):
    """
    (Private) Specialization of :py:class:`MLWriter` for :py:class:`Pipeline` types
    """

    def __init__(self, instance):
        super(PipelineWriter, self).__init__()
        self.instance = instance

    def saveImpl(self, path):
        stages = self.instance.getStages()
        PipelineSharedReadWrite.validateStages(stages)
        PipelineSharedReadWrite.saveImpl(self.instance, stages, self.sc, path)

@inherit_doc
class PipelineReader(MLReader):
    """
    (Private) Specialization of :py:class:`MLReader` for :py:class:`Pipeline` types
    """

    def __init__(self, cls):
        super(PipelineReader, self).__init__()
        self.cls = cls

    def load(self, path):
        metadata = DefaultParamsReader.loadMetadata(path, self.sc)
        if 'language' not in metadata['paramMap'] or metadata['paramMap']['language'] != 'Python':
            return JavaMLReader(self.cls).load(path)
        else:
            uid, stages = PipelineSharedReadWrite.load(metadata, self.sc, path)
            return Pipeline(stages=stages)._resetUid(uid)
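

# Usage sketch: a hedged illustration of the writer/reader dispatch above. A
# pipeline whose stages are all Java-backed is written through JavaMLWriter,
# while PipelineWriter's metadata carries {'language': 'Python'} so load() can
# tell the two formats apart. The helper name and path are illustrative only;
# an active SparkSession is assumed.
def _example_pipeline_round_trip(path="/tmp/pipeline_demo"):
    from pyspark.ml.feature import Tokenizer

    pipeline = Pipeline(stages=[Tokenizer(inputCol="text", outputCol="words")])
    pipeline.write().overwrite().save(path)  # all-Java stages: JavaMLWriter
    return Pipeline.load(path)  # dispatched through PipelineReader.load
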
@inherit_doc
class PipelineModelWriter(MLWriter):
    """
    (Private) Specialization of :py:class:`MLWriter` for :py:class:`PipelineModel` types
    """

    def __init__(self, instance):
        super(PipelineModelWriter, self).__init__()
        self.instance = instance

    def saveImpl(self, path):
        stages = self.instance.stages
        PipelineSharedReadWrite.validateStages(stages)
        PipelineSharedReadWrite.saveImpl(self.instance, stages, self.sc, path)

@inherit_doc
class PipelineModelReader(MLReader):
    """
    (Private) Specialization of :py:class:`MLReader` for :py:class:`PipelineModel` types
    """

    def __init__(self, cls):
        super(PipelineModelReader, self).__init__()
        self.cls = cls

    def load(self, path):
        metadata = DefaultParamsReader.loadMetadata(path, self.sc)
        if 'language' not in metadata['paramMap'] or metadata['paramMap']['language'] != 'Python':
            return JavaMLReader(self.cls).load(path)
        else:
            uid, stages = PipelineSharedReadWrite.load(metadata, self.sc, path)
            return PipelineModel(stages=stages)._resetUid(uid)

@inherit_doc
class PipelineModel(Model, MLReadable, MLWritable):
    """
    Represents a compiled pipeline with transformers and fitted models.

    .. versionadded:: 1.3.0
    """

    def __init__(self, stages):
        super(PipelineModel, self).__init__()
        self.stages = stages

    def _transform(self, dataset):
        for t in self.stages:
            dataset = t.transform(dataset)
        return dataset

    @since("1.4.0")
    def copy(self, extra=None):
        """
        Creates a copy of this instance.

        :param extra: extra parameters
        :returns: new instance
        """
        if extra is None:
            extra = dict()
        stages = [stage.copy(extra) for stage in self.stages]
        return PipelineModel(stages)

    @since("2.0.0")
    def write(self):
        """Returns an MLWriter instance for this ML instance."""
        allStagesAreJava = PipelineSharedReadWrite.checkStagesForJava(self.stages)
        if allStagesAreJava:
            return JavaMLWriter(self)
        return PipelineModelWriter(self)

    @classmethod
    @since("2.0.0")
    def read(cls):
        """Returns an MLReader instance for this class."""
        return PipelineModelReader(cls)

    @classmethod
    def _from_java(cls, java_stage):
        """
        Given a Java PipelineModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        # Load information from java_stage to the instance.
        py_stages = [JavaParams._from_java(s) for s in java_stage.stages()]
        # Create a new instance of this stage.
        py_stage = cls(py_stages)
        py_stage._resetUid(java_stage.uid())
        return py_stage

    def _to_java(self):
        """
        Transfer this instance to a Java PipelineModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """
        gateway = SparkContext._gateway
        cls = SparkContext._jvm.org.apache.spark.ml.Transformer
        java_stages = gateway.new_array(cls, len(self.stages))
        for idx, stage in enumerate(self.stages):
            java_stages[idx] = stage._to_java()

        _java_obj = \
            JavaParams._new_java_obj("org.apache.spark.ml.PipelineModel", self.uid, java_stages)

        return _java_obj
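

# Usage sketch: a hedged illustration of PipelineModel._transform, where each
# stage's transform() output becomes the next stage's input. A PipelineModel
# can also be assembled directly from transformers, with no fitting involved.
# The helper name is illustrative; `df` is assumed to be a DataFrame with a
# string column named "text".
def _example_pipeline_model(df):
    from pyspark.ml.feature import HashingTF, Tokenizer

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features")
    # Both stages are already transformers, so they are applied in order.
    model = PipelineModel([tokenizer, hashingTF])
    return model.transform(df)
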
@inherit_doc
class PipelineSharedReadWrite:
    """
    Functions for :py:class:`MLReader` and :py:class:`MLWriter` shared between
    :py:class:`Pipeline` and :py:class:`PipelineModel`

    .. versionadded:: 2.3.0
    """

    @staticmethod
    def checkStagesForJava(stages):
        return all(isinstance(stage, JavaMLWritable) for stage in stages)

    @staticmethod
    def validateStages(stages):
        """
        Check that all stages are Writable
        """
        for stage in stages:
            if not isinstance(stage, MLWritable):
                raise ValueError("Pipeline write will fail on this pipeline "
                                 "because stage %s of type %s is not MLWritable"
                                 % (stage.uid, type(stage)))

    @staticmethod
    def saveImpl(instance, stages, sc, path):
        """
        Save metadata and stages for a :py:class:`Pipeline` or :py:class:`PipelineModel`
        - save metadata to path/metadata
        - save stages to stages/IDX_UID
        """
        stageUids = [stage.uid for stage in stages]
        jsonParams = {'stageUids': stageUids, 'language': 'Python'}
        DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap=jsonParams)
        stagesDir = os.path.join(path, "stages")
        for index, stage in enumerate(stages):
            stage.write().save(PipelineSharedReadWrite
                               .getStagePath(stage.uid, index, len(stages), stagesDir))

    @staticmethod
    def load(metadata, sc, path):
        """
        Load metadata and stages for a :py:class:`Pipeline` or :py:class:`PipelineModel`

        :return: (UID, list of stages)
        """
        stagesDir = os.path.join(path, "stages")
        stageUids = metadata['paramMap']['stageUids']
        stages = []
        for index, stageUid in enumerate(stageUids):
            stagePath = \
                PipelineSharedReadWrite.getStagePath(stageUid, index, len(stageUids), stagesDir)
            stage = DefaultParamsReader.loadParamsInstance(stagePath, sc)
            stages.append(stage)
        return (metadata['uid'], stages)

    @staticmethod
    def getStagePath(stageUid, stageIdx, numStages, stagesDir):
        """
        Get path for saving the given stage.
        """
        stageIdxDigits = len(str(numStages))
        stageDir = str(stageIdx).zfill(stageIdxDigits) + "_" + stageUid
        stagePath = os.path.join(stagesDir, stageDir)
        return stagePath
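

# Usage sketch: a hedged illustration of the on-disk layout implied by
# saveImpl and getStagePath; stage directories are zero-padded with the stage
# index so they sort in pipeline order. The helper name and UIDs below are
# illustrative only.
def _example_stage_paths():
    stage_uids = ["Tokenizer_abc", "HashingTF_def", "LogisticRegression_ghi"]
    # Produces (on POSIX): ['stages/0_Tokenizer_abc', 'stages/1_HashingTF_def',
    #                       'stages/2_LogisticRegression_ghi']
    return [
        PipelineSharedReadWrite.getStagePath(uid, idx, len(stage_uids), "stages")
        for idx, uid in enumerate(stage_uids)
    ]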