2018-11-18 03:02:15 -05:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
|
|
# this work for additional information regarding copyright ownership.
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
# (the "License"); you may not use this file except in compliance with
|
|
|
|
# the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
|
|
|
|
import inspect
|
|
|
|
import sys
|
|
|
|
import array as pyarray
|
2018-11-18 20:22:32 -05:00
|
|
|
import unittest
|
2018-11-18 03:02:15 -05:00
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
from pyspark import keyword_only
|
|
|
|
from pyspark.ml.classification import LogisticRegression
|
|
|
|
from pyspark.ml.clustering import KMeans
|
|
|
|
from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexToString, \
|
2019-10-17 20:02:31 -04:00
|
|
|
MaxAbsScaler, VectorSlicer, Word2Vec
|
|
|
|
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
|
2018-11-18 03:02:15 -05:00
|
|
|
from pyspark.ml.param import Param, Params, TypeConverters
|
|
|
|
from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed
|
|
|
|
from pyspark.ml.wrapper import JavaParams
|
|
|
|
from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase
|
|
|
|
|
|
|
|
|
|
|
|
class ParamTypeConversionTests(PySparkTestCase):
    """
    Test that param type conversion happens.

    Each test hands an ML param a value of a convertible-but-different type and
    checks both the value and the exact type that is read back, then checks that
    an unconvertible value raises ``TypeError``.
    """

    def test_int(self):
        """An integral float is read back as ``int``; strings and 5.1 are rejected."""
        lr = LogisticRegression(maxIter=5.0)
        self.assertEqual(lr.getMaxIter(), 5)
        # assertIs on the type reports the actual type when the check fails.
        self.assertIs(type(lr.getMaxIter()), int)
        with self.assertRaises(TypeError):
            LogisticRegression(maxIter="notAnInt")
        with self.assertRaises(TypeError):
            LogisticRegression(maxIter=5.1)

    def test_float(self):
        """An int is read back as ``float``; a string is rejected."""
        lr = LogisticRegression(tol=1)
        self.assertEqual(lr.getTol(), 1.0)
        self.assertIs(type(lr.getTol()), float)
        with self.assertRaises(TypeError):
            LogisticRegression(tol="notAFloat")

    def test_vector(self):
        """Lists and numpy arrays are read back equal to a ``DenseVector``."""
        ewp = ElementwiseProduct(scalingVec=[1, 3])
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
        ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
        with self.assertRaises(TypeError):
            ElementwiseProduct(scalingVec=["a", "b"])

    def test_list(self):
        """``TypeConverters.toList`` turns every list-like input into a plain ``list``."""
        expected = [0, 1]
        list_likes = [
            expected,
            np.array(expected),
            DenseVector(expected),
            SparseVector(len(expected), range(len(expected)), expected),
            pyarray.array('l', expected),
            range(2),
            tuple(expected),
        ]
        for lst_like in list_likes:
            converted = TypeConverters.toList(lst_like)
            self.assertIs(type(converted), list)
            self.assertListEqual(converted, expected)

    def test_list_int(self):
        """Float-valued list-likes are read back as a list of ``int``."""
        candidates = [
            [1.0, 2.0],
            np.array([1.0, 2.0]),
            DenseVector([1.0, 2.0]),
            SparseVector(2, {0: 1.0, 1: 2.0}),
            range(1, 3),
            (1.0, 2.0),
            pyarray.array('d', [1.0, 2.0]),
        ]
        for indices in candidates:
            vs = VectorSlicer(indices=indices)
            self.assertListEqual(vs.getIndices(), [1, 2])
            for v in vs.getIndices():
                self.assertIs(type(v), int)
        with self.assertRaises(TypeError):
            VectorSlicer(indices=["a", "b"])

    def test_list_float(self):
        """A list of ints is read back as a list of ``float``."""
        b = Bucketizer(splits=[1, 4])
        self.assertEqual(b.getSplits(), [1.0, 4.0])
        for v in b.getSplits():
            self.assertIs(type(v), float)
        with self.assertRaises(TypeError):
            Bucketizer(splits=["a", 1.0])

    def test_list_list_float(self):
        """A nested list of numbers is read back as lists of ``float``."""
        b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
        self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
        for inner in b.getSplitsArray():
            self.assertIs(type(inner), list)
        for v in b.getSplitsArray()[0]:
            self.assertIs(type(v), float)
        for v in b.getSplitsArray()[1]:
            self.assertIs(type(v), float)
        with self.assertRaises(TypeError):
            Bucketizer(splitsArray=["a", 1.0])
        with self.assertRaises(TypeError):
            Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]])

    def test_list_string(self):
        """String arrays and lists are read back as a list of strings."""
        for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
            idx_to_string = IndexToString(labels=labels)
            self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
        with self.assertRaises(TypeError):
            IndexToString(labels=['a', 2])

    def test_string(self):
        """``str`` and ``np.str_`` values are accepted; a float is rejected."""
        lr = LogisticRegression()
        for col in ['features', u'features', np.str_('features')]:
            lr.setFeaturesCol(col)
            self.assertEqual(lr.getFeaturesCol(), 'features')
        with self.assertRaises(TypeError):
            LogisticRegression(featuresCol=2.3)

    def test_bool(self):
        """Non-bool values (an int, a string) are rejected for a boolean param."""
        with self.assertRaises(TypeError):
            LogisticRegression(fitIntercept=1)
        with self.assertRaises(TypeError):
            LogisticRegression(fitIntercept="false")
|
|
|
|
|
|
|
|
|
|
|
|
class TestParams(HasMaxIter, HasInputCol, HasSeed):
    """
    Small Params class for the tests below, built from the HasMaxIter,
    HasInputCol and HasSeed mixins, with ``maxIter`` defaulting to 10.
    """
    @keyword_only
    def __init__(self, seed=None):
        super().__init__()
        self._setDefault(maxIter=10)
        # _input_kwargs is read right after the @keyword_only wrapper ran;
        # presumably it holds the keyword arguments passed by the caller.
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, seed=None):
        """
        setParams(self, seed=None)
        Set the params of this test object from the given keyword arguments.
        """
        return self._set(**self._input_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
    """
    A second Params class with the same mixins (HasMaxIter, HasInputCol,
    HasSeed) as ``TestParams``, with ``maxIter`` defaulting to 10.
    """
    @keyword_only
    def __init__(self, seed=None):
        super().__init__()
        self._setDefault(maxIter=10)
        # _input_kwargs is read right after the @keyword_only wrapper ran;
        # presumably it holds the keyword arguments passed by the caller.
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, seed=None):
        """
        setParams(self, seed=None)
        Set the params of this test object from the given keyword arguments.
        """
        return self._set(**self._input_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
class HasThrowableProperty(Params):
    """Params subclass with one Param and a property that raises whenever it is read."""

    def __init__(self):
        super().__init__()
        self.p = Param(self, "none", "empty param")

    @property
    def test_property(self):
        """Always raise ``RuntimeError``; reading this property is itself the failure."""
        raise RuntimeError("Test property to raise error when invoked")
|
|
|
|
|
|
|
|
|
|
|
|
class ParamTests(SparkSessionTestCase):
    """
    Tests for the ``Param`` / ``Params`` API, using the small test classes
    defined in this module and a few real ML stages.
    """

    def test_copy_new_parent(self):
        """A param bound to an instance cannot be re-parented; a class-level one can."""
        testParams = TestParams()
        # Copying an instantiated param should fail
        with self.assertRaises(ValueError):
            testParams.maxIter._copy_new_parent(testParams)
        # Copying a dummy param should succeed
        TestParams.maxIter._copy_new_parent(testParams)
        maxIter = testParams.maxIter
        self.assertEqual(maxIter.name, "maxIter")
        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
        self.assertEqual(maxIter.parent, testParams.uid)

    def test_param(self):
        """An instance param exposes its name, doc and the owning instance's uid."""
        testParams = TestParams()
        maxIter = testParams.maxIter
        self.assertEqual(maxIter.name, "maxIter")
        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
        self.assertEqual(maxIter.parent, testParams.uid)

    def test_hasparam(self):
        """``hasParam`` is true for every listed param name and false for an unknown one."""
        testParams = TestParams()
        # Checked one by one so a failure points at the offending param.
        for p in testParams.params:
            self.assertTrue(testParams.hasParam(p.name))
        self.assertFalse(testParams.hasParam("notAParameter"))
        self.assertTrue(testParams.hasParam(u"maxIter"))

    def test_resolveparam(self):
        """``_resolveParam`` accepts a Param or its name; an unknown name raises."""
        testParams = TestParams()
        self.assertEqual(testParams._resolveParam(testParams.maxIter), testParams.maxIter)
        self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter)

        self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter)
        with self.assertRaises(AttributeError):
            testParams._resolveParam(u"아")

    def test_params(self):
        """Check param listing, default/set/defined state and ``explainParams`` output."""
        testParams = TestParams()
        maxIter = testParams.maxIter
        inputCol = testParams.inputCol
        seed = testParams.seed

        params = testParams.params
        self.assertEqual(params, [inputCol, maxIter, seed])

        # maxIter has a default (10) but was never explicitly set.
        self.assertTrue(testParams.hasParam(maxIter.name))
        self.assertTrue(testParams.hasDefault(maxIter))
        self.assertFalse(testParams.isSet(maxIter))
        self.assertTrue(testParams.isDefined(maxIter))
        self.assertEqual(testParams.getMaxIter(), 10)

        # inputCol has neither a default nor a value, so reading it fails.
        self.assertTrue(testParams.hasParam(inputCol.name))
        self.assertFalse(testParams.hasDefault(inputCol))
        self.assertFalse(testParams.isSet(inputCol))
        self.assertFalse(testParams.isDefined(inputCol))
        with self.assertRaises(KeyError):
            testParams.getInputCol()

        otherParam = Param(Params._dummy(), "otherParam",
                           "Parameter used to test that "
                           "set raises an error for a non-member parameter.",
                           typeConverter=TypeConverters.toString)
        with self.assertRaises(ValueError):
            testParams.set(otherParam, "value")

        # Since the default is normally random, set it to a known number for debug str
        testParams._setDefault(seed=41)

        self.assertEqual(
            testParams.explainParams(),
            "\n".join(["inputCol: input column name. (undefined)",
                       "maxIter: max number of iterations (>= 0). (default: 10)",
                       "seed: random seed. (default: 41)"]))

    def test_clear_param(self):
        """Clearing ``outputCol`` on a fitted model unsets it; the column name then
        starts with 'MaxAbsScaler' and is what ``transform`` produces."""
        df = self.spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
        maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
        model = maScaler.fit(df)
        self.assertTrue(model.isSet(model.outputCol))
        self.assertEqual(model.getOutputCol(), "scaled")
        model.clear(model.outputCol)
        self.assertFalse(model.isSet(model.outputCol))
        self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')
        output = model.transform(df)
        self.assertEqual(model.getOutputCol(), output.schema.names[1])

    def test_kmeans_param(self):
        """KMeans getters reflect the expected initial values and later setter calls."""
        algo = KMeans()
        self.assertEqual(algo.getInitMode(), "k-means||")
        algo.setK(10)
        self.assertEqual(algo.getK(), 10)
        algo.setInitSteps(10)
        self.assertEqual(algo.getInitSteps(), 10)
        self.assertEqual(algo.getDistanceMeasure(), "euclidean")
        algo.setDistanceMeasure("cosine")
        self.assertEqual(algo.getDistanceMeasure(), "cosine")

    def test_hasseed(self):
        """The seed is stable per instance, honors an explicit value and differs by class."""
        noSeedSpecd = TestParams()
        withSeedSpecd = TestParams(seed=42)
        other = OtherTestParams()
        # Check that we no longer use 42 as the magic number
        self.assertNotEqual(noSeedSpecd.getSeed(), 42)
        origSeed = noSeedSpecd.getSeed()
        # Check that we only compute the seed once
        self.assertEqual(noSeedSpecd.getSeed(), origSeed)
        # Check that a specified seed is honored
        self.assertEqual(withSeedSpecd.getSeed(), 42)
        # Check that a different class has a different seed
        self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed())

    def test_param_property_error(self):
        """Listing ``params`` must not evaluate properties that raise."""
        param_store = HasThrowableProperty()
        with self.assertRaises(RuntimeError):
            param_store.test_property
        params = param_store.params  # should not invoke the property 'test_property'
        self.assertEqual(len(params), 1)

    def test_word2vec_param(self):
        """``setWindowSize`` is reflected by ``getWindowSize``."""
        model = Word2Vec().setWindowSize(6)
        # Check windowSize is set properly
        self.assertEqual(model.getWindowSize(), 6)

    def test_copy_param_extras(self):
        """``copy(extra=...)`` keeps uid, params and defaults, applies the extras and
        rejects malformed ``extra`` arguments with ``TypeError``."""
        tp = TestParams(seed=42)
        extra = {tp.getParam(TestParams.inputCol.name): "copy_input"}
        tp_copy = tp.copy(extra=extra)
        self.assertEqual(tp.uid, tp_copy.uid)
        self.assertEqual(tp.params, tp_copy.params)
        for k, v in extra.items():
            self.assertTrue(tp_copy.isDefined(k))
            self.assertEqual(tp_copy.getOrDefault(k), v)
        # Everything in the copy's param map other than the extras must match the source.
        copied_no_extra = {k: v for k, v in tp_copy._paramMap.items() if k not in extra}
        self.assertEqual(tp._paramMap, copied_no_extra)
        self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap)
        with self.assertRaises(TypeError):
            tp.copy(extra={"unknown_parameter": None})
        with self.assertRaises(TypeError):
            tp.copy(extra=["must be a dict"])

    def test_logistic_regression_check_thresholds(self):
        """Consistent ``threshold``/``thresholds`` are accepted; inconsistent ones raise."""
        self.assertIsInstance(
            LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]),
            LogisticRegression
        )

        # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
        # which no longer exists on Python 3.12+.
        self.assertRaisesRegex(
            ValueError,
            "Logistic Regression getThreshold found inconsistent.*$",
            LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
        )

    def test_preserve_set_state(self):
        """A param that was never set stays unset after ``transform`` and a
        transfer of params back from the Java side."""
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        self.assertFalse(binarizer.isSet("threshold"))
        binarizer.transform(dataset)
        binarizer._transfer_params_from_java()
        self.assertFalse(binarizer.isSet("threshold"),
                         "Params not explicitly set should remain unset after transform")

    def test_default_params_transferred(self):
        """A default changed only on the Python side is still used by ``transform``."""
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        # intentionally change the pyspark default, but don't set it
        binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
        result = binarizer.transform(dataset).select("my_default").collect()
        self.assertFalse(binarizer.isSet(binarizer.outputCol))
        self.assertEqual(result[0][0], 1.0)
|
|
|
|
|
|
|
|
|
|
|
|
class DefaultValuesTests(PySparkTestCase):
    """
    Test :py:class:`JavaParams` classes to see if their default Param values match
    those in their Scala counterparts.
    """

    def test_java_params(self):
        """Run ``check_params`` on every eligible class of the listed ``pyspark.ml`` modules."""
        import re

        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel

        modules = [
            pyspark.ml.feature,
            pyspark.ml.classification,
            pyspark.ml.clustering,
            pyspark.ml.evaluation,
            pyspark.ml.pipeline,
            pyspark.ml.recommendation,
            pyspark.ml.regression,
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                # Skip classes named *Model / *Params.
                if name.endswith('Model') or name.endswith('Params'):
                    continue
                # Only concrete JavaParams subclasses are checked.
                if not issubclass(cls, JavaParams) or inspect.isabstract(cls):
                    continue
                # Skip names starting with "Java" / "_Java", plus _LSH and _Selector.
                if re.match("_?Java", name) or name == '_LSH' or name == '_Selector':
                    continue
                # NOTE: disable check_params_exist until there is parity with Scala API
                check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        vectorizer_model = CountVectorizerModel.from_vocabulary(['a'], 'input')
        check_params(self, vectorizer_model, check_params_exist=False)
        indexer_model = StringIndexerModel.from_labels(['a', 'b'], 'input')
        check_params(self, indexer_model, check_params_exist=False)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Pull the test classes in by package path (presumably this module's own
    # import path) before handing control to unittest.
    from pyspark.ml.tests.test_param import *

    # Use the XML-report runner when xmlrunner can be imported; otherwise pass
    # None as the runner to unittest.main.
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
|