e804ed5e33
modify Param._copyValues to check valid Param objects supplied as extra ### What changes were proposed in this pull request? Estimator.fit() and Model.transform() accept a dictionary of extra parameters whose values are used to overwrite those supplied at initialization or by default. Additionally, the ParamGridBuilder.addGrid accepts a parameter and list of values. The keys are presumed to be valid Param objects. This change adds a check that only Param objects are supplied as keys. ### Why are the changes needed? Param objects are created by and bound to an instance of Params (Estimator, Model, or Transformer). They may be obtained from their parent as attributes, or by name through getParam. The documentation does not state that keys must be valid Param objects, nor describe how one may be obtained. The current behavior is to silently ignore keys which are not valid Param objects. ### Does this PR introduce any user-facing change? If the user does not pass in a Param object as required for keys in `extra` for Estimator.fit() and Model.transform(), and `param` for ParamGridBuilder.addGrid, an error will be raised indicating it is an invalid object. ### How was this patch tested? Added method test_copy_param_extras_check to test_param.py. Tested with Python 3.7 Closes #26527 from JohnHBauer/paramExtra. Authored-by: John Bauer <john.h.bauer@gmail.com> Signed-off-by: Bryan Cutler <cutlerb@gmail.com>
389 lines
16 KiB
Python
389 lines
16 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import inspect
|
|
import sys
|
|
import array as pyarray
|
|
import unittest
|
|
|
|
import numpy as np
|
|
|
|
from pyspark import keyword_only
|
|
from pyspark.ml.classification import LogisticRegression
|
|
from pyspark.ml.clustering import KMeans
|
|
from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexToString, \
|
|
MaxAbsScaler, VectorSlicer, Word2Vec
|
|
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
|
|
from pyspark.ml.param import Param, Params, TypeConverters
|
|
from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed
|
|
from pyspark.ml.wrapper import JavaParams
|
|
from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase
|
|
|
|
|
|
# Python 3 has no ``xrange``; alias it to ``range`` so the tests below can use one name on
# both major versions.  Compare the structured ``sys.version_info`` rather than the
# free-form ``sys.version`` string, matching the version check used later in this module.
if sys.version_info[0] >= 3:
    xrange = range
|
|
|
|
|
|
class ParamTypeConversionTests(PySparkTestCase):
    """
    Test that param type conversion happens.
    """

    def test_int(self):
        # A whole-valued float is accepted and stored as an exact ``int``.
        lr = LogisticRegression(maxIter=5.0)
        self.assertEqual(lr.getMaxIter(), 5)
        self.assertIs(type(lr.getMaxIter()), int)
        # Strings and fractional floats must be rejected.
        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))

    def test_float(self):
        # An int is accepted and stored as an exact ``float``.
        lr = LogisticRegression(tol=1)
        self.assertEqual(lr.getTol(), 1.0)
        self.assertIs(type(lr.getTol()), float)
        self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))

    def test_vector(self):
        # Both a plain list and a numpy array are converted to a DenseVector.
        ewp = ElementwiseProduct(scalingVec=[1, 3])
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
        ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
        self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"]))

    def test_list(self):
        expected = [0, 1]
        list_likes = [expected, np.array(expected), DenseVector(expected),
                      SparseVector(len(expected), range(len(expected)), expected),
                      pyarray.array('l', expected), xrange(2), tuple(expected)]
        for lst_like in list_likes:
            converted = TypeConverters.toList(lst_like)
            # The result must be a real ``list``, not merely something list-like.
            self.assertIs(type(converted), list)
            self.assertListEqual(converted, expected)

    def test_list_int(self):
        for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                        SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
                        pyarray.array('d', [1.0, 2.0])]:
            vs = VectorSlicer(indices=indices)
            self.assertListEqual(vs.getIndices(), [1, 2])
            self.assertTrue(all(type(v) is int for v in vs.getIndices()))
        self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))

    def test_list_float(self):
        b = Bucketizer(splits=[1, 4])
        self.assertEqual(b.getSplits(), [1.0, 4.0])
        self.assertTrue(all(type(v) is float for v in b.getSplits()))
        self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))

    def test_list_list_float(self):
        b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
        self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
        # Outer elements are lists, inner elements are floats.
        self.assertTrue(all(type(v) is list for v in b.getSplitsArray()))
        self.assertTrue(all(type(v) is float for v in b.getSplitsArray()[0]))
        self.assertTrue(all(type(v) is float for v in b.getSplitsArray()[1]))
        self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
        self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))

    def test_list_string(self):
        for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
            idx_to_string = IndexToString(labels=labels)
            self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
        self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))

    def test_string(self):
        lr = LogisticRegression()
        for col in ['features', u'features', np.str_('features')]:
            lr.setFeaturesCol(col)
            self.assertEqual(lr.getFeaturesCol(), 'features')
        self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))

    def test_bool(self):
        # Neither a truthy int nor a string spelling of a boolean is accepted.
        self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
        self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))
|
|
|
|
|
|
class TestParams(HasMaxIter, HasInputCol, HasSeed):
    """
    Params subclass combining the HasMaxIter, HasInputCol and HasSeed mixins.
    """

    @keyword_only
    def __init__(self, seed=None):
        super(TestParams, self).__init__()
        self._setDefault(maxIter=10)
        # Forward the keyword arguments captured by @keyword_only.
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, seed=None):
        """
        setParams(self, seed=None)
        Set the params of this test instance from the given keyword arguments.
        """
        return self._set(**self._input_kwargs)
|
|
|
|
|
|
class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
    """
    A second Params subclass with the HasMaxIter, HasInputCol and HasSeed mixins.

    It exists as a distinct class so that test_hasseed can compare its seed with the
    seed of a TestParams instance.
    """

    @keyword_only
    def __init__(self, seed=None):
        super(OtherTestParams, self).__init__()
        self._setDefault(maxIter=10)
        # Forward the keyword arguments captured by @keyword_only.
        self.setParams(**self._input_kwargs)

    @keyword_only
    def setParams(self, seed=None):
        """
        setParams(self, seed=None)
        Set the params of this test instance from the given keyword arguments.
        """
        return self._set(**self._input_kwargs)
|
|
|
|
|
|
class HasThrowableProperty(Params):
    """
    Params subclass holding one Param plus a property that raises whenever it is read.

    test_param_property_error uses it to check that listing ``params`` does not
    evaluate the property.
    """

    def __init__(self):
        super(HasThrowableProperty, self).__init__()
        # The single Param owned by this instance.
        self.p = Param(self, "none", "empty param")

    @property
    def test_property(self):
        # Reading this property always fails; that is the behaviour under test.
        raise RuntimeError("Test property to raise error when invoked")
|
|
|
|
|
|
class ParamTests(SparkSessionTestCase):
    """
    Tests for Param and Params: lookup, defaults, clearing, copying and set-state handling.
    """

    def test_copy_new_parent(self):
        testParams = TestParams()
        # Copying an instantiated param should fail
        with self.assertRaises(ValueError):
            testParams.maxIter._copy_new_parent(testParams)
        # Copying a dummy param should succeed
        TestParams.maxIter._copy_new_parent(testParams)
        maxIter = testParams.maxIter
        self.assertEqual(maxIter.name, "maxIter")
        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
        self.assertEqual(maxIter.parent, testParams.uid)

    def test_param(self):
        testParams = TestParams()
        maxIter = testParams.maxIter
        self.assertEqual(maxIter.name, "maxIter")
        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
        self.assertEqual(maxIter.parent, testParams.uid)

    def test_hasparam(self):
        testParams = TestParams()
        self.assertTrue(all(testParams.hasParam(p.name) for p in testParams.params))
        self.assertFalse(testParams.hasParam("notAParameter"))
        self.assertTrue(testParams.hasParam(u"maxIter"))

    def test_resolveparam(self):
        testParams = TestParams()
        # A Param object, a str name and a unicode name must all resolve to the same Param.
        self.assertEqual(testParams._resolveParam(testParams.maxIter), testParams.maxIter)
        self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter)

        self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter)
        if sys.version_info[0] >= 3:
            # In Python 3, it is allowed to get/set attributes with non-ascii characters.
            e_cls = AttributeError
        else:
            e_cls = UnicodeEncodeError
        self.assertRaises(e_cls, lambda: testParams._resolveParam(u"아"))

    def test_params(self):
        testParams = TestParams()
        maxIter = testParams.maxIter
        inputCol = testParams.inputCol
        seed = testParams.seed

        params = testParams.params
        self.assertEqual(params, [inputCol, maxIter, seed])

        # maxIter has a default but was never explicitly set.
        self.assertTrue(testParams.hasParam(maxIter.name))
        self.assertTrue(testParams.hasDefault(maxIter))
        self.assertFalse(testParams.isSet(maxIter))
        self.assertTrue(testParams.isDefined(maxIter))
        self.assertEqual(testParams.getMaxIter(), 10)

        # inputCol has neither a default nor a value, so reading it fails.
        self.assertTrue(testParams.hasParam(inputCol.name))
        self.assertFalse(testParams.hasDefault(inputCol))
        self.assertFalse(testParams.isSet(inputCol))
        self.assertFalse(testParams.isDefined(inputCol))
        with self.assertRaises(KeyError):
            testParams.getInputCol()

        otherParam = Param(Params._dummy(), "otherParam", "Parameter used to test that " +
                           "set raises an error for a non-member parameter.",
                           typeConverter=TypeConverters.toString)
        with self.assertRaises(ValueError):
            testParams.set(otherParam, "value")

        # Since the default is normally random, set it to a known number for debug str
        testParams._setDefault(seed=41)

        self.assertEqual(
            testParams.explainParams(),
            "\n".join(["inputCol: input column name. (undefined)",
                       "maxIter: max number of iterations (>= 0). (default: 10)",
                       "seed: random seed. (default: 41)"]))

    def test_clear_param(self):
        df = self.spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
        maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
        model = maScaler.fit(df)
        self.assertTrue(model.isSet(model.outputCol))
        self.assertEqual(model.getOutputCol(), "scaled")
        model.clear(model.outputCol)
        self.assertFalse(model.isSet(model.outputCol))
        # Once cleared, the output column name starts with 'MaxAbsScaler'.
        self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')
        output = model.transform(df)
        self.assertEqual(model.getOutputCol(), output.schema.names[1])

    def test_kmeans_param(self):
        algo = KMeans()
        self.assertEqual(algo.getInitMode(), "k-means||")
        algo.setK(10)
        self.assertEqual(algo.getK(), 10)
        algo.setInitSteps(10)
        self.assertEqual(algo.getInitSteps(), 10)
        self.assertEqual(algo.getDistanceMeasure(), "euclidean")
        algo.setDistanceMeasure("cosine")
        self.assertEqual(algo.getDistanceMeasure(), "cosine")

    def test_hasseed(self):
        noSeedSpecd = TestParams()
        withSeedSpecd = TestParams(seed=42)
        other = OtherTestParams()
        # Check that we no longer use 42 as the magic number
        self.assertNotEqual(noSeedSpecd.getSeed(), 42)
        origSeed = noSeedSpecd.getSeed()
        # Check that we only compute the seed once
        self.assertEqual(noSeedSpecd.getSeed(), origSeed)
        # Check that a specified seed is honored
        self.assertEqual(withSeedSpecd.getSeed(), 42)
        # Check that a different class has a different seed
        self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed())

    def test_param_property_error(self):
        param_store = HasThrowableProperty()
        self.assertRaises(RuntimeError, lambda: param_store.test_property)
        params = param_store.params  # should not invoke the property 'test_property'
        self.assertEqual(len(params), 1)

    def test_word2vec_param(self):
        model = Word2Vec().setWindowSize(6)
        # Check windowSize is set properly
        self.assertEqual(model.getWindowSize(), 6)

    def test_copy_param_extras(self):
        tp = TestParams(seed=42)
        extra = {tp.getParam(TestParams.inputCol.name): "copy_input"}
        tp_copy = tp.copy(extra=extra)
        self.assertEqual(tp.uid, tp_copy.uid)
        self.assertEqual(tp.params, tp_copy.params)
        for k, v in extra.items():
            self.assertTrue(tp_copy.isDefined(k))
            self.assertEqual(tp_copy.getOrDefault(k), v)
        # Apart from the extras, the copy must carry exactly the original's set params.
        copied_no_extra = {k: v for k, v in tp_copy._paramMap.items() if k not in extra}
        self.assertEqual(tp._paramMap, copied_no_extra)
        self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap)
        # Keys of ``extra`` must be Param objects, and ``extra`` itself must be a dict.
        with self.assertRaises(TypeError):
            tp.copy(extra={"unknown_parameter": None})
        with self.assertRaises(TypeError):
            tp.copy(extra=["must be a dict"])

    def test_logistic_regression_check_thresholds(self):
        self.assertIsInstance(
            LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]),
            LogisticRegression
        )

        # ``assertRaisesRegexp`` is the only spelling on Python 2.7 but is a deprecated
        # alias that newer Python 3 releases no longer provide, so prefer
        # ``assertRaisesRegex`` whenever the running unittest has it.
        assertRaisesRegex = getattr(self, "assertRaisesRegex", None) or self.assertRaisesRegexp
        assertRaisesRegex(
            ValueError,
            "Logistic Regression getThreshold found inconsistent.*$",
            LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
        )

    def test_preserve_set_state(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        self.assertFalse(binarizer.isSet("threshold"))
        binarizer.transform(dataset)
        binarizer._transfer_params_from_java()
        self.assertFalse(binarizer.isSet("threshold"),
                         "Params not explicitly set should remain unset after transform")

    def test_default_params_transferred(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        # intentionally change the pyspark default, but don't set it
        binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
        result = binarizer.transform(dataset).select("my_default").collect()
        self.assertFalse(binarizer.isSet(binarizer.outputCol))
        self.assertEqual(result[0][0], 1.0)
|
|
|
|
|
|
class DefaultValuesTests(PySparkTestCase):
    """
    Check that the default Param values of :py:class:`JavaParams` classes agree with
    those of their Scala counterparts.
    """

    def test_java_params(self):
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
                   pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
                   pyspark.ml.regression]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                # Skip classes excluded by name: models, param mixins, Java* bases, _LSH.
                if name.endswith(('Model', 'Params')) or name.startswith('Java') \
                        or name == '_LSH':
                    continue
                # Only concrete JavaParams subclasses are instantiated and checked.
                if not issubclass(cls, JavaParams) or inspect.isabstract(cls):
                    continue
                # NOTE: disable check_params_exist until there is parity with Scala API
                check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=False)
        check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=False)
|
|
|
|
|
|
if __name__ == "__main__":
    from pyspark.ml.tests.test_param import *

    # Use the XML-reporting runner when xmlrunner is installed; passing None makes
    # unittest.main fall back to its default runner.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
|