# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import inspect import sys import array as pyarray import unittest import numpy as np from pyspark import keyword_only from pyspark.ml.classification import LogisticRegression from pyspark.ml.clustering import KMeans from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexToString, \ MaxAbsScaler, VectorSlicer, Word2Vec from pyspark.ml.linalg import DenseVector, SparseVector, Vectors from pyspark.ml.param import Param, Params, TypeConverters from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed from pyspark.ml.wrapper import JavaParams from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase if sys.version > '3': xrange = range class ParamTypeConversionTests(PySparkTestCase): """ Test that param type conversion happens. """ def test_int(self): lr = LogisticRegression(maxIter=5.0) self.assertEqual(lr.getMaxIter(), 5) self.assertTrue(type(lr.getMaxIter()) == int) self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt")) self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1)) def test_float(self): lr = LogisticRegression(tol=1) self.assertEqual(lr.getTol(), 1.0) self.assertTrue(type(lr.getTol()) == float) self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat")) def test_vector(self): ewp = ElementwiseProduct(scalingVec=[1, 3]) self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0])) ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4])) self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4])) self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"])) def test_list(self): l = [0, 1] for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l), pyarray.array('l', l), xrange(2), tuple(l)]: converted = TypeConverters.toList(lst_like) self.assertEqual(type(converted), list) self.assertListEqual(converted, l) def test_list_int(self): for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]), SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0), pyarray.array('d', [1.0, 2.0])]: vs = VectorSlicer(indices=indices) self.assertListEqual(vs.getIndices(), [1, 2]) self.assertTrue(all([type(v) == int for v in vs.getIndices()])) self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"])) def test_list_float(self): b = Bucketizer(splits=[1, 4]) self.assertEqual(b.getSplits(), [1.0, 4.0]) self.assertTrue(all([type(v) == float for v in b.getSplits()])) self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0])) def test_list_list_float(self): b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]]) self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]]) self.assertTrue(all([type(v) == list for v in b.getSplitsArray()])) self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[0]])) self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[1]])) self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0])) self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]])) def test_list_string(self): for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]: idx_to_string = IndexToString(labels=labels) self.assertListEqual(idx_to_string.getLabels(), ['a', 'b']) self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2])) def test_string(self): lr = LogisticRegression() for col in ['features', u'features', np.str_('features')]: lr.setFeaturesCol(col) self.assertEqual(lr.getFeaturesCol(), 'features') self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3)) def test_bool(self): self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1)) self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false")) class TestParams(HasMaxIter, HasInputCol, HasSeed): """ A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed. """ @keyword_only def __init__(self, seed=None): super(TestParams, self).__init__() self._setDefault(maxIter=10) kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only def setParams(self, seed=None): """ setParams(self, seed=None) Sets params for this test. """ kwargs = self._input_kwargs return self._set(**kwargs) class OtherTestParams(HasMaxIter, HasInputCol, HasSeed): """ A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed. """ @keyword_only def __init__(self, seed=None): super(OtherTestParams, self).__init__() self._setDefault(maxIter=10) kwargs = self._input_kwargs self.setParams(**kwargs) @keyword_only def setParams(self, seed=None): """ setParams(self, seed=None) Sets params for this test. """ kwargs = self._input_kwargs return self._set(**kwargs) class HasThrowableProperty(Params): def __init__(self): super(HasThrowableProperty, self).__init__() self.p = Param(self, "none", "empty param") @property def test_property(self): raise RuntimeError("Test property to raise error when invoked") class ParamTests(SparkSessionTestCase): def test_copy_new_parent(self): testParams = TestParams() # Copying an instantiated param should fail with self.assertRaises(ValueError): testParams.maxIter._copy_new_parent(testParams) # Copying a dummy param should succeed TestParams.maxIter._copy_new_parent(testParams) maxIter = testParams.maxIter self.assertEqual(maxIter.name, "maxIter") self.assertEqual(maxIter.doc, "max number of iterations (>= 0).") self.assertTrue(maxIter.parent == testParams.uid) def test_param(self): testParams = TestParams() maxIter = testParams.maxIter self.assertEqual(maxIter.name, "maxIter") self.assertEqual(maxIter.doc, "max number of iterations (>= 0).") self.assertTrue(maxIter.parent == testParams.uid) def test_hasparam(self): testParams = TestParams() self.assertTrue(all([testParams.hasParam(p.name) for p in testParams.params])) self.assertFalse(testParams.hasParam("notAParameter")) self.assertTrue(testParams.hasParam(u"maxIter")) def test_resolveparam(self): testParams = TestParams() self.assertEqual(testParams._resolveParam(testParams.maxIter), testParams.maxIter) self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter) self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter) if sys.version_info[0] >= 3: # In Python 3, it is allowed to get/set attributes with non-ascii characters. e_cls = AttributeError else: e_cls = UnicodeEncodeError self.assertRaises(e_cls, lambda: testParams._resolveParam(u"아")) def test_params(self): testParams = TestParams() maxIter = testParams.maxIter inputCol = testParams.inputCol seed = testParams.seed params = testParams.params self.assertEqual(params, [inputCol, maxIter, seed]) self.assertTrue(testParams.hasParam(maxIter.name)) self.assertTrue(testParams.hasDefault(maxIter)) self.assertFalse(testParams.isSet(maxIter)) self.assertTrue(testParams.isDefined(maxIter)) self.assertEqual(testParams.getMaxIter(), 10) self.assertTrue(testParams.hasParam(inputCol.name)) self.assertFalse(testParams.hasDefault(inputCol)) self.assertFalse(testParams.isSet(inputCol)) self.assertFalse(testParams.isDefined(inputCol)) with self.assertRaises(KeyError): testParams.getInputCol() otherParam = Param(Params._dummy(), "otherParam", "Parameter used to test that " + "set raises an error for a non-member parameter.", typeConverter=TypeConverters.toString) with self.assertRaises(ValueError): testParams.set(otherParam, "value") # Since the default is normally random, set it to a known number for debug str testParams._setDefault(seed=41) self.assertEqual( testParams.explainParams(), "\n".join(["inputCol: input column name. (undefined)", "maxIter: max number of iterations (>= 0). (default: 10)", "seed: random seed. (default: 41)"])) def test_clear_param(self): df = self.spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"]) maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled") model = maScaler.fit(df) self.assertTrue(model.isSet(model.outputCol)) self.assertEqual(model.getOutputCol(), "scaled") model.clear(model.outputCol) self.assertFalse(model.isSet(model.outputCol)) self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler') output = model.transform(df) self.assertEqual(model.getOutputCol(), output.schema.names[1]) def test_kmeans_param(self): algo = KMeans() self.assertEqual(algo.getInitMode(), "k-means||") algo.setK(10) self.assertEqual(algo.getK(), 10) algo.setInitSteps(10) self.assertEqual(algo.getInitSteps(), 10) self.assertEqual(algo.getDistanceMeasure(), "euclidean") algo.setDistanceMeasure("cosine") self.assertEqual(algo.getDistanceMeasure(), "cosine") def test_hasseed(self): noSeedSpecd = TestParams() withSeedSpecd = TestParams(seed=42) other = OtherTestParams() # Check that we no longer use 42 as the magic number self.assertNotEqual(noSeedSpecd.getSeed(), 42) origSeed = noSeedSpecd.getSeed() # Check that we only compute the seed once self.assertEqual(noSeedSpecd.getSeed(), origSeed) # Check that a specified seed is honored self.assertEqual(withSeedSpecd.getSeed(), 42) # Check that a different class has a different seed self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed()) def test_param_property_error(self): param_store = HasThrowableProperty() self.assertRaises(RuntimeError, lambda: param_store.test_property) params = param_store.params # should not invoke the property 'test_property' self.assertEqual(len(params), 1) def test_word2vec_param(self): model = Word2Vec().setWindowSize(6) # Check windowSize is set properly self.assertEqual(model.getWindowSize(), 6) def test_copy_param_extras(self): tp = TestParams(seed=42) extra = {tp.getParam(TestParams.inputCol.name): "copy_input"} tp_copy = tp.copy(extra=extra) self.assertEqual(tp.uid, tp_copy.uid) self.assertEqual(tp.params, tp_copy.params) for k, v in extra.items(): self.assertTrue(tp_copy.isDefined(k)) self.assertEqual(tp_copy.getOrDefault(k), v) copied_no_extra = {} for k, v in tp_copy._paramMap.items(): if k not in extra: copied_no_extra[k] = v self.assertEqual(tp._paramMap, copied_no_extra) self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap) with self.assertRaises(TypeError): tp.copy(extra={"unknown_parameter": None}) with self.assertRaises(TypeError): tp.copy(extra=["must be a dict"]) def test_logistic_regression_check_thresholds(self): self.assertIsInstance( LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), LogisticRegression ) self.assertRaisesRegexp( ValueError, "Logistic Regression getThreshold found inconsistent.*$", LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] ) def test_preserve_set_state(self): dataset = self.spark.createDataFrame([(0.5,)], ["data"]) binarizer = Binarizer(inputCol="data") self.assertFalse(binarizer.isSet("threshold")) binarizer.transform(dataset) binarizer._transfer_params_from_java() self.assertFalse(binarizer.isSet("threshold"), "Params not explicitly set should remain unset after transform") def test_default_params_transferred(self): dataset = self.spark.createDataFrame([(0.5,)], ["data"]) binarizer = Binarizer(inputCol="data") # intentionally change the pyspark default, but don't set it binarizer._defaultParamMap[binarizer.outputCol] = "my_default" result = binarizer.transform(dataset).select("my_default").collect() self.assertFalse(binarizer.isSet(binarizer.outputCol)) self.assertEqual(result[0][0], 1.0) class DefaultValuesTests(PySparkTestCase): """ Test :py:class:`JavaParams` classes to see if their default Param values match those in their Scala counterparts. """ def test_java_params(self): import pyspark.ml.feature import pyspark.ml.classification import pyspark.ml.clustering import pyspark.ml.evaluation import pyspark.ml.pipeline import pyspark.ml.recommendation import pyspark.ml.regression modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation, pyspark.ml.regression] for module in modules: for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and not name.endswith('Params') \ and issubclass(cls, JavaParams) and not inspect.isabstract(cls) \ and not name.startswith('Java') and name != '_LSH': # NOTE: disable check_params_exist until there is parity with Scala API check_params(self, cls(), check_params_exist=False) # Additional classes that need explicit construction from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'), check_params_exist=False) check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'), check_params_exist=False) if __name__ == "__main__": from pyspark.ml.tests.test_param import * try: import xmlrunner testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) except ImportError: testRunner = None unittest.main(testRunner=testRunner, verbosity=2)