spark-instrumented-optimizer/python/pyspark/ml/tests/test_param.py
HyukjinKwon 7c05f61514 [SPARK-28130][PYTHON] Print pretty messages for skipped tests when xmlrunner is available in PySpark
## What changes were proposed in this pull request?

Currently, pretty skipped message added by f7435bec6a mechanism seems not working when xmlrunner is installed apparently.

This PR fixes two things:

1. When `xmlrunner` is installed, seems `xmlrunner` does not respect `vervosity` level in unittests (default is level 1).

    So the output looks as below

    ```
    Running tests...
     ----------------------------------------------------------------------
    SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
    ----------------------------------------------------------------------
    ```

    So it is not caught by our message detection mechanism.

2. If we manually set the `vervocity` level to `xmlrunner`, it prints messages as below:

    ```
    test_mixed_udf (pyspark.sql.tests.test_pandas_udf_scalar.ScalarPandasUDFTests) ... SKIP (0.000s)
    test_mixed_udf_and_sql (pyspark.sql.tests.test_pandas_udf_scalar.ScalarPandasUDFTests) ... SKIP (0.000s)
    ...
    ```

    This is different in our Jenkins machine:

    ```
    test_createDataFrame_column_name_encoding (pyspark.sql.tests.test_arrow.ArrowTests) ... skipped 'Pandas >= 0.23.2 must be installed; however, it was not found.'
    test_createDataFrame_does_not_modify_input (pyspark.sql.tests.test_arrow.ArrowTests) ... skipped 'Pandas >= 0.23.2 must be installed; however, it was not found.'
    ...
    ```

    Note that last `SKIP` is different. This PR fixes the regular expression to catch `SKIP` case as well.

## How was this patch tested?

Manually tested.

**Before:**

```
Starting test(python2.7): pyspark....
Finished test(python2.7): pyspark.... (0s)
...
Tests passed in 562 seconds

========================================================================
...
```

**After:**

```
Starting test(python2.7): pyspark....
Finished test(python2.7): pyspark.... (48s) ... 93 tests were skipped
...
Tests passed in 560 seconds

Skipped tests pyspark.... with python2.7:
      pyspark...(...) ... SKIP (0.000s)
...

========================================================================
...
```

Closes #24927 from HyukjinKwon/SPARK-28130.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2019-06-24 09:58:17 +09:00

367 lines
14 KiB
Python

# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import inspect
import sys
import array as pyarray
import unittest
import numpy as np
from pyspark import keyword_only
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexToString, \
VectorSlicer, Word2Vec
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed
from pyspark.ml.wrapper import JavaParams
from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase
if sys.version > '3':
xrange = range
class ParamTypeConversionTests(PySparkTestCase):
"""
Test that param type conversion happens.
"""
def test_int(self):
lr = LogisticRegression(maxIter=5.0)
self.assertEqual(lr.getMaxIter(), 5)
self.assertTrue(type(lr.getMaxIter()) == int)
self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))
def test_float(self):
lr = LogisticRegression(tol=1)
self.assertEqual(lr.getTol(), 1.0)
self.assertTrue(type(lr.getTol()) == float)
self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))
def test_vector(self):
ewp = ElementwiseProduct(scalingVec=[1, 3])
self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"]))
def test_list(self):
l = [0, 1]
for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
pyarray.array('l', l), xrange(2), tuple(l)]:
converted = TypeConverters.toList(lst_like)
self.assertEqual(type(converted), list)
self.assertListEqual(converted, l)
def test_list_int(self):
for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
pyarray.array('d', [1.0, 2.0])]:
vs = VectorSlicer(indices=indices)
self.assertListEqual(vs.getIndices(), [1, 2])
self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
def test_list_float(self):
b = Bucketizer(splits=[1, 4])
self.assertEqual(b.getSplits(), [1.0, 4.0])
self.assertTrue(all([type(v) == float for v in b.getSplits()]))
self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))
def test_list_string(self):
for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
idx_to_string = IndexToString(labels=labels)
self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
def test_string(self):
lr = LogisticRegression()
for col in ['features', u'features', np.str_('features')]:
lr.setFeaturesCol(col)
self.assertEqual(lr.getFeaturesCol(), 'features')
self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))
def test_bool(self):
self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))
class TestParams(HasMaxIter, HasInputCol, HasSeed):
"""
A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
"""
@keyword_only
def __init__(self, seed=None):
super(TestParams, self).__init__()
self._setDefault(maxIter=10)
kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
def setParams(self, seed=None):
"""
setParams(self, seed=None)
Sets params for this test.
"""
kwargs = self._input_kwargs
return self._set(**kwargs)
class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
"""
A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
"""
@keyword_only
def __init__(self, seed=None):
super(OtherTestParams, self).__init__()
self._setDefault(maxIter=10)
kwargs = self._input_kwargs
self.setParams(**kwargs)
@keyword_only
def setParams(self, seed=None):
"""
setParams(self, seed=None)
Sets params for this test.
"""
kwargs = self._input_kwargs
return self._set(**kwargs)
class HasThrowableProperty(Params):
def __init__(self):
super(HasThrowableProperty, self).__init__()
self.p = Param(self, "none", "empty param")
@property
def test_property(self):
raise RuntimeError("Test property to raise error when invoked")
class ParamTests(SparkSessionTestCase):
def test_copy_new_parent(self):
testParams = TestParams()
# Copying an instantiated param should fail
with self.assertRaises(ValueError):
testParams.maxIter._copy_new_parent(testParams)
# Copying a dummy param should succeed
TestParams.maxIter._copy_new_parent(testParams)
maxIter = testParams.maxIter
self.assertEqual(maxIter.name, "maxIter")
self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
self.assertTrue(maxIter.parent == testParams.uid)
def test_param(self):
testParams = TestParams()
maxIter = testParams.maxIter
self.assertEqual(maxIter.name, "maxIter")
self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
self.assertTrue(maxIter.parent == testParams.uid)
def test_hasparam(self):
testParams = TestParams()
self.assertTrue(all([testParams.hasParam(p.name) for p in testParams.params]))
self.assertFalse(testParams.hasParam("notAParameter"))
self.assertTrue(testParams.hasParam(u"maxIter"))
def test_resolveparam(self):
testParams = TestParams()
self.assertEqual(testParams._resolveParam(testParams.maxIter), testParams.maxIter)
self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter)
self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter)
if sys.version_info[0] >= 3:
# In Python 3, it is allowed to get/set attributes with non-ascii characters.
e_cls = AttributeError
else:
e_cls = UnicodeEncodeError
self.assertRaises(e_cls, lambda: testParams._resolveParam(u""))
def test_params(self):
testParams = TestParams()
maxIter = testParams.maxIter
inputCol = testParams.inputCol
seed = testParams.seed
params = testParams.params
self.assertEqual(params, [inputCol, maxIter, seed])
self.assertTrue(testParams.hasParam(maxIter.name))
self.assertTrue(testParams.hasDefault(maxIter))
self.assertFalse(testParams.isSet(maxIter))
self.assertTrue(testParams.isDefined(maxIter))
self.assertEqual(testParams.getMaxIter(), 10)
testParams.setMaxIter(100)
self.assertTrue(testParams.isSet(maxIter))
self.assertEqual(testParams.getMaxIter(), 100)
self.assertTrue(testParams.hasParam(inputCol.name))
self.assertFalse(testParams.hasDefault(inputCol))
self.assertFalse(testParams.isSet(inputCol))
self.assertFalse(testParams.isDefined(inputCol))
with self.assertRaises(KeyError):
testParams.getInputCol()
otherParam = Param(Params._dummy(), "otherParam", "Parameter used to test that " +
"set raises an error for a non-member parameter.",
typeConverter=TypeConverters.toString)
with self.assertRaises(ValueError):
testParams.set(otherParam, "value")
# Since the default is normally random, set it to a known number for debug str
testParams._setDefault(seed=41)
testParams.setSeed(43)
self.assertEqual(
testParams.explainParams(),
"\n".join(["inputCol: input column name. (undefined)",
"maxIter: max number of iterations (>= 0). (default: 10, current: 100)",
"seed: random seed. (default: 41, current: 43)"]))
def test_kmeans_param(self):
algo = KMeans()
self.assertEqual(algo.getInitMode(), "k-means||")
algo.setK(10)
self.assertEqual(algo.getK(), 10)
algo.setInitSteps(10)
self.assertEqual(algo.getInitSteps(), 10)
self.assertEqual(algo.getDistanceMeasure(), "euclidean")
algo.setDistanceMeasure("cosine")
self.assertEqual(algo.getDistanceMeasure(), "cosine")
def test_hasseed(self):
noSeedSpecd = TestParams()
withSeedSpecd = TestParams(seed=42)
other = OtherTestParams()
# Check that we no longer use 42 as the magic number
self.assertNotEqual(noSeedSpecd.getSeed(), 42)
origSeed = noSeedSpecd.getSeed()
# Check that we only compute the seed once
self.assertEqual(noSeedSpecd.getSeed(), origSeed)
# Check that a specified seed is honored
self.assertEqual(withSeedSpecd.getSeed(), 42)
# Check that a different class has a different seed
self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed())
def test_param_property_error(self):
param_store = HasThrowableProperty()
self.assertRaises(RuntimeError, lambda: param_store.test_property)
params = param_store.params # should not invoke the property 'test_property'
self.assertEqual(len(params), 1)
def test_word2vec_param(self):
model = Word2Vec().setWindowSize(6)
# Check windowSize is set properly
self.assertEqual(model.getWindowSize(), 6)
def test_copy_param_extras(self):
tp = TestParams(seed=42)
extra = {tp.getParam(TestParams.inputCol.name): "copy_input"}
tp_copy = tp.copy(extra=extra)
self.assertEqual(tp.uid, tp_copy.uid)
self.assertEqual(tp.params, tp_copy.params)
for k, v in extra.items():
self.assertTrue(tp_copy.isDefined(k))
self.assertEqual(tp_copy.getOrDefault(k), v)
copied_no_extra = {}
for k, v in tp_copy._paramMap.items():
if k not in extra:
copied_no_extra[k] = v
self.assertEqual(tp._paramMap, copied_no_extra)
self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap)
def test_logistic_regression_check_thresholds(self):
self.assertIsInstance(
LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]),
LogisticRegression
)
self.assertRaisesRegexp(
ValueError,
"Logistic Regression getThreshold found inconsistent.*$",
LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
)
def test_preserve_set_state(self):
dataset = self.spark.createDataFrame([(0.5,)], ["data"])
binarizer = Binarizer(inputCol="data")
self.assertFalse(binarizer.isSet("threshold"))
binarizer.transform(dataset)
binarizer._transfer_params_from_java()
self.assertFalse(binarizer.isSet("threshold"),
"Params not explicitly set should remain unset after transform")
def test_default_params_transferred(self):
dataset = self.spark.createDataFrame([(0.5,)], ["data"])
binarizer = Binarizer(inputCol="data")
# intentionally change the pyspark default, but don't set it
binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
result = binarizer.transform(dataset).select("my_default").collect()
self.assertFalse(binarizer.isSet(binarizer.outputCol))
self.assertEqual(result[0][0], 1.0)
class DefaultValuesTests(PySparkTestCase):
"""
Test :py:class:`JavaParams` classes to see if their default Param values match
those in their Scala counterparts.
"""
def test_java_params(self):
import pyspark.ml.feature
import pyspark.ml.classification
import pyspark.ml.clustering
import pyspark.ml.evaluation
import pyspark.ml.pipeline
import pyspark.ml.recommendation
import pyspark.ml.regression
modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
pyspark.ml.regression]
for module in modules:
for name, cls in inspect.getmembers(module, inspect.isclass):
if not name.endswith('Model') and not name.endswith('Params') \
and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
# NOTE: disable check_params_exist until there is parity with Scala API
check_params(self, cls(), check_params_exist=False)
# Additional classes that need explicit construction
from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
check_params_exist=False)
check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
check_params_exist=False)
if __name__ == "__main__":
from pyspark.ml.tests.test_param import *
try:
import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)