
395 lines
16 KiB
Raw Normal View History

# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import array as pyarray
import unittest

import numpy as np

from pyspark import keyword_only
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Binarizer, Bucketizer, ElementwiseProduct, IndexToString, \
    MaxAbsScaler, VectorSlicer, Word2Vec
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed
from pyspark.ml.regression import LinearRegressionModel, GeneralizedLinearRegressionModel
from pyspark.ml.wrapper import JavaParams
from pyspark.testing.mlutils import check_params, PySparkTestCase, SparkSessionTestCase


class ParamTypeConversionTests(PySparkTestCase):
    """
    Test that param type conversion happens.
    """

    # NOTE: test methods are documented with comments rather than docstrings
    # on purpose: unittest prints a test's docstring instead of its name in
    # verbose mode, which would change the runner output.

    def test_int(self):
        # An integral float is narrowed to a plain int; non-integral floats
        # and strings are rejected.
        lr = LogisticRegression(maxIter=5.0)
        self.assertEqual(lr.getMaxIter(), 5)
        # Exact type check (not isinstance): the converted value must be a
        # builtin int, not merely something int-like.
        self.assertIs(type(lr.getMaxIter()), int)
        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))

    def test_float(self):
        # An int is widened to a builtin float; strings are rejected.
        lr = LogisticRegression(tol=1)
        self.assertEqual(lr.getTol(), 1.0)
        self.assertIs(type(lr.getTol()), float)
        self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))

    def test_vector(self):
        # Lists and numpy arrays of numbers become DenseVector.
        ewp = ElementwiseProduct(scalingVec=[1, 3])
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
        ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
        self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"]))

    def test_list(self):
        # Every supported list-like input converts to a builtin list with
        # the same elements.
        expected = [0, 1]
        list_likes = [
            expected,
            np.array(expected),
            DenseVector(expected),
            SparseVector(len(expected), range(len(expected)), expected),
            pyarray.array('l', expected),
            range(2),
            tuple(expected),
        ]
        for lst_like in list_likes:
            converted = TypeConverters.toList(lst_like)
            self.assertEqual(type(converted), list)
            self.assertListEqual(converted, expected)

    def test_list_int(self):
        # Integral floats inside any list-like are converted to builtin ints.
        for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                        SparseVector(2, {0: 1.0, 1: 2.0}), range(1, 3), (1.0, 2.0),
                        pyarray.array('d', [1.0, 2.0])]:
            vs = VectorSlicer(indices=indices)
            self.assertListEqual(vs.getIndices(), [1, 2])
            self.assertTrue(all(type(v) is int for v in vs.getIndices()))
        self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))

    def test_list_float(self):
        # Ints inside a list are converted to builtin floats.
        b = Bucketizer(splits=[1, 4])
        self.assertEqual(b.getSplits(), [1.0, 4.0])
        self.assertTrue(all(type(v) is float for v in b.getSplits()))
        self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))

    def test_list_list_float(self):
        # Nested lists are converted element-wise; a flat list or a nested
        # non-numeric value is rejected.
        b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
        self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
        self.assertTrue(all(type(v) is list for v in b.getSplitsArray()))
        self.assertTrue(all(type(v) is float for v in b.getSplitsArray()[0]))
        self.assertTrue(all(type(v) is float for v in b.getSplitsArray()[1]))
        self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
        self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))

    def test_list_string(self):
        # Lists and numpy arrays of strings become a list of str.
        for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
            idx_to_string = IndexToString(labels=labels)
            self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
        self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))

    def test_string(self):
        # str, unicode and numpy string inputs are all accepted.
        lr = LogisticRegression()
        for col in ['features', u'features', np.str_('features')]:
            # NOTE(review): restored -- the listing never used `col`, so the
            # loop exercised nothing; confirm against upstream.
            lr.setFeaturesCol(col)
            self.assertEqual(lr.getFeaturesCol(), 'features')
        self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))

    def test_bool(self):
        # Booleans are not coerced from ints or strings.
        self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
        self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))


class TestParams(HasMaxIter, HasInputCol, HasSeed):
    """
    A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
    """

    # NOTE(review): the @keyword_only decorators, the maxIter default and the
    # setParams call were absent from the extracted listing. They are restored
    # because `_input_kwargs` is only populated by @keyword_only and the tests
    # in this file expect a maxIter default of 10 -- confirm against upstream.
    @keyword_only
    def __init__(self, seed=None):
        super(TestParams, self).__init__()
        self._setDefault(maxIter=10)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, seed=None):
        """
        setParams(self, seed=None)
        Sets params for this test.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)


class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
    """
    A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed.
    """

    # NOTE(review): the @keyword_only decorators, the maxIter default and the
    # setParams call were absent from the extracted listing. The decorators
    # are required because `_input_kwargs` is only populated by @keyword_only;
    # the default is presumably 10 to match the sibling helper class --
    # confirm against upstream.
    @keyword_only
    def __init__(self, seed=None):
        super(OtherTestParams, self).__init__()
        self._setDefault(maxIter=10)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, seed=None):
        """
        setParams(self, seed=None)
        Sets params for this test.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)


class HasThrowableProperty(Params):
    """
    Params holder with one Param and one property whose getter always raises.
    """

    def __init__(self):
        super(HasThrowableProperty, self).__init__()
        self.p = Param(self, "none", "empty param")

    # NOTE(review): @property restored -- it was absent from the extracted
    # listing, but callers in this file read `test_property` as an attribute
    # and expect the read itself to raise.
    @property
    def test_property(self):
        raise RuntimeError("Test property to raise error when invoked")


class ParamTests(SparkSessionTestCase):
    """
    Tests for Param/Params behaviour: lookup, defaults, copying and seeds.
    """

    # NOTE: test methods are documented with comments rather than docstrings
    # on purpose: unittest prints a test's docstring instead of its name in
    # verbose mode, which would change the runner output.
    #
    # Lines tagged NOTE(review) were missing from the extracted listing and
    # have been restored from the surrounding assertions; confirm each one
    # against upstream.

    def test_copy_new_parent(self):
        testParams = TestParams()
        # Copying an instantiated param should fail
        with self.assertRaises(ValueError):
            # NOTE(review): restored body of the `with` block.
            testParams.maxIter._copy_new_parent(testParams)
        # Copying a dummy param should succeed
        # NOTE(review): restored call (class-level param is the dummy one).
        TestParams.maxIter._copy_new_parent(testParams)
        maxIter = testParams.maxIter
        # NOTE(review): `maxIter.name` restored; listing had an empty argument.
        self.assertEqual(maxIter.name, "maxIter")
        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
        self.assertTrue(maxIter.parent == testParams.uid)

    def test_param(self):
        testParams = TestParams()
        maxIter = testParams.maxIter
        # NOTE(review): `maxIter.name` restored; listing had an empty argument.
        self.assertEqual(maxIter.name, "maxIter")
        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
        self.assertTrue(maxIter.parent == testParams.uid)

    def test_hasparam(self):
        testParams = TestParams()
        # NOTE(review): `p.name` restored; listing had an empty argument.
        self.assertTrue(all(testParams.hasParam(p.name) for p in testParams.params))

    def test_resolveparam(self):
        # A Param resolves to itself and a name resolves to the Param.
        testParams = TestParams()
        self.assertEqual(testParams._resolveParam(testParams.maxIter), testParams.maxIter)
        self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter)
        self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter)
        self.assertRaises(AttributeError, lambda: testParams._resolveParam(u""))
        # Invalid type
        invalid_type = 1
        self.assertRaises(TypeError, testParams._resolveParam, invalid_type)

    def test_params(self):
        testParams = TestParams()
        maxIter = testParams.maxIter
        inputCol = testParams.inputCol
        seed = testParams.seed

        params = testParams.params
        self.assertEqual(params, [inputCol, maxIter, seed])

        self.assertEqual(testParams.getMaxIter(), 10)

        with self.assertRaises(KeyError):
            # NOTE(review): restored body -- inputCol has no value or default,
            # which matches the "(undefined)" line asserted below.
            testParams.getInputCol()

        # NOTE(review): the final argument and closing parenthesis were
        # missing; typeConverter is the only remaining Param argument, and
        # toString is presumed -- confirm.
        otherParam = Param(Params._dummy(), "otherParam", "Parameter used to test that " +
                           "set raises an error for a non-member parameter.",
                           typeConverter=TypeConverters.toString)
        with self.assertRaises(ValueError):
            testParams.set(otherParam, "value")

        # Since the default is normally random, set it to a known number for debug str
        # NOTE(review): restored -- the expected text below says "default: 41".
        testParams._setDefault(seed=41)

        # NOTE(review): restored assertion head; only the expected string
        # survived in the listing.
        self.assertEqual(
            testParams.explainParams(),
            "\n".join(["inputCol: input column name. (undefined)",
                       "maxIter: max number of iterations (>= 0). (default: 10)",
                       "seed: random seed. (default: 41)"]))

    def test_clear_param(self):
        df = self.spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
        maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
        # NOTE(review): right-hand side restored; listing had a bare `model =`.
        model = maScaler.fit(df)
        self.assertEqual(model.getOutputCol(), "scaled")
        # NOTE(review): restored -- without clearing, the assertion above and
        # the one below contradict each other.
        model.clear(model.outputCol)
        # Once cleared, the output column falls back to its uid-based default.
        self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')
        output = model.transform(df)
        self.assertEqual(model.getOutputCol(), output.schema.names[1])

    def test_kmeans_param(self):
        algo = KMeans()
        self.assertEqual(algo.getInitMode(), "k-means||")
        # NOTE(review): the three setter calls below are restored. 10 is
        # presumably not the default for k or initSteps, and the two
        # distance-measure assertions contradict each other without a setter
        # in between -- confirm.
        algo.setK(10)
        self.assertEqual(algo.getK(), 10)
        algo.setInitSteps(10)
        self.assertEqual(algo.getInitSteps(), 10)
        self.assertEqual(algo.getDistanceMeasure(), "euclidean")
        algo.setDistanceMeasure("cosine")
        self.assertEqual(algo.getDistanceMeasure(), "cosine")

    def test_hasseed(self):
        noSeedSpecd = TestParams()
        withSeedSpecd = TestParams(seed=42)
        other = OtherTestParams()
        # Check that we no longer use 42 as the magic number
        self.assertNotEqual(noSeedSpecd.getSeed(), 42)
        origSeed = noSeedSpecd.getSeed()
        # Check that we only compute the seed once
        self.assertEqual(noSeedSpecd.getSeed(), origSeed)
        # Check that a specified seed is honored
        self.assertEqual(withSeedSpecd.getSeed(), 42)
        # Check that a different class has a different seed
        self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed())

    def test_param_property_error(self):
        param_store = HasThrowableProperty()
        self.assertRaises(RuntimeError, lambda: param_store.test_property)
        params = param_store.params  # should not invoke the property 'test_property'
        self.assertEqual(len(params), 1)

    def test_word2vec_param(self):
        model = Word2Vec().setWindowSize(6)
        # Check windowSize is set properly
        self.assertEqual(model.getWindowSize(), 6)

    def test_copy_param_extras(self):
        tp = TestParams(seed=42)
        # NOTE(review): dict key restored; the listing was truncated after
        # `tp.getParam(`. inputCol is presumed since the value is "copy_input".
        extra = {tp.getParam(TestParams.inputCol.name): "copy_input"}
        tp_copy = tp.copy(extra=extra)
        self.assertEqual(tp.uid, tp_copy.uid)
        self.assertEqual(tp.params, tp_copy.params)
        for k, v in extra.items():
            self.assertEqual(tp_copy.getOrDefault(k), v)
        # Apart from the extras, the copy must carry exactly the same values.
        copied_no_extra = {k: v for k, v in tp_copy._paramMap.items() if k not in extra}
        self.assertEqual(tp._paramMap, copied_no_extra)
        self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap)

        with self.assertRaises(TypeError):
            tp.copy(extra={"unknown_parameter": None})

        with self.assertRaises(TypeError):
            tp.copy(extra=["must be a dict"])

    def test_logistic_regression_check_thresholds(self):
        # NOTE(review): both assertion heads are restored; only their
        # arguments survived in the listing. ValueError is presumed.
        self.assertIsInstance(
            LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]),
            LogisticRegression
        )

        # assertRaisesRegex is used because the older assertRaisesRegexp
        # alias is deprecated and removed in Python 3.12.
        self.assertRaisesRegex(
            ValueError,
            "Logistic Regression getThreshold found inconsistent.*$",
            LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
        )

    def test_preserve_set_state(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        # NOTE(review): the four statements below are restored around the
        # surviving failure message; "threshold" is presumed to be the param
        # under test -- confirm.
        self.assertFalse(binarizer.isSet("threshold"))
        binarizer.transform(dataset)
        binarizer._transfer_params_from_java()
        self.assertFalse(binarizer.isSet("threshold"),
                         "Params not explicitly set should remain unset after transform")

    def test_default_params_transferred(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        # intentionally change the pyspark default, but don't set it
        binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
        result = binarizer.transform(dataset).select("my_default").collect()
        self.assertEqual(result[0][0], 1.0)

    def test_lr_evaluate_invaild_type(self):
        lr = LinearRegressionModel()
        invalid_type = ""
        self.assertRaises(TypeError, lr.evaluate, invalid_type)

    def test_glr_evaluate_invaild_type(self):
        glr = GeneralizedLinearRegressionModel()
        invalid_type = ""
        self.assertRaises(TypeError, glr.evaluate, invalid_type)


class DefaultValuesTests(PySparkTestCase):
    """
    Test :py:class:`JavaParams` classes to see if their default Param values match
    those in their Scala counterparts.
    """

    def test_java_params(self):
        # Documented with a comment rather than a docstring: unittest prints
        # a test's docstring instead of its name in verbose mode.
        import re

        # NOTE(review): the seven module references were stripped from the
        # extracted listing (it showed `[,,,,,,]`); the set below is presumed
        # -- confirm against upstream.
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
                   pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
                   pyspark.ml.regression]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                # Only concrete, public JavaParams estimators/transformers are
                # checked: models, *Params mixins, abstract classes, the
                # (_)Java* bases and the two named private bases are skipped.
                if (not name.endswith('Model') and not name.endswith('Params')
                        and issubclass(cls, JavaParams) and not inspect.isabstract(cls)
                        and not re.match("_?Java", name) and name != '_LSH'
                        and name != '_Selector'):
                    check_params(self, cls(), check_params_exist=True)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        # NOTE(review): the trailing argument of both calls was missing from
        # the listing; check_params_exist=True is presumed, matching the call
        # in the loop above -- confirm.
        check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=True)
        check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=True)


if __name__ == "__main__":
    # NOTE(review): the module path was stripped from the extracted listing;
    # this file appears to be pyspark/ml/tests/test_param.py -- confirm.
    from pyspark.ml.tests.test_param import *  # noqa: F401

    # Prefer the XML-reporting runner when it is installed; otherwise let
    # unittest fall back to its default text runner.
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)