spark-instrumented-optimizer/python/pyspark/ml/tests/test_training_summary.py
Fokko Driesprong 9fcf0ea718 [SPARK-32319][PYSPARK] Disallow the use of unused imports
Disallow the use of unused imports:

- Unnecessary increases the memory footprint of the application
- Removes the imports that are required for the examples in the docstring from the file-scope to the example itself. This keeps the files itself clean, and gives a more complete example as it also includes the imports :)

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused
python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused
python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused
python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused
python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused
python/pyspark/rdd.py:21:1: F401 're' imported but unused
python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused
python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused
python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused
python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but unused
python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused
python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused
python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused
python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused
python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused
python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused
python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused
python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused
python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused
python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused
python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused
python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused
python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused
python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused
python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused
python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused
python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused
python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused
python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' imported but unused
python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused
python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused
python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused
python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused
python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused
python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused
python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused
python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused
python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused
python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused
python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused
python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused
python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused
python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused
python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused
python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused
python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused
python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused
python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused
python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused
python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused
python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused
python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused
python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused
python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused
python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused
python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused
python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused
python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused
python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused
python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused
python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused
python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' imported but unused
python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused
python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused
python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused
python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused
python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused
python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused
python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused
python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused
python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused
python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused
python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused
python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused
python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused
python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused
python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused
```

After:
```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
fokkodriesprongFan spark %
```

### What changes were proposed in this pull request?

Removing unused imports from the Python files to keep everything nice and tidy.

### Why are the changes needed?

Cleaning up of the imports that aren't used, and suppressing the imports that are used as references to other modules, preserving backward compatibility.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Adding the rule to the existing Flake8 checks.

Closes #29121 from Fokko/SPARK-32319.

Authored-by: Fokko Driesprong <fokko@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
2020-08-08 08:51:57 -07:00

453 lines
25 KiB
Python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
from pyspark.ml.classification import BinaryLogisticRegressionSummary, \
BinaryRandomForestClassificationSummary, FMClassifier, \
FMClassificationSummary, LinearSVC, LinearSVCSummary, \
LogisticRegression, LogisticRegressionSummary, \
MultilayerPerceptronClassifier, MultilayerPerceptronClassificationSummary, \
RandomForestClassificationSummary, RandomForestClassifier
from pyspark.ml.clustering import BisectingKMeans, GaussianMixture, KMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
from pyspark.sql import DataFrame
from pyspark.testing.mlutils import SparkSessionTestCase
class TrainingSummaryTest(SparkSessionTestCase):
def test_linear_regression_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
fitIntercept=False)
model = lr.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertEqual(s.totalIterations, 0)
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.predictionCol, "prediction")
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.featuresCol, "features")
objHist = s.objectiveHistory
self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
self.assertAlmostEqual(s.meanSquaredError, 0.0)
self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
self.assertAlmostEqual(s.r2, 1.0, 2)
self.assertAlmostEqual(s.r2adj, 1.0, 2)
self.assertTrue(isinstance(s.residuals, DataFrame))
self.assertEqual(s.numInstances, 2)
self.assertEqual(s.degreesOfFreedom, 1)
devResiduals = s.devianceResiduals
self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
coefStdErr = s.coefficientStandardErrors
self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
tValues = s.tValues
self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
pValues = s.pValues
self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned
# The child class LinearRegressionTrainingSummary runs full test
sameSummary = model.evaluate(df)
self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
def test_glr_summary(self):
from pyspark.ml.linalg import Vectors
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
fitIntercept=False)
model = glr.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertEqual(s.numIterations, 1) # this should default to a single iteration of WLS
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.predictionCol, "prediction")
self.assertEqual(s.numInstances, 2)
self.assertTrue(isinstance(s.residuals(), DataFrame))
self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
coefStdErr = s.coefficientStandardErrors
self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
tValues = s.tValues
self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
pValues = s.pValues
self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
self.assertEqual(s.degreesOfFreedom, 1)
self.assertEqual(s.residualDegreeOfFreedom, 1)
self.assertEqual(s.residualDegreeOfFreedomNull, 2)
self.assertEqual(s.rank, 1)
self.assertTrue(isinstance(s.solver, str))
self.assertTrue(isinstance(s.aic, float))
self.assertTrue(isinstance(s.deviance, float))
self.assertTrue(isinstance(s.nullDeviance, float))
self.assertTrue(isinstance(s.dispersion, float))
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned
# The child class GeneralizedLinearRegressionTrainingSummary runs full test
sameSummary = model.evaluate(df)
self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def test_binary_logistic_regression_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
model = lr.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.probabilityCol, "probability")
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.featuresCol, "features")
self.assertEqual(s.predictionCol, "prediction")
objHist = s.objectiveHistory
self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
self.assertGreater(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertTrue(isinstance(s.roc, DataFrame))
self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
self.assertTrue(isinstance(s.pr, DataFrame))
self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
self.assertAlmostEqual(s.accuracy, 1.0, 2)
self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary))
self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_multiclass_logistic_regression_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], [])),
(2.0, 2.0, Vectors.dense(2.0)),
(2.0, 2.0, Vectors.dense(1.9))],
["label", "weight", "features"])
lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
model = lr.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.probabilityCol, "probability")
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.featuresCol, "features")
self.assertEqual(s.predictionCol, "prediction")
objHist = s.objectiveHistory
self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
self.assertGreater(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertAlmostEqual(s.accuracy, 0.75, 2)
self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary))
self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary))
self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
def test_linear_svc_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0, 1.0, 1.0)),
(0.0, 2.0, Vectors.dense(1.0, 2.0, 3.0))],
["label", "weight", "features"])
svc = LinearSVC(maxIter=5, weightCol="weight")
model = svc.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary()
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.scoreCol, "rawPrediction")
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.predictionCol, "prediction")
objHist = s.objectiveHistory
self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
self.assertGreater(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertTrue(isinstance(s.roc, DataFrame))
self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
self.assertTrue(isinstance(s.pr, DataFrame))
self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
print(s.weightedTruePositiveRate)
self.assertAlmostEqual(s.weightedTruePositiveRate, 0.5, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.5, 2)
self.assertAlmostEqual(s.weightedRecall, 0.5, 2)
self.assertAlmostEqual(s.weightedPrecision, 0.25, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 0.3333333333333333, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.3333333333333333, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, LinearSVCSummary))
self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_binary_randomforest_classification_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
rf = RandomForestClassifier(weightCol="weight")
model = rf.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.predictionCol, "prediction")
self.assertEqual(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertTrue(isinstance(s.roc, DataFrame))
self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
self.assertTrue(isinstance(s.pr, DataFrame))
self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
self.assertAlmostEqual(s.accuracy, 1.0, 2)
self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, BinaryRandomForestClassificationSummary))
self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_multiclass_randomforest_classification_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], [])),
(2.0, 2.0, Vectors.dense(2.0)),
(2.0, 2.0, Vectors.dense(1.9))],
["label", "weight", "features"])
rf = RandomForestClassifier(weightCol="weight")
model = rf.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.predictionCol, "prediction")
self.assertEqual(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertAlmostEqual(s.accuracy, 1.0, 2)
self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, RandomForestClassificationSummary))
self.assertFalse(isinstance(sameSummary, BinaryRandomForestClassificationSummary))
self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
def test_fm_classification_summary(self):
df = self.spark.createDataFrame([(1.0, Vectors.dense(2.0)),
(0.0, Vectors.dense(2.0)),
(0.0, Vectors.dense(6.0)),
(1.0, Vectors.dense(3.0))
],
["label", "features"])
fm = FMClassifier(maxIter=5)
model = fm.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary()
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.scoreCol, "probability")
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.predictionCol, "prediction")
objHist = s.objectiveHistory
self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
self.assertGreater(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertTrue(isinstance(s.roc, DataFrame))
self.assertAlmostEqual(s.areaUnderROC, 0.625, 2)
self.assertTrue(isinstance(s.pr, DataFrame))
self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
self.assertAlmostEqual(s.weightedPrecision, 0.8333333333333333, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 0.7333333333333334, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.7333333333333334, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, FMClassificationSummary))
self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_mlp_classification_summary(self):
df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
(1.0, Vectors.dense([0.0, 1.0])),
(1.0, Vectors.dense([1.0, 0.0])),
(0.0, Vectors.dense([1.0, 1.0]))
],
["label", "features"])
mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
model = mlp.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary()
# test that api is callable and returns expected types
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.predictionCol, "prediction")
self.assertGreater(s.totalIterations, 0)
self.assertTrue(isinstance(s.labels, list))
self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
self.assertTrue(isinstance(s.precisionByLabel, list))
self.assertTrue(isinstance(s.recallByLabel, list))
self.assertTrue(isinstance(s.fMeasureByLabel(), list))
self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
self.assertAlmostEqual(s.accuracy, 1.0, 2)
self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned, Scala version runs full test
sameSummary = model.evaluate(df)
self.assertTrue(isinstance(sameSummary, MultilayerPerceptronClassificationSummary))
self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
def test_gaussian_mixture_summary(self):
data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
(Vectors.sparse(1, [], []),)]
df = self.spark.createDataFrame(data, ["features"])
gmm = GaussianMixture(k=2)
model = gmm.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.probabilityCol, "probability")
self.assertTrue(isinstance(s.probability, DataFrame))
self.assertEqual(s.featuresCol, "features")
self.assertEqual(s.predictionCol, "prediction")
self.assertTrue(isinstance(s.cluster, DataFrame))
self.assertEqual(len(s.clusterSizes), 2)
self.assertEqual(s.k, 2)
self.assertEqual(s.numIter, 3)
def test_bisecting_kmeans_summary(self):
data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
(Vectors.sparse(1, [], []),)]
df = self.spark.createDataFrame(data, ["features"])
bkm = BisectingKMeans(k=2)
model = bkm.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.featuresCol, "features")
self.assertEqual(s.predictionCol, "prediction")
self.assertTrue(isinstance(s.cluster, DataFrame))
self.assertEqual(len(s.clusterSizes), 2)
self.assertEqual(s.k, 2)
self.assertEqual(s.numIter, 20)
def test_kmeans_summary(self):
data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
(Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = self.spark.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.featuresCol, "features")
self.assertEqual(s.predictionCol, "prediction")
self.assertTrue(isinstance(s.cluster, DataFrame))
self.assertEqual(len(s.clusterSizes), 2)
self.assertEqual(s.k, 2)
self.assertEqual(s.numIter, 1)
if __name__ == "__main__":
from pyspark.ml.tests.test_training_summary import * # noqa: F401
try:
import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)