9fcf0ea718
Disallow the use of unused imports:

- They unnecessarily increase the memory footprint of the application.
- This change also moves imports that are only required for docstring examples from file scope into the example itself. That keeps the files themselves clean, and gives a more complete example, since it includes the imports too :)

Before:

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused
python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused
python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused
python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused
python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused
python/pyspark/rdd.py:21:1: F401 're' imported but unused
python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused
python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused
python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused
python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but unused
python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused
python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused
python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused
python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused
python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused
python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused
python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused
python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused
python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused
python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused
python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused
python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused
python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused
python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused
python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused
python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused
python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused
python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused
python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' imported but unused
python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused
python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused
python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused
python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused
python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused
python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused
python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused
python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused
python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused
python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused
python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused
python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused
python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused
python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused
python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused
python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused
python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused
python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused
python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused
python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused
python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused
python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused
python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused
python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused
python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused
python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused
python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused
python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused
python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused
python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused
python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused
python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused
python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' imported but unused
python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused
python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused
python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused
python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused
python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused
python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused
python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused
python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused
python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused
python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused
python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused
python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused
python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused
python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused
python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused
```

After:

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
fokkodriesprongFan spark %
```

### What changes were proposed in this pull request?

Removing unused imports from the Python files to keep everything nice and tidy.

### Why are the changes needed?

Cleaning up imports that aren't used, and suppressing the warning for imports that are kept only as references for other modules, preserving backward compatibility.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

By adding the rule to the existing Flake8 checks.

Closes #29121 from Fokko/SPARK-32319.

Authored-by: Fokko Driesprong <fokko@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
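The backward-compatibility half of the change relies on flake8's per-line suppression comment. A minimal sketch of the pattern (the module `api.py` and its contents are illustrative, not the actual Spark diff): an import that exists only so downstream code can import the name from this location is kept with the F401 warning silenced inline, while a genuinely unused import is simply deleted.

```python
# api.py -- a hedged sketch of the re-export pattern, not the actual Spark diff.
# `OrderedDict` is never referenced inside this module, so flake8 reports:
#     api.py:6:1: F401 'collections.OrderedDict' imported but unused
# But callers may rely on `from api import OrderedDict`, so rather than
# deleting the import (a breaking change), the warning is suppressed in place.
from collections import OrderedDict  # noqa: F401

# A genuinely unused import, by contrast, is simply deleted:
# import sys  # <- removed; nothing references it and nothing re-exports it
```

Entries in the list above such as `'pyspark.sql.SQLContext' imported but unused` in `pyspark/__init__.py` appear to be re-exports of this kind, so they were suppressed rather than removed; the `from ... import *  # noqa: F401` at the bottom of each `test_*.py` file (see the end of the file below) is the same idea applied to test discovery.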
python/pyspark/ml/tests/test_persistence.py (464 lines, 19 KiB, Python):

```python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
from shutil import rmtree
import tempfile
import unittest

from pyspark.ml import Transformer
from pyspark.ml.classification import DecisionTreeClassifier, FMClassifier, \
    FMClassificationModel, LogisticRegression, MultilayerPerceptronClassifier, \
    MultilayerPerceptronClassificationModel, OneVsRest, OneVsRestModel
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Binarizer, HashingTF, PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.param import Params
from pyspark.ml.pipeline import Pipeline, PipelineModel
from pyspark.ml.regression import DecisionTreeRegressor, GeneralizedLinearRegression, \
    GeneralizedLinearRegressionModel, \
    LinearRegression
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWriter
from pyspark.ml.wrapper import JavaParams
from pyspark.testing.mlutils import MockUnaryTransformer, SparkSessionTestCase


class TestDefaultSolver(SparkSessionTestCase):

    def test_multilayer_load(self):
        df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                         (1.0, Vectors.dense([0.0, 1.0])),
                                         (1.0, Vectors.dense([1.0, 0.0])),
                                         (0.0, Vectors.dense([1.0, 1.0]))],
                                        ["label", "features"])

        mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
        model = mlp.fit(df)
        self.assertEqual(model.getSolver(), "l-bfgs")
        transformed1 = model.transform(df)
        path = tempfile.mkdtemp()
        model_path = path + "/mlp"
        model.save(model_path)
        model2 = MultilayerPerceptronClassificationModel.load(model_path)
        self.assertEqual(model2.getSolver(), "l-bfgs")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))

    def test_fm_load(self):
        df = self.spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                         (0.0, Vectors.sparse(1, [], []))],
                                        ["label", "features"])
        fm = FMClassifier(factorSize=2, maxIter=50, stepSize=2.0)
        model = fm.fit(df)
        self.assertEqual(model.getSolver(), "adamW")
        transformed1 = model.transform(df)
        path = tempfile.mkdtemp()
        model_path = path + "/fm"
        model.save(model_path)
        model2 = FMClassificationModel.load(model_path)
        self.assertEqual(model2.getSolver(), "adamW")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(2), transformed2.take(2))

    def test_glr_load(self):
        df = self.spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                                         (1.0, Vectors.dense(1.0, 2.0)),
                                         (2.0, Vectors.dense(0.0, 0.0)),
                                         (2.0, Vectors.dense(1.0, 1.0))],
                                        ["label", "features"])
        glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p")
        model = glr.fit(df)
        self.assertEqual(model.getSolver(), "irls")
        transformed1 = model.transform(df)
        path = tempfile.mkdtemp()
        model_path = path + "/glr"
        model.save(model_path)
        model2 = GeneralizedLinearRegressionModel.load(model_path)
        self.assertEqual(model2.getSolver(), "irls")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))


class PersistenceTest(SparkSessionTestCase):

    def test_linear_regression(self):
        lr = LinearRegression(maxIter=1)
        path = tempfile.mkdtemp()
        lr_path = path + "/lr"
        lr.save(lr_path)
        lr2 = LinearRegression.load(lr_path)
        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(type(lr.uid), type(lr2.uid))
        self.assertEqual(lr2.uid, lr2.maxIter.parent,
                         "Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)"
                         % (lr2.uid, lr2.maxIter.parent))
        self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                         "Loaded LinearRegression instance default params did not match " +
                         "original defaults")
        try:
            rmtree(path)
        except OSError:
            pass

    def test_linear_regression_pmml_basic(self):
        # Most of the validation is done on the Scala side; here we just check
        # that we output text rather than parquet (e.g. that the format flag
        # was respected).
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1)
        model = lr.fit(df)
        path = tempfile.mkdtemp()
        lr_path = path + "/lr-pmml"
        model.write().format("pmml").save(lr_path)
        pmml_text_list = self.sc.textFile(lr_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text)

    def test_logistic_regression(self):
        lr = LogisticRegression(maxIter=1)
        path = tempfile.mkdtemp()
        lr_path = path + "/logreg"
        lr.save(lr_path)
        lr2 = LogisticRegression.load(lr_path)
        self.assertEqual(lr2.uid, lr2.maxIter.parent,
                         "Loaded LogisticRegression instance uid (%s) "
                         "did not match Param's uid (%s)"
                         % (lr2.uid, lr2.maxIter.parent))
        self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                         "Loaded LogisticRegression instance default params did not match " +
                         "original defaults")
        try:
            rmtree(path)
        except OSError:
            pass

    def test_kmeans(self):
        kmeans = KMeans(k=2, seed=1)
        path = tempfile.mkdtemp()
        km_path = path + "/km"
        kmeans.save(km_path)
        kmeans2 = KMeans.load(km_path)
        self.assertEqual(kmeans.uid, kmeans2.uid)
        self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
        self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                         "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                         % (kmeans2.uid, kmeans2.k.parent))
        self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                         "Loaded KMeans instance default params did not match " +
                         "original defaults")
        try:
            rmtree(path)
        except OSError:
            pass

    def test_kmean_pmml_basic(self):
        # Most of the validation is done on the Scala side; here we just check
        # that we output text rather than parquet (e.g. that the format flag
        # was respected).
        data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
                (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
        df = self.spark.createDataFrame(data, ["features"])
        kmeans = KMeans(k=2, seed=1)
        model = kmeans.fit(df)
        path = tempfile.mkdtemp()
        km_path = path + "/km-pmml"
        model.write().format("pmml").save(km_path)
        pmml_text_list = self.sc.textFile(km_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text)

    def _compare_params(self, m1, m2, param):
        """
        Compare 2 ML Params instances for the given param, and assert both have the same param value
        and parent. The param must be a parameter of m1.
        """
        # Prevent key-not-found errors when the param is in neither paramMap nor defaultParamMap.
        if m1.isDefined(param):
            paramValue1 = m1.getOrDefault(param)
            paramValue2 = m2.getOrDefault(m2.getParam(param.name))
            if isinstance(paramValue1, Params):
                self._compare_pipelines(paramValue1, paramValue2)
            else:
                self.assertEqual(paramValue1, paramValue2)  # for general types param
            # Assert parents are equal
            self.assertEqual(param.parent, m2.getParam(param.name).parent)
        else:
            # If the param is not defined on m1, it should not be defined on m2 either. See SPARK-14931.
            self.assertFalse(m2.isDefined(m2.getParam(param.name)))

    def _compare_pipelines(self, m1, m2):
        """
        Compare 2 ML types, asserting that they are equivalent.
        This currently supports:
        - basic types
        - Pipeline, PipelineModel
        - OneVsRest, OneVsRestModel
        This checks:
        - uid
        - type
        - Param values and parents
        """
        self.assertEqual(m1.uid, m2.uid)
        self.assertEqual(type(m1), type(m2))
        if isinstance(m1, JavaParams) or isinstance(m1, Transformer):
            self.assertEqual(len(m1.params), len(m2.params))
            for p in m1.params:
                self._compare_params(m1, m2, p)
        elif isinstance(m1, Pipeline):
            self.assertEqual(len(m1.getStages()), len(m2.getStages()))
            for s1, s2 in zip(m1.getStages(), m2.getStages()):
                self._compare_pipelines(s1, s2)
        elif isinstance(m1, PipelineModel):
            self.assertEqual(len(m1.stages), len(m2.stages))
            for s1, s2 in zip(m1.stages, m2.stages):
                self._compare_pipelines(s1, s2)
        elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel):
            for p in m1.params:
                self._compare_params(m1, m2, p)
            if isinstance(m1, OneVsRestModel):
                self.assertEqual(len(m1.models), len(m2.models))
                for x, y in zip(m1.models, m2.models):
                    self._compare_pipelines(x, y)
        else:
            raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1))

    def test_pipeline_persistence(self):
        """
        Pipeline[HashingTF, PCA]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass

    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass

    def test_python_transformer_pipeline_persistence(self):
        """
        Pipeline[MockUnaryTransformer, Binarizer]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.range(0, 10).toDF('input')
            tf = MockUnaryTransformer(shiftVal=2)\
                .setInputCol("input").setOutputCol("shiftedInput")
            tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
            pl = Pipeline(stages=[tf, tf2])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass

    def test_onevsrest(self):
        temp_path = tempfile.mkdtemp()
        df = self.spark.createDataFrame([(0.0, 0.5, Vectors.dense(1.0, 0.8)),
                                         (1.0, 0.5, Vectors.sparse(2, [], [])),
                                         (2.0, 1.0, Vectors.dense(0.5, 0.5))] * 10,
                                        ["label", "wt", "features"])

        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)

        def reload_and_compare(ovr, suffix):
            model = ovr.fit(df)
            ovrPath = temp_path + "/{}".format(suffix)
            ovr.save(ovrPath)
            loadedOvr = OneVsRest.load(ovrPath)
            self._compare_pipelines(ovr, loadedOvr)
            modelPath = temp_path + "/{}Model".format(suffix)
            model.save(modelPath)
            loadedModel = OneVsRestModel.load(modelPath)
            self._compare_pipelines(model, loadedModel)

        reload_and_compare(OneVsRest(classifier=lr), "ovr")
        reload_and_compare(OneVsRest(classifier=lr).setWeightCol("wt"), "ovrw")

    def test_decisiontree_classifier(self):
        dt = DecisionTreeClassifier(maxDepth=1)
        path = tempfile.mkdtemp()
        dtc_path = path + "/dtc"
        dt.save(dtc_path)
        dt2 = DecisionTreeClassifier.load(dtc_path)
        self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                         "Loaded DecisionTreeClassifier instance uid (%s) "
                         "did not match Param's uid (%s)"
                         % (dt2.uid, dt2.maxDepth.parent))
        self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth],
                         "Loaded DecisionTreeClassifier instance default params did not match " +
                         "original defaults")
        try:
            rmtree(path)
        except OSError:
            pass

    def test_decisiontree_regressor(self):
        dt = DecisionTreeRegressor(maxDepth=1)
        path = tempfile.mkdtemp()
        dtr_path = path + "/dtr"
        dt.save(dtr_path)
        dt2 = DecisionTreeRegressor.load(dtr_path)
        self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                         "Loaded DecisionTreeRegressor instance uid (%s) "
                         "did not match Param's uid (%s)"
                         % (dt2.uid, dt2.maxDepth.parent))
        self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth],
                         "Loaded DecisionTreeRegressor instance default params did not match " +
                         "original defaults")
        try:
            rmtree(path)
        except OSError:
            pass

    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(.75)
        writer = DefaultParamsWriter(lr)

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())

    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(.75)

        # `threshold` is set by the user; the default param `predictionCol` is not.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # Manually create metadata without the `defaultParamMap` section.
        del metadata['defaultParamMap']
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata['sparkVersion'] = '2.3.0'
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)


if __name__ == "__main__":
    from pyspark.ml.tests.test_persistence import *  # noqa: F401

    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
```
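A note on the `__main__` block above: the wildcard self-import makes the module's test classes visible in `__main__` so that `unittest.main()` can discover them when the file is run directly. flake8 cannot see that use, which is exactly the `'pyspark.ml.tests.test_persistence.*' imported but unused` entry in the list at the top, hence the inline `# noqa: F401`. The `try`/`except ImportError` makes the XML-report runner an optional dependency. A self-contained sketch of the same structure (the test class and names here are illustrative, not from the Spark sources):

```python
import unittest


class ExampleTest(unittest.TestCase):
    # Illustrative test case; unittest.main() discovers it in __main__.
    def test_truth(self):
        self.assertTrue(True)


if __name__ == "__main__":
    try:
        # Use the XML-report runner when the optional dependency is installed.
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # Fall back to unittest's default text runner.
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
```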