9fcf0ea718
Disallow the use of unused imports: - Unnecessary increases the memory footprint of the application - Removes the imports that are required for the examples in the docstring from the file-scope to the example itself. This keeps the files itself clean, and gives a more complete example as it also includes the imports :) ``` fokkodriesprongFan spark % flake8 python | grep -i "imported but unused" python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused python/pyspark/rdd.py:21:1: F401 're' imported but unused python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused 
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but 
unused python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' 
imported but unused python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' 
imported but unused python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' 
imported but unused python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused 
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused 
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused ``` After: ``` fokkodriesprongFan spark % flake8 python | grep -i "imported but unused" fokkodriesprongFan spark % ``` ### What changes were proposed in this pull request? Removing unused imports from the Python files to keep everything nice and tidy. ### Why are the changes needed? Cleaning up of the imports that aren't used, and suppressing the imports that are used as references to other modules, preserving backward compatibility. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Adding the rule to the existing Flake8 checks. Closes #29121 from Fokko/SPARK-32319. Authored-by: Fokko Driesprong <fokko@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
341 lines
15 KiB
Python
341 lines
15 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from shutil import rmtree
|
|
import tempfile
|
|
import unittest
|
|
|
|
import numpy as np
|
|
|
|
from pyspark.ml.classification import FMClassifier, LogisticRegression, \
|
|
MultilayerPerceptronClassifier, OneVsRest
|
|
from pyspark.ml.clustering import DistributedLDAModel, KMeans, LocalLDAModel, LDA, LDAModel
|
|
from pyspark.ml.fpm import FPGrowth
|
|
from pyspark.ml.linalg import Matrices, Vectors
|
|
from pyspark.ml.recommendation import ALS
|
|
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
|
|
from pyspark.sql import Row
|
|
from pyspark.testing.mlutils import SparkSessionTestCase
|
|
|
|
|
|
class LogisticRegressionTest(SparkSessionTestCase):
    """Tests for LogisticRegression fits that use coefficient/intercept bounds."""

    def test_binomial_logistic_regression_with_bound(self):
        """Fit a weighted binomial model with bounds and compare to fixed reference values."""
        rows = [
            (1.0, 1.0, Vectors.dense(0.0, 5.0)),
            (0.0, 2.0, Vectors.dense(1.0, 2.0)),
            (1.0, 3.0, Vectors.dense(2.0, 1.0)),
            (0.0, 4.0, Vectors.dense(3.0, 3.0)),
        ]
        df = self.spark.createDataFrame(rows, ["label", "weight", "features"])

        estimator = LogisticRegression(
            regParam=0.01,
            weightCol="weight",
            lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
            upperBoundsOnIntercepts=Vectors.dense(0.0))
        fitted = estimator.fit(df)

        coefficients_ok = np.allclose(
            fitted.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4)
        self.assertTrue(coefficients_ok)
        intercept_ok = np.isclose(fitted.intercept, 0.0, atol=1E-4)
        self.assertTrue(intercept_ok)

    def test_multinomial_logistic_regression_with_bound(self):
        """Fit a bounded multinomial model on the libsvm sample and compare row by row."""
        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        estimator = LogisticRegression(
            regParam=0.01,
            lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
            upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        fitted = estimator.fit(df)

        expected = [[4.593, 4.5516, 9.0099, 12.2904],
                    [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        # One assertion per coefficient row, in row order.
        for row_index, expected_row in enumerate(expected):
            actual_row = fitted.coefficientMatrix.toArray()[row_index]
            self.assertTrue(np.allclose(actual_row, expected_row, atol=1E-4))

        intercepts_ok = np.allclose(
            fitted.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)
        self.assertTrue(intercepts_ok)
|
|
|
|
|
|
class MultilayerPerceptronClassifierTest(SparkSessionTestCase):
    """Tests for the prediction columns produced by MultilayerPerceptronClassifier."""

    def test_raw_and_probability_prediction(self):
        """Train on the libsvm sample and check prediction, probability and rawPrediction.

        A single hand-built feature vector is transformed by the fitted model and
        the resulting row is compared against fixed reference values.
        """
        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                             blockSize=128, seed=123)
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045]
        # assertEqual, not assertTrue: assertTrue(value, msg) would take the expected
        # value as the failure message and pass for any truthy prediction.
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
        # The raw prediction is compared with a deliberately loose tolerance (atol=1).
        self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1))
|
|
|
|
|
|
class OneVsRestTests(SparkSessionTestCase):
    """Tests for OneVsRest: copying, output columns, parallelism and weight columns."""

    def test_copy(self):
        """copy() with a param map overrides the copy only, for estimator and model."""
        rows = [(0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5))]
        df = self.spark.createDataFrame(rows, ["label", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)

        ovr_copy = ovr.copy({lr.maxIter: 10})
        # The original keeps maxIter=5; only the copy sees the override.
        self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
        self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)

        fitted = ovr.fit(df)
        fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
        self.assertEqual(fitted_copy.getPredictionCol(), "indexed")

    def test_output_columns(self):
        """transform() yields exactly the input columns plus rawPrediction and prediction."""
        rows = [(0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5))]
        df = self.spark.createDataFrame(rows, ["label", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr, parallelism=1)

        transformed = ovr.fit(df).transform(df)
        self.assertEqual(
            transformed.columns, ["label", "features", "rawPrediction", "prediction"])

    def test_parallelism_doesnt_change_output(self):
        """Models fitted with parallelism=1 and parallelism=2 agree within tolerance."""
        rows = [(0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5))]
        df = self.spark.createDataFrame(rows, ["label", "features"])

        ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
        modelPar1 = ovrPar1.fit(df)
        ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
        modelPar2 = ovrPar2.fit(df)

        # Compare the per-class sub-models position by position.
        for idx, sub_model in enumerate(modelPar1.models):
            counterpart = modelPar2.models[idx]
            same_coefficients = np.allclose(
                sub_model.coefficients.toArray(), counterpart.coefficients.toArray(), atol=1E-4)
            self.assertTrue(same_coefficients)
            same_intercept = np.allclose(sub_model.intercept, counterpart.intercept, atol=1E-4)
            self.assertTrue(same_intercept)

    def test_support_for_weightCol(self):
        """fit() returns a model when weightCol is set, for two different base classifiers."""
        rows = [(0.0, Vectors.dense(1.0, 0.8), 1.0),
                (1.0, Vectors.sparse(2, [], []), 1.0),
                (2.0, Vectors.dense(0.5, 0.5), 1.0)]
        df = self.spark.createDataFrame(rows, ["label", "features", "weight"])

        # classifier inherits hasWeightCol
        weighted_base = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=weighted_base, weightCol="weight")
        self.assertIsNotNone(ovr.fit(df))

        # classifier doesn't inherit hasWeightCol (per the original note on this case)
        unweighted_base = FMClassifier()
        ovr2 = OneVsRest(classifier=unweighted_base, weightCol="weight")
        self.assertIsNotNone(ovr2.fit(df))
|
|
|
|
|
|
class KMeansTests(SparkSessionTestCase):
    """Tests for KMeans with a non-default distance measure."""

    def test_kmeans_cosine_distance(self):
        """With cosine distance, each consecutive pair of input rows shares a cluster."""
        points = [
            [1.0, 1.0], [10.0, 10.0],
            [1.0, 0.5], [10.0, 4.4],
            [-1.0, 1.0], [-100.0, 90.0],
        ]
        data = [(Vectors.dense(point),) for point in points]
        df = self.spark.createDataFrame(data, ["features"])

        estimator = KMeans(k=3, seed=1, distanceMeasure="cosine")
        result = estimator.fit(df).transform(df).collect()

        # Rows (0, 1), (2, 3) and (4, 5) must each land in the same cluster.
        for first, second in ((0, 1), (2, 3), (4, 5)):
            self.assertTrue(result[first].prediction == result[second].prediction)
|
|
|
|
|
|
class LDATest(SparkSessionTestCase):
    """Save/load round-trip tests for LDA and its local and distributed models."""

    def _compare(self, m1, m2):
        """
        Temp method for comparing instances.
        TODO: Replace with generic implementation once SPARK-14706 is merged.

        Asserts that ``m1`` and ``m2`` share uid, type and number of params, and
        that every param defined on ``m1`` has the same value and parent on ``m2``.
        When ``m1`` is an ``LDAModel`` the vocabulary size and topics matrix are
        compared as well.
        """
        self.assertEqual(m1.uid, m2.uid)
        self.assertEqual(type(m1), type(m2))
        self.assertEqual(len(m1.params), len(m2.params))
        for p in m1.params:
            if m1.isDefined(p):
                self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p))
                self.assertEqual(p.parent, m2.getParam(p.name).parent)
        if isinstance(m1, LDAModel):
            self.assertEqual(m1.vocabSize(), m2.vocabSize())
            self.assertEqual(m1.topicsMatrix(), m2.topicsMatrix())

    def test_persistence(self):
        """Test save/load for LDA, LocalLDAModel, DistributedLDAModel."""
        df = self.spark.createDataFrame([
            [1, Vectors.dense([0.0, 1.0])],
            [2, Vectors.sparse(2, {0: 1.0})],
        ], ["id", "features"])
        # Fit model
        lda = LDA(k=2, seed=1, optimizer="em")
        distributedModel = lda.fit(df)
        self.assertTrue(distributedModel.isDistributed())
        localModel = distributedModel.toLocal()
        self.assertFalse(localModel.isDistributed())
        # Define paths
        path = tempfile.mkdtemp()
        try:
            lda_path = path + "/lda"
            dist_model_path = path + "/distLDAModel"
            local_model_path = path + "/localLDAModel"
            # Test LDA
            lda.save(lda_path)
            lda2 = LDA.load(lda_path)
            self._compare(lda, lda2)
            # Test DistributedLDAModel
            distributedModel.save(dist_model_path)
            distributedModel2 = DistributedLDAModel.load(dist_model_path)
            self._compare(distributedModel, distributedModel2)
            # Test LocalLDAModel
            localModel.save(local_model_path)
            localModel2 = LocalLDAModel.load(local_model_path)
            self._compare(localModel, localModel2)
        finally:
            # Clean up even when a save/load/compare step fails, so a failing run
            # does not leave the temp directory behind. Removal stays best-effort.
            rmtree(path, ignore_errors=True)
|
|
|
|
|
|
class FPGrowthTests(SparkSessionTestCase):
    """Tests for the association rules and frequent itemsets of a default FPGrowth fit."""

    def setUp(self):
        """Create the four-transaction ``items`` DataFrame used by every test."""
        super(FPGrowthTests, self).setUp()
        transactions = [([1, 2], ), ([1, 2], ), ([1, 2, 3], ), ([1, 3], )]
        self.data = self.spark.createDataFrame(transactions, ["items"])

    def tearDown(self):
        """Drop the per-test DataFrame reference."""
        del self.data

    def test_association_rules(self):
        """associationRules holds exactly the expected rows (checked in both directions)."""
        fitted = FPGrowth().fit(self.data)

        expected_rules = self.spark.createDataFrame(
            [([3], [1], 1.0, 1.0, 0.5), ([2], [1], 1.0, 1.0, 0.75)],
            ["antecedent", "consequent", "confidence", "lift", "support"])
        actual_rules = fitted.associationRules

        # No unexpected rows, then no missing rows.
        self.assertEqual(actual_rules.subtract(expected_rules).count(), 0)
        self.assertEqual(expected_rules.subtract(actual_rules).count(), 0)

    def test_freq_itemsets(self):
        """freqItemsets holds exactly the expected rows (checked in both directions)."""
        fitted = FPGrowth().fit(self.data)

        expected_itemsets = self.spark.createDataFrame(
            [([1], 4), ([2], 3), ([2, 1], 3), ([3], 2), ([3, 1], 2)],
            ["items", "freq"])
        actual_itemsets = fitted.freqItemsets

        # No unexpected rows, then no missing rows.
        self.assertEqual(actual_itemsets.subtract(expected_itemsets).count(), 0)
        self.assertEqual(expected_itemsets.subtract(actual_itemsets).count(), 0)
|
|
|
|
|
|
class ALSTest(SparkSessionTestCase):
    """Tests for the storage-level params of ALS."""

    def test_storage_levels(self):
        """Storage levels read the same from the Python wrapper and from ``_java_obj``."""
        ratings = [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)]
        df = self.spark.createDataFrame(ratings, ["user", "item", "rating"])
        als = ALS().setMaxIter(1).setRank(1)

        def assert_levels(intermediate, final):
            # Check the Python getter first, then the underlying Java object.
            self.assertEqual(als.getIntermediateStorageLevel(), intermediate)
            self.assertEqual(als._java_obj.getIntermediateStorageLevel(), intermediate)
            self.assertEqual(als.getFinalStorageLevel(), final)
            self.assertEqual(als._java_obj.getFinalStorageLevel(), final)

        # test default params
        als.fit(df)
        assert_levels("MEMORY_AND_DISK", "MEMORY_AND_DISK")

        # test non-default params
        als.setIntermediateStorageLevel("MEMORY_ONLY_2")
        als.setFinalStorageLevel("DISK_ONLY")
        als.fit(df)
        assert_levels("MEMORY_ONLY_2", "DISK_ONLY")
|
|
|
|
|
|
class GeneralizedLinearRegressionTest(SparkSessionTestCase):
    """Tests for GeneralizedLinearRegression fits against fixed reference values."""

    def test_tweedie_distribution(self):
        """Tweedie family fit, first with the default link power and then with linkPower=-1.0."""
        rows = [
            (1.0, Vectors.dense(0.0, 0.0)),
            (1.0, Vectors.dense(1.0, 2.0)),
            (2.0, Vectors.dense(0.0, 0.0)),
            (2.0, Vectors.dense(1.0, 1.0)),
        ]
        df = self.spark.createDataFrame(rows, ["label", "features"])

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
        first_fit = glr.fit(df)
        first_coefficients = first_fit.coefficients.toArray()
        self.assertTrue(np.allclose(first_coefficients, [-0.4645, 0.3402], atol=1E-4))
        self.assertTrue(np.isclose(first_fit.intercept, 0.7841, atol=1E-4))

        # Refit the same estimator after changing only the link power.
        second_fit = glr.setLinkPower(-1.0).fit(df)
        second_coefficients = second_fit.coefficients.toArray()
        self.assertTrue(np.allclose(second_coefficients, [-0.6667, 0.5], atol=1E-4))
        self.assertTrue(np.isclose(second_fit.intercept, 0.6667, atol=1E-4))

    def test_offset(self):
        """Poisson family fit that uses both a weight column and an offset column."""
        rows = [
            (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
            (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
            (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
            (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)),
        ]
        df = self.spark.createDataFrame(rows, ["label", "weight", "offset", "features"])

        glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
        fitted = glr.fit(df)
        coefficients_ok = np.allclose(
            fitted.coefficients.toArray(), [0.664647, -0.3192581], atol=1E-4)
        self.assertTrue(coefficients_ok)
        self.assertTrue(np.isclose(fitted.intercept, -1.561613, atol=1E-4))
|
|
|
|
|
|
class LinearRegressionTest(SparkSessionTestCase):
    """Tests for LinearRegression fits against fixed reference values."""

    def test_linear_regression_with_huber_loss(self):
        """Huber-loss fit on the libsvm sample: check coefficients, intercept and scale."""
        data_path = "data/mllib/sample_linear_regression_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        fitted = LinearRegression(loss="huber", epsilon=2.0).fit(df)

        expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
                                1.2612, -0.333, -0.5694, -0.6311, 0.6053]
        expectedIntercept = 0.1607
        expectedScale = 9.758

        coefficients_ok = np.allclose(
            fitted.coefficients.toArray(), expectedCoefficients, atol=1E-3)
        self.assertTrue(coefficients_ok)
        intercept_ok = np.isclose(fitted.intercept, expectedIntercept, atol=1E-3)
        self.assertTrue(intercept_ok)
        scale_ok = np.isclose(fitted.scale, expectedScale, atol=1E-3)
        self.assertTrue(scale_ok)
|
|
|
|
|
|
if __name__ == "__main__":
    # Star import of this module's own tests; the noqa silences flake8's unused-import rule.
    from pyspark.ml.tests.test_algorithms import *  # noqa: F401

    # Use the XML-reporting runner when xmlrunner is importable; otherwise pass None
    # and let unittest pick its default runner.
    try:
        import xmlrunner
        runner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        runner = None
    unittest.main(testRunner=runner, verbosity=2)
|