9fcf0ea718
Disallow the use of unused imports: - Unnecessarily increases the memory footprint of the application - Moves the imports that are required for the examples in the docstring from the file-scope into the example itself. This keeps the files themselves clean, and gives a more complete example as it also includes the imports :) ``` fokkodriesprongFan spark % flake8 python | grep -i "imported but unused" python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused python/pyspark/rdd.py:21:1: F401 're' imported but unused python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused 
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but 
unused python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' 
imported but unused python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' 
imported but unused python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' 
imported but unused python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused 
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused 
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused ``` After: ``` fokkodriesprongFan spark % flake8 python | grep -i "imported but unused" fokkodriesprongFan spark % ``` ### What changes were proposed in this pull request? Removing unused imports from the Python files to keep everything nice and tidy. ### Why are the changes needed? Cleaning up of the imports that aren't used, and suppressing the imports that are used as references to other modules, preserving backward compatibility. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Adding the rule to the existing Flake8 checks. Closes #29121 from Fokko/SPARK-32319. Authored-by: Fokko Driesprong <fokko@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
303 lines
13 KiB
Python
303 lines
13 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import os
|
|
import tempfile
|
|
from shutil import rmtree
|
|
import unittest
|
|
|
|
from numpy import array, array_equal
|
|
from py4j.protocol import Py4JJavaError
|
|
|
|
from pyspark.mllib.fpm import FPGrowth
|
|
from pyspark.mllib.recommendation import Rating
|
|
from pyspark.mllib.regression import LabeledPoint
|
|
from pyspark.serializers import PickleSerializer
|
|
from pyspark.testing.mllibutils import MLlibTestCase
|
|
|
|
|
|
class ListTests(MLlibTestCase):

    """
    Test MLlib algorithms on plain lists, to make sure they're passed through
    as NumPy arrays.
    """

    def _assert_alternating_sign_predictions(self, model, features):
        """
        Assert that `model` predicts a value <= 0 for features[0] and features[2]
        and a value > 0 for features[1] and features[3].

        Uses assertLessEqual / assertGreater so that a failure reports the
        offending prediction instead of a bare "False is not true".
        """
        self.assertLessEqual(model.predict(features[0]), 0)
        self.assertGreater(model.predict(features[1]), 0)
        self.assertLessEqual(model.predict(features[2]), 0)
        self.assertGreater(model.predict(features[3]), 0)

    def test_bisecting_kmeans(self):
        # Predicting / costing a single local point must agree with doing the
        # same on a one-element RDD holding that point.
        from pyspark.mllib.clustering import BisectingKMeans
        data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
        bskm = BisectingKMeans()
        model = bskm.train(self.sc.parallelize(data, 2), k=4)
        p = array([0.0, 0.0])
        rdd_p = self.sc.parallelize([p])
        self.assertEqual(model.predict(p), model.predict(rdd_p).first())
        self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
        self.assertEqual(model.k, len(model.clusterCenters))

    def test_kmeans(self):
        # Train on plain nested lists; points 0/1 and points 2/3 must land in
        # the same cluster respectively.
        from pyspark.mllib.clustering import KMeans
        data = [
            [0, 1.1],
            [0, 1.2],
            [1.1, 0],
            [1.2, 0],
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||",
                                initializationSteps=7, epsilon=1e-4)
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))

    def test_kmeans_deterministic(self):
        # Two runs with the same seed must produce identical cluster centers.
        from pyspark.mllib.clustering import KMeans
        X = range(0, 100, 10)
        Y = range(0, 100, 10)
        data = [[x, y] for x, y in zip(X, Y)]
        clusters1 = KMeans.train(self.sc.parallelize(data),
                                 3, initializationMode="k-means||",
                                 seed=42, initializationSteps=7, epsilon=1e-4)
        clusters2 = KMeans.train(self.sc.parallelize(data),
                                 3, initializationMode="k-means||",
                                 seed=42, initializationSteps=7, epsilon=1e-4)
        centers1 = clusters1.centers
        centers2 = clusters2.centers
        for c1, c2 in zip(centers1, centers2):
            # TODO: Allow small numeric difference.
            self.assertTrue(array_equal(c1, c2))

    def test_gmm(self):
        # Points 0/1 and points 2/3 must receive the same label respectively.
        from pyspark.mllib.clustering import GaussianMixture
        data = self.sc.parallelize([
            [1, 2],
            [8, 9],
            [-4, -3],
            [-6, -7],
        ])
        clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                         maxIterations=10, seed=1)
        labels = clusters.predict(data).collect()
        self.assertEqual(labels[0], labels[1])
        self.assertEqual(labels[2], labels[3])

    def test_gmm_deterministic(self):
        # Two runs with the same seed must yield the same weights (compared
        # after rounding to 7 decimal places).
        from pyspark.mllib.clustering import GaussianMixture
        x = range(0, 100, 10)
        y = range(0, 100, 10)
        data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
        clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                          maxIterations=10, seed=63)
        clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                          maxIterations=10, seed=63)
        for c1, c2 in zip(clusters1.weights, clusters2.weights):
            self.assertEqual(round(c1, 7), round(c2, 7))

    def test_gmm_with_initial_model(self):
        # Re-training with the first model passed as initialModel must leave
        # the summed weight difference at (almost) zero.
        from pyspark.mllib.clustering import GaussianMixture
        data = self.sc.parallelize([
            (-10, -5), (-9, -4), (10, 5), (9, 4)
        ])

        gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                     maxIterations=10, seed=63)
        gmm2 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                     maxIterations=10, seed=63, initialModel=gmm1)
        self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0)

    def test_classification(self):
        # Train each classifier on LabeledPoints built from plain lists, check
        # the sign of its predictions, and round-trip the tree models via
        # save/load.
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()
        # Register the removal right away so the directory is deleted even when
        # an assertion or training call below fails; errors during removal are
        # ignored, as before.
        self.addCleanup(rmtree, temp_dir, ignore_errors=True)

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self._assert_alternating_sign_predictions(lr_model, features)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self._assert_alternating_sign_predictions(svm_model, features)

        nb_model = NaiveBayes.train(rdd)
        self._assert_alternating_sign_predictions(nb_model, features)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self._assert_alternating_sign_predictions(dt_model, features)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self._assert_alternating_sign_predictions(rf_model, features)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self._assert_alternating_sign_predictions(gbt_model, features)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    def test_regression(self):
        # Train each regressor on LabeledPoints built from plain lists and
        # check the sign of its predictions.
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self._assert_alternating_sign_predictions(lr_model, features)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self._assert_alternating_sign_predictions(lasso_model, features)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self._assert_alternating_sign_predictions(rr_model, features)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self._assert_alternating_sign_predictions(dt_model, features)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self._assert_alternating_sign_predictions(rf_model, features)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self._assert_alternating_sign_predictions(gbt_model, features)

        # Passing initialWeights as a NumPy array must not raise ValueError.
        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError as e:
            self.fail("training with NumPy initialWeights raised ValueError: %s" % e)

        # Verify that maxBins is being passed through
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
        with self.assertRaises(Exception):
            GradientBoostedTrees.trainRegressor(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
|
|
|
|
|
|
class ALSTests(MLlibTestCase):

    def test_als_ratings_serialize(self):
        # Send a Rating to the JVM-side SerDe and back, then compare the
        # fields of the original with the round-tripped copy.
        pickler = PickleSerializer()
        original = Rating(7, 1123, 3.14)
        serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
        java_rating = serde.loads(bytearray(pickler.dumps(original)))
        restored = pickler.loads(bytes(serde.dumps(java_rating)))
        self.assertEqual(original.user, restored.user)
        self.assertEqual(original.product, restored.product)
        # The rating is only compared to 2 decimal places.
        self.assertAlmostEqual(original.rating, restored.rating, places=2)

    def test_als_ratings_id_long_error(self):
        pickler = PickleSerializer()
        oversized = Rating(1205640308657491975, 50233468418, 1.0)
        # rating user id exceeds max int value, should fail when pickled
        jvm_loads = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads
        # Build the payload outside the assertRaises scope so that only the
        # JVM-side load is expected to raise.
        payload = bytearray(pickler.dumps(oversized))
        with self.assertRaises(Py4JJavaError):
            jvm_loads(payload)
|
|
|
|
|
|
class FPGrowthTest(MLlibTestCase):

    def test_fpgrowth(self):
        # Training with and without an explicit numPartitions argument must
        # produce the same frequent itemsets.
        transactions = self.sc.parallelize(
            [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]], 2)
        explicit_model = FPGrowth.train(transactions, 0.6, 2)
        # use default data partition number when numPartitions is not specified
        default_model = FPGrowth.train(transactions, 0.6)
        explicit_itemsets = sorted(explicit_model.freqItemsets().collect())
        default_itemsets = sorted(default_model.freqItemsets().collect())
        self.assertEqual(explicit_itemsets, default_itemsets)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run this module's tests, writing XML reports when
    # the optional xmlrunner package can be imported.
    from pyspark.mllib.tests.test_algorithms import *  # noqa: F401

    try:
        import xmlrunner
        runner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # xmlrunner is not installed; None is passed through to unittest.main.
        runner = None
    unittest.main(testRunner=runner, verbosity=2)
|