d614967b0b
As described in [SPARK-2627](https://issues.apache.org/jira/browse/SPARK-2627), we'd like Python code to automatically be checked for PEP 8 compliance by Jenkins. This pull request aims to do that. Notes: * We may need to install [`pep8`](https://pypi.python.org/pypi/pep8) on the build server. * I'm expecting tests to fail now that PEP 8 compliance is being checked as part of the build. I'm fine with cleaning up any remaining PEP 8 violations as part of this pull request. * I did not understand why the RAT and scalastyle reports are saved to text files. I did the same for the PEP 8 check, but only so that the console output style can match those for the RAT and scalastyle checks. The PEP 8 report is removed right after the check is complete. * Updates to the ["Contributing to Spark"](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) guide will be submitted elsewhere, as I don't believe that text is part of the Spark repo. Author: Nicholas Chammas <nicholas.chammas@gmail.com> Author: nchammas <nicholas.chammas@gmail.com> Closes #1744 from nchammas/master and squashes the following commits: 274b238 [Nicholas Chammas] [SPARK-2627] [PySpark] minor indentation changes 983d963 [nchammas] Merge pull request #5 from apache/master 1db5314 [nchammas] Merge pull request #4 from apache/master 0e0245f [Nicholas Chammas] [SPARK-2627] undo erroneous whitespace fixes bf30942 [Nicholas Chammas] [SPARK-2627] PEP8: comment spacing 6db9a44 [nchammas] Merge pull request #3 from apache/master 7b4750e [Nicholas Chammas] merge upstream changes 91b7584 [Nicholas Chammas] [SPARK-2627] undo unnecessary line breaks 44e3e56 [Nicholas Chammas] [SPARK-2627] use tox.ini to exclude files b09fae2 [Nicholas Chammas] don't wrap comments unnecessarily bfb9f9f [Nicholas Chammas] [SPARK-2627] keep up with the PEP 8 fixes 9da347f [nchammas] Merge pull request #2 from apache/master aa5b4b5 [Nicholas Chammas] [SPARK-2627] follow Spark bash style for if blocks d0a83b9 [Nicholas 
Chammas] [SPARK-2627] check that pep8 downloaded fine dffb5dd [Nicholas Chammas] [SPARK-2627] download pep8 at runtime a1ce7ae [Nicholas Chammas] [SPARK-2627] space out test report sections 21da538 [Nicholas Chammas] [SPARK-2627] it's PEP 8, not PEP8 6f4900b [Nicholas Chammas] [SPARK-2627] more misc PEP 8 fixes fe57ed0 [Nicholas Chammas] removing merge conflict backups 9c01d4c [nchammas] Merge pull request #1 from apache/master 9a66cb0 [Nicholas Chammas] resolving merge conflicts a31ccc4 [Nicholas Chammas] [SPARK-2627] miscellaneous PEP 8 fixes beaa9ac [Nicholas Chammas] [SPARK-2627] fail check on non-zero status 723ed39 [Nicholas Chammas] always delete the report file 0541ebb [Nicholas Chammas] [SPARK-2627] call Python linter from run-tests 12440fa [Nicholas Chammas] [SPARK-2627] add Scala linter 61c07b9 [Nicholas Chammas] [SPARK-2627] add Python linter 75ad552 [Nicholas Chammas] make check output style consistent
335 lines
14 KiB
Python
335 lines
14 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
"""
|
|
Fuller unit tests for Python MLlib.
|
|
"""
|
|
|
|
from numpy import array, array_equal
|
|
import unittest
|
|
|
|
from pyspark.mllib._common import _convert_vector, _serialize_double_vector, \
|
|
_deserialize_double_vector, _dot, _squared_distance
|
|
from pyspark.mllib.linalg import SparseVector
|
|
from pyspark.mllib.regression import LabeledPoint
|
|
from pyspark.tests import PySparkTestCase
|
|
|
|
|
|
_have_scipy = False
|
|
try:
|
|
import scipy.sparse
|
|
_have_scipy = True
|
|
except:
|
|
# No SciPy, but that's okay, we'll skip those tests
|
|
pass
|
|
|
|
|
|
class VectorTests(unittest.TestCase):

    """
    Test the vector helper functions (conversion, serialization round trip,
    dot product and squared distance) on SparseVector, NumPy array and plain
    list inputs.
    """

    def test_serialize(self):
        # SparseVector and ndarray inputs must come back from _convert_vector
        # as the very same object; a list must convert to something that is
        # element-wise equal to the dense array. All three must survive a
        # serialize/deserialize round trip.
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = array([1., 2., 3., 4.])
        lst = [1, 2, 3, 4]
        self.assertIs(sv, _convert_vector(sv))
        self.assertIs(dv, _convert_vector(dv))
        self.assertTrue(array_equal(dv, _convert_vector(lst)))
        self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(sv)))
        self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv))))
        self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst))))

    def test_dot(self):
        # Dot product of each vector kind against a dense vector (scalar
        # result) and against a 4x4 dense matrix (array result).
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = array([1., 2., 3., 4.])
        lst = [1, 2, 3, 4]
        mat = array([[1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.]])
        self.assertEqual(10.0, _dot(sv, dv))
        self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(sv, mat)))
        self.assertEqual(30.0, _dot(dv, dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(dv, mat)))
        self.assertEqual(30.0, _dot(lst, dv))
        self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(lst, mat)))

    def test_squared_distance(self):
        # Every ordered pairing of sparse / dense / list inputs, including
        # each input against itself (distance 0).
        sv = SparseVector(4, {1: 1, 3: 2})
        dv = array([1., 2., 3., 4.])
        lst = [4, 3, 2, 1]
        self.assertEqual(15.0, _squared_distance(sv, dv))
        self.assertEqual(25.0, _squared_distance(sv, lst))
        self.assertEqual(20.0, _squared_distance(dv, lst))
        self.assertEqual(15.0, _squared_distance(dv, sv))
        self.assertEqual(25.0, _squared_distance(lst, sv))
        self.assertEqual(20.0, _squared_distance(lst, dv))
        self.assertEqual(0.0, _squared_distance(sv, sv))
        self.assertEqual(0.0, _squared_distance(dv, dv))
        self.assertEqual(0.0, _squared_distance(lst, lst))
|
|
|
|
|
|
class ListTests(PySparkTestCase):

    """
    Test MLlib algorithms on plain lists, to make sure they're passed through
    as NumPy arrays.
    """

    def test_clustering(self):
        # Two well-separated pairs of points: each pair must be assigned to
        # the same cluster.
        # NOTE(review): self.sc is presumably the SparkContext provided by
        # PySparkTestCase -- confirm against that base class.
        from pyspark.mllib.clustering import KMeans
        data = [
            [0, 1.1],
            [0, 1.2],
            [1.1, 0],
            [1.2, 0],
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))

    def test_classification(self):
        # Train each classifier on four labeled points (labels 0, 1, 0, 1) and
        # check that the prediction on every training point is <= 0 for the
        # points labeled 0.0 and > 0 for the points labeled 1.0.
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        # Predict on plain Python lists rather than on the stored features.
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertLessEqual(lr_model.predict(features[0]), 0)
        self.assertGreater(lr_model.predict(features[1]), 0)
        self.assertLessEqual(lr_model.predict(features[2]), 0)
        self.assertGreater(lr_model.predict(features[3]), 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertLessEqual(svm_model.predict(features[0]), 0)
        self.assertGreater(svm_model.predict(features[1]), 0)
        self.assertLessEqual(svm_model.predict(features[2]), 0)
        self.assertGreater(svm_model.predict(features[3]), 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertLessEqual(nb_model.predict(features[0]), 0)
        self.assertGreater(nb_model.predict(features[1]), 0)
        self.assertLessEqual(nb_model.predict(features[2]), 0)
        self.assertGreater(nb_model.predict(features[3]), 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertLessEqual(dt_model.predict(features[0]), 0)
        self.assertGreater(dt_model.predict(features[1]), 0)
        self.assertLessEqual(dt_model.predict(features[2]), 0)
        self.assertGreater(dt_model.predict(features[3]), 0)

    def test_regression(self):
        # Train each regressor on four labeled points (labels -1, 1, -1, 1)
        # and check that the prediction on every training point has the sign
        # of its label (<= 0 for -1.0, > 0 for 1.0).
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        # Predict on plain Python lists rather than on the stored features.
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertLessEqual(lr_model.predict(features[0]), 0)
        self.assertGreater(lr_model.predict(features[1]), 0)
        self.assertLessEqual(lr_model.predict(features[2]), 0)
        self.assertGreater(lr_model.predict(features[3]), 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertLessEqual(lasso_model.predict(features[0]), 0)
        self.assertGreater(lasso_model.predict(features[1]), 0)
        self.assertLessEqual(lasso_model.predict(features[2]), 0)
        self.assertGreater(lasso_model.predict(features[3]), 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertLessEqual(rr_model.predict(features[0]), 0)
        self.assertGreater(rr_model.predict(features[1]), 0)
        self.assertLessEqual(rr_model.predict(features[2]), 0)
        self.assertGreater(rr_model.predict(features[3]), 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertLessEqual(dt_model.predict(features[0]), 0)
        self.assertGreater(dt_model.predict(features[1]), 0)
        self.assertLessEqual(dt_model.predict(features[2]), 0)
        self.assertGreater(dt_model.predict(features[3]), 0)
|
|
|
|
|
|
@unittest.skipIf(not _have_scipy, "SciPy not installed")
class SciPyTests(PySparkTestCase):

    """
    Test both vector operations and MLlib algorithms with SciPy sparse matrices,
    if SciPy is available.
    """

    def test_serialize(self):
        # The same column vector in several SciPy sparse formats must convert
        # to, and round-trip through serialization as, the equivalent
        # SparseVector.
        lil = self.scipy_matrix(4, {1: 1, 3: 2})
        sv = SparseVector(4, {1: 1, 3: 2})
        self.assertEqual(sv, _convert_vector(lil))
        self.assertEqual(sv, _convert_vector(lil.tocsc()))
        self.assertEqual(sv, _convert_vector(lil.tocoo()))
        self.assertEqual(sv, _convert_vector(lil.tocsr()))
        self.assertEqual(sv, _convert_vector(lil.todok()))
        self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil)))
        self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc())))
        self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr())))
        self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok())))

    def test_dot(self):
        # Dot product of a SciPy column vector with a dense vector (scalar
        # result) and with a 4x4 dense matrix (array result).
        lil = self.scipy_matrix(4, {1: 1, 3: 2})
        dv = array([1., 2., 3., 4.])
        mat = array([[1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.],
                     [1., 2., 3., 4.]])
        self.assertEqual(10.0, _dot(lil, dv))
        self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(lil, mat)))

    def test_squared_distance(self):
        # Squared distance between a SciPy column vector and dense / sparse
        # vectors, with the arguments in both orders.
        lil = self.scipy_matrix(4, {1: 3, 3: 2})
        dv = array([1., 2., 3., 4.])
        sv = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
        self.assertEqual(15.0, _squared_distance(lil, dv))
        self.assertEqual(15.0, _squared_distance(lil, sv))
        self.assertEqual(15.0, _squared_distance(dv, lil))
        self.assertEqual(15.0, _squared_distance(sv, lil))

    def scipy_matrix(self, size, values):
        """
        Create a column SciPy matrix from a dictionary of values.

        :param size: number of rows of the (size x 1) matrix
        :param values: dict mapping a row index to the value stored there
        :return: a scipy.sparse.lil_matrix with the given entries set
        """
        from scipy.sparse import lil_matrix
        lil = lil_matrix((size, 1))
        for key, value in values.items():
            lil[key, 0] = value
        return lil

    def test_clustering(self):
        # Two pairs of points that differ only slightly within a pair: each
        # pair must be assigned to the same cluster.
        # NOTE(review): self.sc is presumably the SparkContext provided by
        # PySparkTestCase -- confirm against that base class.
        from pyspark.mllib.clustering import KMeans
        data = [
            self.scipy_matrix(3, {1: 1.0}),
            self.scipy_matrix(3, {1: 1.1}),
            self.scipy_matrix(3, {2: 1.0}),
            self.scipy_matrix(3, {2: 1.1})
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))

    def test_classification(self):
        # Train each classifier on four labeled points (labels 0, 1, 0, 1)
        # built from SciPy matrices and check that the prediction on every
        # training point is <= 0 for label 0.0 and > 0 for label 1.0.
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertLessEqual(lr_model.predict(features[0]), 0)
        self.assertGreater(lr_model.predict(features[1]), 0)
        self.assertLessEqual(lr_model.predict(features[2]), 0)
        self.assertGreater(lr_model.predict(features[3]), 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertLessEqual(svm_model.predict(features[0]), 0)
        self.assertGreater(svm_model.predict(features[1]), 0)
        self.assertLessEqual(svm_model.predict(features[2]), 0)
        self.assertGreater(svm_model.predict(features[3]), 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertLessEqual(nb_model.predict(features[0]), 0)
        self.assertGreater(nb_model.predict(features[1]), 0)
        self.assertLessEqual(nb_model.predict(features[2]), 0)
        self.assertGreater(nb_model.predict(features[3]), 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertLessEqual(dt_model.predict(features[0]), 0)
        self.assertGreater(dt_model.predict(features[1]), 0)
        self.assertLessEqual(dt_model.predict(features[2]), 0)
        self.assertGreater(dt_model.predict(features[3]), 0)

    def test_regression(self):
        # Train each regressor on four labeled points (labels -1, 1, -1, 1)
        # built from SciPy matrices and check that the prediction on every
        # training point has the sign of its label.
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertLessEqual(lr_model.predict(features[0]), 0)
        self.assertGreater(lr_model.predict(features[1]), 0)
        self.assertLessEqual(lr_model.predict(features[2]), 0)
        self.assertGreater(lr_model.predict(features[3]), 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertLessEqual(lasso_model.predict(features[0]), 0)
        self.assertGreater(lasso_model.predict(features[1]), 0)
        self.assertLessEqual(lasso_model.predict(features[2]), 0)
        self.assertGreater(lasso_model.predict(features[3]), 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertLessEqual(rr_model.predict(features[0]), 0)
        self.assertGreater(rr_model.predict(features[1]), 0)
        self.assertLessEqual(rr_model.predict(features[2]), 0)
        self.assertGreater(rr_model.predict(features[3]), 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertLessEqual(dt_model.predict(features[0]), 0)
        self.assertGreater(dt_model.predict(features[1]), 0)
        self.assertLessEqual(dt_model.predict(features[2]), 0)
        self.assertGreater(dt_model.predict(features[3]), 0)
|
|
|
|
|
|
# Script entry point: run every test in this module, telling the user before
# and after the run when the SciPy tests are being left out.
if __name__ == "__main__":
    if not _have_scipy:
        print("NOTE: Skipping SciPy tests as it does not seem to be installed")
    try:
        # unittest.main() finishes by raising SystemExit, so anything placed
        # after it as a plain statement would never execute.
        unittest.main()
    finally:
        # Runs while SystemExit propagates, so the reminder is still printed
        # and the process exit status set by unittest is preserved.
        if not _have_scipy:
            print("NOTE: SciPy tests were skipped as it does not seem to be installed")
|