d614967b0b
As described in [SPARK-2627](https://issues.apache.org/jira/browse/SPARK-2627), we'd like Python code to automatically be checked for PEP 8 compliance by Jenkins. This pull request aims to do that.

Notes:

* We may need to install [`pep8`](https://pypi.python.org/pypi/pep8) on the build server.
* I'm expecting tests to fail now that PEP 8 compliance is being checked as part of the build. I'm fine with cleaning up any remaining PEP 8 violations as part of this pull request.
* I did not understand why the RAT and scalastyle reports are saved to text files. I did the same for the PEP 8 check, but only so that the console output style can match those for the RAT and scalastyle checks. The PEP 8 report is removed right after the check is complete.
* Updates to the ["Contributing to Spark"](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) guide will be submitted elsewhere, as I don't believe that text is part of the Spark repo.

Author: Nicholas Chammas <nicholas.chammas@gmail.com>
Author: nchammas <nicholas.chammas@gmail.com>

Closes #1744 from nchammas/master and squashes the following commits:

274b238 [Nicholas Chammas] [SPARK-2627] [PySpark] minor indentation changes
983d963 [nchammas] Merge pull request #5 from apache/master
1db5314 [nchammas] Merge pull request #4 from apache/master
0e0245f [Nicholas Chammas] [SPARK-2627] undo erroneous whitespace fixes
bf30942 [Nicholas Chammas] [SPARK-2627] PEP8: comment spacing
6db9a44 [nchammas] Merge pull request #3 from apache/master
7b4750e [Nicholas Chammas] merge upstream changes
91b7584 [Nicholas Chammas] [SPARK-2627] undo unnecessary line breaks
44e3e56 [Nicholas Chammas] [SPARK-2627] use tox.ini to exclude files
b09fae2 [Nicholas Chammas] don't wrap comments unnecessarily
bfb9f9f [Nicholas Chammas] [SPARK-2627] keep up with the PEP 8 fixes
9da347f [nchammas] Merge pull request #2 from apache/master
aa5b4b5 [Nicholas Chammas] [SPARK-2627] follow Spark bash style for if blocks
d0a83b9 [Nicholas Chammas] [SPARK-2627] check that pep8 downloaded fine
dffb5dd [Nicholas Chammas] [SPARK-2627] download pep8 at runtime
a1ce7ae [Nicholas Chammas] [SPARK-2627] space out test report sections
21da538 [Nicholas Chammas] [SPARK-2627] it's PEP 8, not PEP8
6f4900b [Nicholas Chammas] [SPARK-2627] more misc PEP 8 fixes
fe57ed0 [Nicholas Chammas] removing merge conflict backups
9c01d4c [nchammas] Merge pull request #1 from apache/master
9a66cb0 [Nicholas Chammas] resolving merge conflicts
a31ccc4 [Nicholas Chammas] [SPARK-2627] miscellaneous PEP 8 fixes
beaa9ac [Nicholas Chammas] [SPARK-2627] fail check on non-zero status
723ed39 [Nicholas Chammas] always delete the report file
0541ebb [Nicholas Chammas] [SPARK-2627] call Python linter from run-tests
12440fa [Nicholas Chammas] [SPARK-2627] add Scala linter
61c07b9 [Nicholas Chammas] [SPARK-2627] add Python linter
75ad552 [Nicholas Chammas] make check output style consistent
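Per the commit list, the check itself is a shell script wired into `run-tests`; as a rough illustration only, here is a minimal Python sketch of such a check using the `pep8` package's `StyleGuide` API (the target path and line-length limit below are assumptions, not the project's actual settings):

    import sys

    import pep8  # may need to be installed (or fetched at runtime) on the build server

    # Check the Python tree; fail the build on any PEP 8 violation,
    # mirroring "fail check on non-zero status" above.
    style = pep8.StyleGuide(max_line_length=100)    # assumed limit
    report = style.check_files(["python/pyspark"])  # assumed target path
    if report.total_errors > 0:
        sys.exit(1)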
205 lines
8.1 KiB
Python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import warnings

import numpy as np

from pyspark.mllib.linalg import Vectors, SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib._common import _convert_vector, _deserialize_labeled_point
from pyspark.rdd import RDD
from pyspark.serializers import NoOpSerializer


class MLUtils:

    """
    Helper methods to load, save and pre-process data used in MLlib.
    """

    @staticmethod
    def _parse_libsvm_line(line, multiclass):
        # Deprecated signature; the definition below shadows this one,
        # so the multiclass argument is ignored.
        warnings.warn("deprecated", DeprecationWarning)
        return MLUtils._parse_libsvm_line(line)

    @staticmethod
    def _parse_libsvm_line(line):
        """
        Parses a line in LIBSVM format into (label, indices, values).
        """
        items = line.split(None)
        label = float(items[0])
        nnz = len(items) - 1
        indices = np.zeros(nnz, dtype=np.int32)
        values = np.zeros(nnz)
        for i in xrange(nnz):
            index, value = items[1 + i].split(":")
            indices[i] = int(index) - 1
            values[i] = float(value)
        return label, indices, values
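
    # Illustration (hypothetical input): the line "1.0 1:2.5 3:4.0" parses
    # to label 1.0, indices [0, 2] and values [2.5, 4.0] -- the one-based
    # LIBSVM indices become zero-based here.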

    @staticmethod
    def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        items = [str(p.label)]
        v = _convert_vector(p.features)
        if type(v) == np.ndarray:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        elif type(v) == SparseVector:
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            raise TypeError("_convert_labeled_point_to_libsvm needs either ndarray or SparseVector"
                            " but got %s" % type(v))
        return " ".join(items)

    @staticmethod
    def loadLibSVMFile(sc, path, multiclass=False, numFeatures=-1, minPartitions=None):
        # Deprecated signature; the definition below shadows this one,
        # so the multiclass argument is ignored.
        warnings.warn("deprecated", DeprecationWarning)
        return MLUtils.loadLibSVMFile(sc, path, numFeatures, minPartitions)

    @staticmethod
    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
        LIBSVM and LIBLINEAR. Each line represents a labeled sparse
        feature vector using the following format:

        label index1:value1 index2:value2 ...

        where the indices are one-based and in ascending order. This
        method parses each line into a LabeledPoint, where the feature
        indices are converted to zero-based.

        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
        @param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
                            already split into multiple files and you
                            want to load them separately, because some
                            features may not be present in certain files,
                            which leads to inconsistent feature
                            dimensions.
        @param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> type(examples[0]) == LabeledPoint
        True
        >>> print examples[0]
        (1.0,(6,[0,2,4],[1.0,2.0,3.0]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print examples[1]
        (-1.0,(6,[],[]))
        >>> type(examples[2]) == LabeledPoint
        True
        >>> print examples[2]
        (-1.0,(6,[1,3,5],[4.0,5.0,6.0]))
        """
        lines = sc.textFile(path, minPartitions)
        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            # Infer the feature dimension: cache the parsed data and take
            # one plus the largest zero-based index seen in any record.
            parsed.cache()
            numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
        return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

    @staticmethod
    def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        @param data: an RDD of LabeledPoint to be saved
        @param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir)

    @staticmethod
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
        @param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print loaded[0]
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(loaded[1]) == LabeledPoint
        True
        >>> print loaded[1]
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(sc._jsc, path, minPartitions)
        serialized = RDD(jSerialized, sc, NoOpSerializer())
        return serialized.map(lambda bytes: _deserialize_labeled_point(bytearray(bytes)))
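
    # Note: loadLabeledPoints parses the text form that saveAsTextFile
    # writes for a LabeledPoint, e.g. "(1.1,(3,[0,2],[-1.23,4.56e-07]))"
    # in the doctest above; the parsing happens on the JVM side via
    # PythonMLLibAPI, and the results come back as serialized bytes.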


def _test():
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        exit(-1)


if __name__ == "__main__":
    _test()