04e44b37cc
This PR update PySpark to support Python 3 (tested with 3.4). Known issue: unpickle array from Pyrolite is broken in Python 3, those tests are skipped. TODO: ec2/spark-ec2.py is not fully tested with python3. Author: Davies Liu <davies@databricks.com> Author: twneale <twneale@gmail.com> Author: Josh Rosen <joshrosen@databricks.com> Closes #5173 from davies/python3 and squashes the following commits: d7d6323 [Davies Liu] fix tests 6c52a98 [Davies Liu] fix mllib test 99e334f [Davies Liu] update timeout b716610 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 cafd5ec [Davies Liu] adddress comments from @mengxr bf225d7 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 179fc8d [Davies Liu] tuning flaky tests 8c8b957 [Davies Liu] fix ResourceWarning in Python 3 5c57c95 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 4006829 [Davies Liu] fix test 2fc0066 [Davies Liu] add python3 path 71535e9 [Davies Liu] fix xrange and divide 5a55ab4 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 125f12c [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 ed498c8 [Davies Liu] fix compatibility with python 3 820e649 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 e8ce8c9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 ad7c374 [Davies Liu] fix mllib test and warning ef1fc2f [Davies Liu] fix tests 4eee14a [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 20112ff [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 59bb492 [Davies Liu] fix tests 1da268c [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 ca0fdd3 [Davies Liu] fix code style 9563a15 [Davies Liu] add imap back for python 2 0b1ec04 [Davies Liu] make python examples work with Python 3 d2fd566 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 a716d34 [Davies 
Liu] test with python 3.4 f1700e8 [Davies Liu] fix test in python3 671b1db [Davies Liu] fix test in python3 692ff47 [Davies Liu] fix flaky test 7b9699f [Davies Liu] invalidate import cache for Python 3.3+ 9c58497 [Davies Liu] fix kill worker 309bfbf [Davies Liu] keep compatibility 5707476 [Davies Liu] cleanup, fix hash of string in 3.3+ 8662d5b [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3 f53e1f0 [Davies Liu] fix tests 70b6b73 [Davies Liu] compile ec2/spark_ec2.py in python 3 a39167e [Davies Liu] support customize class in __main__ 814c77b [Davies Liu] run unittests with python 3 7f4476e [Davies Liu] mllib tests passed d737924 [Davies Liu] pass ml tests 375ea17 [Davies Liu] SQL tests pass 6cc42a9 [Davies Liu] rename 431a8de [Davies Liu] streaming tests pass 78901a7 [Davies Liu] fix hash of serializer in Python 3 24b2f2e [Davies Liu] pass all RDD tests 35f48fe [Davies Liu] run future again 1eebac2 [Davies Liu] fix conflict in ec2/spark_ec2.py 6e3c21d [Davies Liu] make cloudpickle work with Python3 2fb2db3 [Josh Rosen] Guard more changes behind sys.version; still doesn't run 1aa5e8f [twneale] Turned out `pickle.DictionaryType is dict` == True, so swapped it out 7354371 [twneale] buffer --> memoryview I'm not super sure if this a valid change, but the 2.7 docs recommend using memoryview over buffer where possible, so hoping it'll work. b69ccdf [twneale] Uses the pure python pickle._Pickler instead of c-extension _pickle.Pickler. It appears pyspark 2.7 uses the pure python pickler as well, so this shouldn't degrade pickling performance (?). 
f40d925 [twneale] xrange --> range e104215 [twneale] Replaces 2.7 types.InstsanceType with 3.4 `object`....could be horribly wrong depending on how types.InstanceType is used elsewhere in the package--see http://bugs.python.org/issue8206 79de9d0 [twneale] Replaces python2.7 `file` with 3.4 _io.TextIOWrapper 2adb42d [Josh Rosen] Fix up some import differences between Python 2 and 3 854be27 [Josh Rosen] Run `futurize` on Python code: 7c5b4ce [Josh Rosen] Remove Python 3 check in shell.py.
275 lines
9.9 KiB
Python
275 lines
9.9 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import sys
|
|
import numpy as np
|
|
import warnings
|
|
|
|
if sys.version > '3':
|
|
xrange = range
|
|
|
|
from pyspark.mllib.common import callMLlibFunc, inherit_doc
|
|
from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
|
|
|
|
|
|
class MLUtils(object):

    """
    Helper methods to load, save and pre-process data used in MLlib.
    """

    @staticmethod
    def _parse_libsvm_line(line, multiclass=None):
        """
        Parse a single line of LIBSVM text into (label, indices, values).

        One-based LIBSVM feature indices are converted to zero-based
        positions. ``multiclass`` is accepted only for backward
        compatibility and is deprecated.
        """
        if multiclass is not None:
            warnings.warn("deprecated", DeprecationWarning)
        tokens = line.split(None)
        label = float(tokens[0])
        num_nonzeros = len(tokens) - 1
        indices = np.zeros(num_nonzeros, dtype=np.int32)
        values = np.zeros(num_nonzeros)
        for pos, token in enumerate(tokens[1:]):
            idx, val = token.split(":")
            indices[pos] = int(idx) - 1
            values[pos] = float(val)
        return label, indices, values

    @staticmethod
    def _convert_labeled_point_to_libsvm(p):
        """Converts a LabeledPoint to a string in LIBSVM format."""
        from pyspark.mllib.regression import LabeledPoint
        assert isinstance(p, LabeledPoint)
        parts = [str(p.label)]
        vec = _convert_to_vector(p.features)
        if isinstance(vec, SparseVector):
            # Sparse vector: emit only the stored entries, one-based.
            for pos in xrange(len(vec.indices)):
                parts.append(str(vec.indices[pos] + 1) + ":" + str(vec.values[pos]))
        else:
            # Dense vector: emit every component, one-based.
            for pos in xrange(len(vec)):
                parts.append(str(pos + 1) + ":" + str(vec[pos]))
        return " ".join(parts)

    @staticmethod
    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
        LIBSVM and LIBLINEAR. Each line represents a labeled sparse
        feature vector using the following format:

        label index1:value1 index2:value2 ...

        where the indices are one-based and in ascending order. This
        method parses each line into a LabeledPoint, where the feature
        indices are converted to zero-based.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
                            already split into multiple files and you
                            want to load them separately, because some
                            features may not present in certain files,
                            which leads to inconsistent feature
                            dimensions.
        :param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> examples[0]
        LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))
        >>> examples[1]
        LabeledPoint(-1.0, (6,[],[]))
        >>> examples[2]
        LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))
        """
        from pyspark.mllib.regression import LabeledPoint
        if multiclass is not None:
            warnings.warn("deprecated", DeprecationWarning)

        parsed = sc.textFile(path, minPartitions).map(
            lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            # Cache: the parsed RDD is traversed twice below (once to find
            # the largest feature index, once to build the LabeledPoints).
            parsed.cache()
            numFeatures = parsed.map(
                lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
        return parsed.map(
            lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

    @staticmethod
    def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        :param data: an RDD of LabeledPoint to be saved
        :param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        data.map(
            lambda p: MLUtils._convert_labeled_point_to_libsvm(p)).saveAsTextFile(dir)

    @staticmethod
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
        """
        # Default to a small partition count for these typically small files.
        num_parts = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, num_parts)
|
|
|
|
|
|
class Saveable(object):
    """
    Mixin for models and transformers which may be saved as files.
    """

    def save(self, sc, path):
        """
        Persist this model under the given path.

        The saved layout is:
         * human-readable (JSON) model metadata to path/metadata/
         * Parquet formatted data to path/data/

        A saved model can be restored with py:meth:`Loader.load`.

        :param sc: Spark context used to save model data.
        :param path: Path specifying the directory in which to save
                     this model. If the directory already exists,
                     this method throws an exception.
        """
        # Subclasses must provide the actual persistence logic.
        raise NotImplementedError
|
|
|
|
|
|
@inherit_doc
class JavaSaveable(Saveable):
    """
    Mixin for models that provide save() through their Scala
    implementation.
    """

    def save(self, sc, path):
        """Delegate saving to the wrapped Java/Scala model."""
        java_sc = sc._jsc.sc()
        self._java_model.save(java_sc, path)
|
|
|
|
|
|
class Loader(object):
    """
    Mixin for classes which can load saved models from files.
    """

    @classmethod
    def load(cls, sc, path):
        """
        Load a model from the given path. The model should have been
        saved using py:meth:`Saveable.save`.

        :param sc: Spark context used for loading model files.
        :param path: Path specifying the directory to which the model
                     was saved.
        :return: model instance
        :raise NotImplementedError: always; subclasses must override.
        """
        # Bug fix: the previous `raise NotImplemented` raised a TypeError at
        # runtime in Python 3 (NotImplemented is a constant for rich
        # comparisons, not an exception); NotImplementedError is intended.
        raise NotImplementedError
|
|
|
|
|
|
@inherit_doc
class JavaLoader(Loader):
    """
    Mixin for classes which can load saved models using its Scala
    implementation.
    """

    @classmethod
    def _java_loader_class(cls):
        """
        Return the fully qualified name of the Java loader class.

        By default the Python module path is mapped onto the JVM side by
        substituting "org.apache.spark" for "pyspark".
        """
        package = cls.__module__.replace("pyspark", "org.apache.spark")
        return package + "." + cls.__name__

    @classmethod
    def _load_java(cls, sc, path):
        """
        Resolve the Java loader object on the JVM and load a Java model
        from the given path.
        """
        # Walk the JVM package/class hierarchy one attribute at a time.
        target = sc._jvm
        for component in cls._java_loader_class().split("."):
            target = getattr(target, component)
        return target.load(sc._jsc.sc(), path)

    @classmethod
    def load(cls, sc, path):
        """Load a model by wrapping the Java model in this Python class."""
        return cls(cls._load_java(sc, path))
|
|
|
|
|
|
def _test():
    """
    Run this module's doctests against a local two-core SparkContext,
    exiting with a nonzero status if any example fails.
    """
    import doctest
    from pyspark.context import SparkContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    globs['sc'] = SparkContext('local[2]', 'PythonTest', batchSize=2)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        # Bug fix: use sys.exit instead of the bare exit() helper, which is
        # injected by the site module and may be absent (python -S, frozen
        # interpreters, some embedded environments).
        sys.exit(-1)
|
|
|
|
|
|
# Run the doctest suite when this file is executed directly.
if __name__ == "__main__":
    _test()
|