#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys

import numpy as np

if sys.version > '3':
    xrange = range
    basestring = str

from pyspark import SparkContext, since
from pyspark.mllib.common import callMLlibFunc, inherit_doc
from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
from pyspark.sql import DataFrame


class MLUtils(object):

    """
    Helper methods to load, save and pre-process data used in MLlib.

    .. versionadded:: 1.0.0
    """

    @staticmethod
    def _parse_libsvm_line(line):
        """
        Parses a line in LIBSVM format into (label, indices, values).
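
        A small illustrative doctest (added as a sketch; the input line is
        made up, and the point to note is the shift from one-based LIBSVM
        indices to zero-based array indices):

        >>> label, indices, values = MLUtils._parse_libsvm_line("1.0 1:1.5 3:2.5")
        >>> label
        1.0
        >>> indices.tolist()
        [0, 2]
        >>> values.tolist()
        [1.5, 2.5]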
        """
        items = line.split(None)
        label = float(items[0])
        nnz = len(items) - 1
        indices = np.zeros(nnz, dtype=np.int32)
        values = np.zeros(nnz)
        for i in xrange(nnz):
            index, value = items[1 + i].split(":")
            indices[i] = int(index) - 1
            values[i] = float(value)
        return label, indices, values

    @staticmethod
    def _convert_labeled_point_to_libsvm(p):
        """
        Converts a LabeledPoint to a string in LIBSVM format.
        """
        from pyspark.mllib.regression import LabeledPoint
        assert isinstance(p, LabeledPoint)
        items = [str(p.label)]
        v = _convert_to_vector(p.features)
        if isinstance(v, SparseVector):
            nnz = len(v.indices)
            for i in xrange(nnz):
                items.append(str(v.indices[i] + 1) + ":" + str(v.values[i]))
        else:
            for i in xrange(len(v)):
                items.append(str(i + 1) + ":" + str(v[i]))
        return " ".join(items)

    @staticmethod
    @since("1.0.0")
    def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
        """
        Loads labeled data in the LIBSVM format into an RDD of
        LabeledPoint. The LIBSVM format is a text-based format used by
        LIBSVM and LIBLINEAR. Each line represents a labeled sparse
        feature vector using the following format:

        label index1:value1 index2:value2 ...

        where the indices are one-based and in ascending order. This
        method parses each line into a LabeledPoint, where the feature
        indices are converted to zero-based.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param numFeatures: number of features, which will be determined
                            from the input data if a nonpositive value
                            is given. This is useful when the dataset is
                            already split into multiple files and you
                            want to load them separately, because some
                            features may not be present in certain files,
                            which leads to inconsistent feature
                            dimensions.
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
        >>> tempFile.flush()
        >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
        >>> tempFile.close()
        >>> examples[0]
        LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))
        >>> examples[1]
        LabeledPoint(-1.0, (6,[],[]))
        >>> examples[2]
        LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0]))
        """
        from pyspark.mllib.regression import LabeledPoint

        lines = sc.textFile(path, minPartitions)
        parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
        if numFeatures <= 0:
            parsed.cache()
            numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
        return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

    @staticmethod
    @since("1.0.0")
    def saveAsLibSVMFile(data, dir):
        """
        Save labeled data in LIBSVM format.

        :param data: an RDD of LabeledPoint to be saved
        :param dir: directory to save the data

        >>> from tempfile import NamedTemporaryFile
        >>> from fileinput import input
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> from glob import glob
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
        '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
        """
        lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
        lines.saveAsTextFile(dir)

    @staticmethod
    @since("1.1.0")
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        :param sc: Spark context
        :param path: file or directory path in any Hadoop-supported file
                     system URI
        :param minPartitions: min number of partitions
        :return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> from pyspark.mllib.regression import LabeledPoint
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
        ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions)

    @staticmethod
    @since("1.5.0")
    def appendBias(data):
        """
        Returns a new vector with `1.0` (bias) appended to
        the end of the input vector.
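
        An illustrative doctest (added as a sketch; it uses the Vectors and
        SparseVector classes already imported at the top of this module):

        >>> MLUtils.appendBias(Vectors.dense([1.0, 2.0]))
        DenseVector([1.0, 2.0, 1.0])
        >>> MLUtils.appendBias(SparseVector(3, {0: 1.0}))
        SparseVector(4, {0: 1.0, 3: 1.0})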
        """
        vec = _convert_to_vector(data)
        if isinstance(vec, SparseVector):
            newIndices = np.append(vec.indices, len(vec))
            newValues = np.append(vec.values, 1.0)
            return SparseVector(len(vec) + 1, newIndices, newValues)
        else:
            return _convert_to_vector(np.append(vec.toArray(), 1.0))

    @staticmethod
    @since("1.5.0")
    def loadVectors(sc, path):
        """
        Loads vectors saved using `RDD[Vector].saveAsTextFile`
        with the default number of partitions.
        """
        return callMLlibFunc("loadVectors", sc, path)

    @staticmethod
    @since("2.0.0")
    def convertVectorColumnsToML(dataset, *cols):
        """
        Converts vector columns in an input DataFrame from the
        :py:class:`pyspark.mllib.linalg.Vector` type to the new
        :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml`
        package.

        :param dataset:
          input dataset
        :param cols:
          a list of vector columns to be converted.
          New vector columns will be ignored. If unspecified, all old
          vector columns will be converted except nested ones.
        :return:
          the input dataset with old vector columns converted to the
          new vector type

        >>> import pyspark
        >>> from pyspark.mllib.linalg import Vectors
        >>> from pyspark.mllib.util import MLUtils
        >>> df = spark.createDataFrame(
        ...     [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))],
        ...     ["id", "x", "y"])
        >>> r1 = MLUtils.convertVectorColumnsToML(df).first()
        >>> isinstance(r1.x, pyspark.ml.linalg.SparseVector)
        True
        >>> isinstance(r1.y, pyspark.ml.linalg.DenseVector)
        True
        >>> r2 = MLUtils.convertVectorColumnsToML(df, "x").first()
        >>> isinstance(r2.x, pyspark.ml.linalg.SparseVector)
        True
        >>> isinstance(r2.y, pyspark.mllib.linalg.DenseVector)
        True
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
        return callMLlibFunc("convertVectorColumnsToML", dataset, list(cols))

    @staticmethod
    @since("2.0.0")
    def convertVectorColumnsFromML(dataset, *cols):
        """
        Converts vector columns in an input DataFrame to the
        :py:class:`pyspark.mllib.linalg.Vector` type from the new
        :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml`
        package.

        :param dataset:
          input dataset
        :param cols:
          a list of vector columns to be converted.
          Old vector columns will be ignored. If unspecified, all new
          vector columns will be converted except nested ones.
        :return:
          the input dataset with new vector columns converted to the
          old vector type

        >>> import pyspark
        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.mllib.util import MLUtils
        >>> df = spark.createDataFrame(
        ...     [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))],
        ...     ["id", "x", "y"])
        >>> r1 = MLUtils.convertVectorColumnsFromML(df).first()
        >>> isinstance(r1.x, pyspark.mllib.linalg.SparseVector)
        True
        >>> isinstance(r1.y, pyspark.mllib.linalg.DenseVector)
        True
        >>> r2 = MLUtils.convertVectorColumnsFromML(df, "x").first()
        >>> isinstance(r2.x, pyspark.mllib.linalg.SparseVector)
        True
        >>> isinstance(r2.y, pyspark.ml.linalg.DenseVector)
        True
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
        return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols))

    @staticmethod
    @since("2.0.0")
    def convertMatrixColumnsToML(dataset, *cols):
        """
        Converts matrix columns in an input DataFrame from the
        :py:class:`pyspark.mllib.linalg.Matrix` type to the new
        :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml`
        package.

        :param dataset:
          input dataset
        :param cols:
          a list of matrix columns to be converted.
          New matrix columns will be ignored. If unspecified, all old
          matrix columns will be converted except nested ones.
        :return:
          the input dataset with old matrix columns converted to the
          new matrix type

        >>> import pyspark
        >>> from pyspark.mllib.linalg import Matrices
        >>> from pyspark.mllib.util import MLUtils
        >>> df = spark.createDataFrame(
        ...     [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
        ...     Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
        >>> r1 = MLUtils.convertMatrixColumnsToML(df).first()
        >>> isinstance(r1.x, pyspark.ml.linalg.SparseMatrix)
        True
        >>> isinstance(r1.y, pyspark.ml.linalg.DenseMatrix)
        True
        >>> r2 = MLUtils.convertMatrixColumnsToML(df, "x").first()
        >>> isinstance(r2.x, pyspark.ml.linalg.SparseMatrix)
        True
        >>> isinstance(r2.y, pyspark.mllib.linalg.DenseMatrix)
        True
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
        return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols))

    @staticmethod
    @since("2.0.0")
    def convertMatrixColumnsFromML(dataset, *cols):
        """
        Converts matrix columns in an input DataFrame to the
        :py:class:`pyspark.mllib.linalg.Matrix` type from the new
        :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml`
        package.

        :param dataset:
          input dataset
        :param cols:
          a list of matrix columns to be converted.
          Old matrix columns will be ignored. If unspecified, all new
          matrix columns will be converted except nested ones.
        :return:
          the input dataset with new matrix columns converted to the
          old matrix type

        >>> import pyspark
        >>> from pyspark.ml.linalg import Matrices
        >>> from pyspark.mllib.util import MLUtils
        >>> df = spark.createDataFrame(
        ...     [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
        ...     Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
        >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first()
        >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix)
        True
        >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix)
        True
        >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first()
        >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix)
        True
        >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix)
        True
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
        return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols))


class Saveable(object):
    """
    Mixin for models and transformers which may be saved as files.

    .. versionadded:: 1.3.0
    """

    def save(self, sc, path):
        """
        Save this model to the given path.

        This saves:
         * human-readable (JSON) model metadata to path/metadata/
         * Parquet formatted data to path/data/

        The model may be loaded using :py:meth:`Loader.load`.

        :param sc: Spark context used to save model data.
        :param path: Path specifying the directory in which to save
                     this model. If the directory already exists,
                     this method throws an exception.
        """
        raise NotImplementedError


@inherit_doc
class JavaSaveable(Saveable):
    """
    Mixin for models that provide save() through their Scala
    implementation.

    .. versionadded:: 1.3.0
    """

    @since("1.3.0")
    def save(self, sc, path):
        """Save this model to the given path."""
        if not isinstance(sc, SparkContext):
            raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
        if not isinstance(path, basestring):
            raise TypeError("path should be a basestring, got type %s" % type(path))
        self._java_model.save(sc._jsc.sc(), path)


class Loader(object):
    """
    Mixin for classes which can load saved models from files.

    .. versionadded:: 1.3.0
    """

    @classmethod
    def load(cls, sc, path):
        """
        Load a model from the given path. The model should have been
        saved using :py:meth:`Saveable.save`.

        :param sc: Spark context used for loading model files.
        :param path: Path specifying the directory to which the model
                     was saved.
        :return: model instance
        """
        raise NotImplementedError


@inherit_doc
class JavaLoader(Loader):
    """
    Mixin for classes which can load saved models using their Scala
    implementation.

    .. versionadded:: 1.3.0
    """

    @classmethod
    def _java_loader_class(cls):
        """
        Returns the full class name of the Java loader. The default
        implementation replaces "pyspark" by "org.apache.spark" in
        the Python full class name.
        """
        java_package = cls.__module__.replace("pyspark", "org.apache.spark")
        return ".".join([java_package, cls.__name__])

    @classmethod
    def _load_java(cls, sc, path):
        """
        Load a Java model from the given path.
        """
        java_class = cls._java_loader_class()
        java_obj = sc._jvm
        for name in java_class.split("."):
            java_obj = getattr(java_obj, name)
        return java_obj.load(sc._jsc.sc(), path)

    @classmethod
    @since("1.3.0")
    def load(cls, sc, path):
        """Load a model from the given path."""
        java_model = cls._load_java(sc, path)
        return cls(java_model)
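
# Illustrative usage of the save/load mixins (a sketch, not part of the original
# API surface): a concrete model class that mixes in JavaSaveable and JavaLoader
# can be round-tripped roughly like this, where `MyModel` and the path are
# hypothetical placeholders:
#
#     model.save(sc, "/tmp/myModelPath")        # writes metadata/ and data/
#     sameModel = MyModel.load(sc, "/tmp/myModelPath")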


class LinearDataGenerator(object):
    """Utils for generating linear data.

    .. versionadded:: 1.5.0
    """

    @staticmethod
    @since("1.5.0")
    def generateLinearInput(intercept, weights, xMean, xVariance,
                            nPoints, seed, eps):
        """
        :param intercept: bias factor, the term c in X'w + c
        :param weights: feature vector, the term w in X'w + c
        :param xMean: point around which the data X is centered
        :param xVariance: variance of the given data
        :param nPoints: number of points to be generated
        :param seed: random seed
        :param eps: used to scale the noise; if eps is set high,
                    more Gaussian noise is added

        Returns a list of LabeledPoints of length nPoints
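
        A brief illustrative doctest (added as a sketch; it only checks the
        number of generated points rather than their exact values):

        >>> points = LinearDataGenerator.generateLinearInput(
        ...     0.0, [1.0, 2.0], [0.0, 0.0], [1.0, 1.0], 10, 42, 0.1)
        >>> len(points)
        10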
        """
        weights = [float(weight) for weight in weights]
        xMean = [float(mean) for mean in xMean]
        xVariance = [float(var) for var in xVariance]
        return list(callMLlibFunc(
            "generateLinearInputWrapper", float(intercept), weights, xMean,
            xVariance, int(nPoints), int(seed), float(eps)))

    @staticmethod
    @since("1.5.0")
    def generateLinearRDD(sc, nexamples, nfeatures, eps,
                          nParts=2, intercept=0.0):
        """
        Generate an RDD of LabeledPoints.
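
        A brief illustrative doctest (added as a sketch; it only checks the
        size of the generated RDD):

        >>> rdd = LinearDataGenerator.generateLinearRDD(sc, 10, 3, 0.1, 2, 0.0)
        >>> rdd.count()
        10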
        """
        return callMLlibFunc(
            "generateLinearRDDWrapper", sc, int(nexamples), int(nfeatures),
            float(eps), int(nParts), float(intercept))


def _test():
    import doctest
    from pyspark.sql import SparkSession
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    spark = SparkSession.builder\
        .master("local[2]")\
        .appName("mllib.util tests")\
        .getOrCreate()
    globs['spark'] = spark
    globs['sc'] = spark.sparkContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()