#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
if sys.version > '3':
    basestring = str

from pyspark.rdd import ignore_unicode_prefix
from pyspark.ml.param.shared import *
from pyspark.ml.util import keyword_only
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector

__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT',
           'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'MinMaxScaler',
           'MinMaxScalerModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
           'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
           'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
           'Word2Vec', 'Word2VecModel']

@inherit_doc
class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Binarize a column of continuous features given a threshold.

    >>> df = sqlContext.createDataFrame([(0.5,)], ["values"])
    >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
    >>> binarizer.transform(df).head().features
    0.0
    >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs
    0.0
    >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"}
    >>> binarizer.transform(df, params).head().vector
    1.0
    """

    # a placeholder to make it appear in the generated doc
    threshold = Param(Params._dummy(), "threshold",
                      "threshold in binary classification prediction, in range [0, 1]")

    @keyword_only
    def __init__(self, threshold=0.0, inputCol=None, outputCol=None):
        """
        __init__(self, threshold=0.0, inputCol=None, outputCol=None)
        """
        super(Binarizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid)
        self.threshold = Param(self, "threshold",
                               "threshold in binary classification prediction, in range [0, 1]")
        self._setDefault(threshold=0.0)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, threshold=0.0, inputCol=None, outputCol=None):
        """
        setParams(self, threshold=0.0, inputCol=None, outputCol=None)
        Sets params for this Binarizer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setThreshold(self, value):
        """
        Sets the value of :py:attr:`threshold`.
        """
        self._paramMap[self.threshold] = value
        return self

    def getThreshold(self):
        """
        Gets the value of threshold or its default value.
        """
        return self.getOrDefault(self.threshold)


@inherit_doc
class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Maps a column of continuous features to a column of feature buckets.

    >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
    >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
    ...     inputCol="values", outputCol="buckets")
    >>> bucketed = bucketizer.transform(df).collect()
    >>> bucketed[0].buckets
    0.0
    >>> bucketed[1].buckets
    0.0
    >>> bucketed[2].buckets
    1.0
    >>> bucketed[3].buckets
    2.0
    >>> bucketizer.setParams(outputCol="b").transform(df).head().b
    0.0
    """

    # a placeholder to make it appear in the generated doc
    splits = \
        Param(Params._dummy(), "splits",
              "Split points for mapping continuous features into buckets. With n+1 splits, " +
              "there are n buckets. A bucket defined by splits x,y holds values in the " +
              "range [x,y) except the last bucket, which also includes y. The splits " +
              "should be strictly increasing. Values at -inf, inf must be explicitly " +
              "provided to cover all Double values; otherwise, values outside the splits " +
              "specified will be treated as errors.")

    @keyword_only
    def __init__(self, splits=None, inputCol=None, outputCol=None):
        """
        __init__(self, splits=None, inputCol=None, outputCol=None)
        """
        super(Bucketizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
        #: param for Split points for mapping continuous features into buckets. With n+1 splits,
        #  there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
        #  except the last bucket, which also includes y. The splits should be strictly increasing.
        #  Values at -inf, inf must be explicitly provided to cover all Double values; otherwise,
        #  values outside the splits specified will be treated as errors.
        self.splits = \
            Param(self, "splits",
                  "Split points for mapping continuous features into buckets. With n+1 splits, " +
                  "there are n buckets. A bucket defined by splits x,y holds values in the " +
                  "range [x,y) except the last bucket, which also includes y. The splits " +
                  "should be strictly increasing. Values at -inf, inf must be explicitly " +
                  "provided to cover all Double values; otherwise, values outside the splits " +
                  "specified will be treated as errors.")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, splits=None, inputCol=None, outputCol=None):
        """
        setParams(self, splits=None, inputCol=None, outputCol=None)
        Sets params for this Bucketizer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setSplits(self, value):
        """
        Sets the value of :py:attr:`splits`.
        """
        self._paramMap[self.splits] = value
        return self

    def getSplits(self):
        """
        Gets the value of splits or its default value.
        """
        return self.getOrDefault(self.splits)


@inherit_doc
class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`.

    >>> df = sqlContext.createDataFrame(
    ...    [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
    ...    ["label", "raw"])
    >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
    >>> model = cv.fit(df)
    >>> model.transform(df).show(truncate=False)
    +-----+---------------+-------------------------+
    |label|raw            |vectors                  |
    +-----+---------------+-------------------------+
    |0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
    |1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
    +-----+---------------+-------------------------+
    ...
    >>> sorted(map(str, model.vocabulary))
    ['a', 'b', 'c']
    """

    # a placeholder to make it appear in the generated doc
    minTF = Param(
        Params._dummy(), "minTF", "Filter to ignore rare words in" +
        " a document. For each document, terms with frequency/count less than the given" +
        " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" +
        " times the term must appear in the document); if this is a double in [0,1), then this " +
        "specifies a fraction (out of the document's token count). Note that the parameter is " +
        "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0")
    minDF = Param(
        Params._dummy(), "minDF", "Specifies the minimum number of" +
        " different documents a term must appear in to be included in the vocabulary." +
        " If this is an integer >= 1, this specifies the number of documents the term must" +
        " appear in; if this is a double in [0,1), then this specifies the fraction of documents." +
        " Default 1.0")
    vocabSize = Param(
        Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.")

    @keyword_only
    def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
        """
        __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None)
        """
        super(CountVectorizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
                                            self.uid)
        self.minTF = Param(
            self, "minTF", "Filter to ignore rare words in" +
            " a document. For each document, terms with frequency/count less than the given" +
            " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" +
            " times the term must appear in the document); if this is a double in [0,1), then " +
            "this specifies a fraction (out of the document's token count). Note that the " +
            "parameter is only used in transform of CountVectorizerModel and does not affect " +
            "fitting. Default 1.0")
        self.minDF = Param(
            self, "minDF", "Specifies the minimum number of" +
            " different documents a term must appear in to be included in the vocabulary." +
            " If this is an integer >= 1, this specifies the number of documents the term must" +
            " appear in; if this is a double in [0,1), then this specifies the fraction of " +
            "documents. Default 1.0")
        self.vocabSize = Param(
            self, "vocabSize", "max size of the vocabulary. Default 1 << 18.")
        self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
        """
        setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None)
        Sets params for this CountVectorizer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMinTF(self, value):
        """
        Sets the value of :py:attr:`minTF`.
        """
        self._paramMap[self.minTF] = value
        return self

    def getMinTF(self):
        """
        Gets the value of minTF or its default value.
        """
        return self.getOrDefault(self.minTF)

    def setMinDF(self, value):
        """
        Sets the value of :py:attr:`minDF`.
        """
        self._paramMap[self.minDF] = value
        return self

    def getMinDF(self):
        """
        Gets the value of minDF or its default value.
        """
        return self.getOrDefault(self.minDF)

    def setVocabSize(self, value):
        """
        Sets the value of :py:attr:`vocabSize`.
        """
        self._paramMap[self.vocabSize] = value
        return self

    def getVocabSize(self):
        """
        Gets the value of vocabSize or its default value.
        """
        return self.getOrDefault(self.vocabSize)

    def _create_model(self, java_model):
        return CountVectorizerModel(java_model)


class CountVectorizerModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by CountVectorizer.
    """

    @property
    def vocabulary(self):
        """
        An array of terms in the vocabulary.
        """
        return self._call_java("vocabulary")


@inherit_doc
class DCT(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A feature transformer that takes the 1D discrete cosine transform
    of a real vector. No zero padding is performed on the input vector.
    It returns a real vector of the same length representing the DCT.
    The return vector is scaled such that the transform matrix is
    unitary (aka scaled DCT-II).

    More information on
    `Wikipedia <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
    >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
    >>> df2 = dct.transform(df1)
    >>> df2.head().resultVec
    DenseVector([10.969..., -0.707..., -2.041...])
    >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2)
    >>> df3.head().origVec
    DenseVector([5.0, 8.0, 6.0])
    """

    # a placeholder to make it appear in the generated doc
    inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
                    "default False.")

    @keyword_only
    def __init__(self, inverse=False, inputCol=None, outputCol=None):
        """
        __init__(self, inverse=False, inputCol=None, outputCol=None)
        """
        super(DCT, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
        self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " +
                             "default False.")
        self._setDefault(inverse=False)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inverse=False, inputCol=None, outputCol=None):
        """
        setParams(self, inverse=False, inputCol=None, outputCol=None)
        Sets params for this DCT.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setInverse(self, value):
        """
        Sets the value of :py:attr:`inverse`.
        """
        self._paramMap[self.inverse] = value
        return self

    def getInverse(self):
        """
        Gets the value of inverse or its default value.
        """
        return self.getOrDefault(self.inverse)


@inherit_doc
class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Outputs the Hadamard product (i.e., the element-wise product) of each input vector
    with a provided "weight" vector. In other words, it scales each column of the dataset
    by a scalar multiplier.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
    >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
    ...     inputCol="values", outputCol="eprod")
    >>> ep.transform(df).head().eprod
    DenseVector([2.0, 2.0, 9.0])
    >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod
    DenseVector([4.0, 3.0, 15.0])
    """

    # a placeholder to make it appear in the generated doc
    scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " +
                       "it must be MLlib Vector type.")

    @keyword_only
    def __init__(self, scalingVec=None, inputCol=None, outputCol=None):
        """
        __init__(self, scalingVec=None, inputCol=None, outputCol=None)
        """
        super(ElementwiseProduct, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct",
                                            self.uid)
        self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " +
                                "it must be MLlib Vector type.")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, scalingVec=None, inputCol=None, outputCol=None):
        """
        setParams(self, scalingVec=None, inputCol=None, outputCol=None)
        Sets params for this ElementwiseProduct.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setScalingVec(self, value):
        """
        Sets the value of :py:attr:`scalingVec`.
        """
        self._paramMap[self.scalingVec] = value
        return self

    def getScalingVec(self):
        """
        Gets the value of scalingVec or its default value.
        """
        return self.getOrDefault(self.scalingVec)


@inherit_doc
class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
    """
    .. note:: Experimental

    Maps a sequence of terms to their term frequencies using the
    hashing trick.

    >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["words"])
    >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
    >>> hashingTF.transform(df).head().features
    SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})
    >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs
    SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})
    >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"}
    >>> hashingTF.transform(df, params).head().vector
    SparseVector(5, {2: 1.0, 3: 1.0, 4: 1.0})
    """

    @keyword_only
    def __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
        """
        __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None)
        """
        super(HashingTF, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
        self._setDefault(numFeatures=1 << 18)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
        """
        setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None)
        Sets params for this HashingTF.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)


@inherit_doc
class IDF(JavaEstimator, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Compute the Inverse Document Frequency (IDF) given a collection of documents.

    >>> from pyspark.mllib.linalg import DenseVector
    >>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),
    ...     (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
    >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
    >>> idf.fit(df).transform(df).head().idf
    DenseVector([0.0, 0.0])
    >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs
    DenseVector([0.0, 0.0])
    >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"}
    >>> idf.fit(df, params).transform(df).head().vector
    DenseVector([0.2877, 0.0])
    """

    # a placeholder to make it appear in the generated doc
    minDocFreq = Param(Params._dummy(), "minDocFreq",
                       "minimum of documents in which a term should appear for filtering")

    @keyword_only
    def __init__(self, minDocFreq=0, inputCol=None, outputCol=None):
        """
        __init__(self, minDocFreq=0, inputCol=None, outputCol=None)
        """
        super(IDF, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
        self.minDocFreq = Param(self, "minDocFreq",
                                "minimum of documents in which a term should appear for filtering")
        self._setDefault(minDocFreq=0)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, minDocFreq=0, inputCol=None, outputCol=None):
        """
        setParams(self, minDocFreq=0, inputCol=None, outputCol=None)
        Sets params for this IDF.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMinDocFreq(self, value):
        """
        Sets the value of :py:attr:`minDocFreq`.
        """
        self._paramMap[self.minDocFreq] = value
        return self

    def getMinDocFreq(self):
        """
        Gets the value of minDocFreq or its default value.
        """
        return self.getOrDefault(self.minDocFreq)

    def _create_model(self, java_model):
        return IDFModel(java_model)


class IDFModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by IDF.
    """


@inherit_doc
|
|
|
|
class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
|
|
|
|
"""
|
|
|
|
.. note:: Experimental
|
|
|
|
|
|
|
|
Rescale each feature individually to a common range [min, max] linearly using column summary
|
|
|
|
statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
|
|
|
|
feature E is calculated as,
|
|
|
|
|
|
|
|
Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min
|
|
|
|
|
|
|
|
For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)
|
|
|
|
|
|
|
|
Note that since zero values will probably be transformed to non-zero values, output of the
|
|
|
|
transformer will be DenseVector even for sparse input.
|
|
|
|
|
|
|
|
>>> from pyspark.mllib.linalg import Vectors
|
|
|
|
>>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
|
|
|
|
>>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
|
|
|
|
>>> model = mmScaler.fit(df)
|
|
|
|
>>> model.transform(df).show()
|
|
|
|
+-----+------+
|
|
|
|
| a|scaled|
|
|
|
|
+-----+------+
|
|
|
|
|[0.0]| [0.0]|
|
|
|
|
|[2.0]| [1.0]|
|
|
|
|
+-----+------+
|
|
|
|
...
|
|
|
|
"""

    # a placeholder to make it appear in the generated doc
    min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
    max = Param(Params._dummy(), "max", "Upper bound of the output feature range")

    @keyword_only
    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
        """
        __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
        """
        super(MinMaxScaler, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
        self.min = Param(self, "min", "Lower bound of the output feature range")
        self.max = Param(self, "max", "Upper bound of the output feature range")
        self._setDefault(min=0.0, max=1.0)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
        """
        setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
        Sets params for this MinMaxScaler.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMin(self, value):
        """
        Sets the value of :py:attr:`min`.
        """
        self._paramMap[self.min] = value
        return self

    def getMin(self):
        """
        Gets the value of min or its default value.
        """
        return self.getOrDefault(self.min)

    def setMax(self, value):
        """
        Sets the value of :py:attr:`max`.
        """
        self._paramMap[self.max] = value
        return self

    def getMax(self):
        """
        Gets the value of max or its default value.
        """
        return self.getOrDefault(self.max)

    def _create_model(self, java_model):
        return MinMaxScalerModel(java_model)


class MinMaxScalerModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by :py:class:`MinMaxScaler`.
    """


@inherit_doc
@ignore_unicode_prefix
class NGram(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A feature transformer that converts the input array of strings into an array of n-grams. Null
    values in the input array are ignored.
    It returns an array of n-grams where each n-gram is represented by a space-separated string of
    words.
    When the input is empty, an empty array is returned.
    When the input array length is less than n (number of elements per n-gram), no n-grams are
    returned.

    >>> df = sqlContext.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])
    >>> ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams")
    >>> ngram.transform(df).head()
    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e'])
    >>> # Change n-gram length
    >>> ngram.setParams(n=4).transform(df).head()
    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])
    >>> # Temporarily modify output column.
    >>> ngram.transform(df, {ngram.outputCol: "output"}).head()
    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e'])
    >>> ngram.transform(df).head()
    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])
    >>> # Must use keyword arguments to specify params.
    >>> ngram.setParams("text")
    Traceback (most recent call last):
        ...
    TypeError: Method setParams forces keyword arguments.
    """

    # a placeholder to make it appear in the generated doc
    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")

    @keyword_only
    def __init__(self, n=2, inputCol=None, outputCol=None):
        """
        __init__(self, n=2, inputCol=None, outputCol=None)
        """
        super(NGram, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid)
        self.n = Param(self, "n", "number of elements per n-gram (>=1)")
        self._setDefault(n=2)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, n=2, inputCol=None, outputCol=None):
        """
        setParams(self, n=2, inputCol=None, outputCol=None)
        Sets params for this NGram.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setN(self, value):
        """
        Sets the value of :py:attr:`n`.
        """
        self._paramMap[self.n] = value
        return self

    def getN(self):
        """
        Gets the value of n or its default value.
        """
        return self.getOrDefault(self.n)


@inherit_doc
class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Normalize a vector to have unit norm using the given p-norm.

    >>> from pyspark.mllib.linalg import Vectors
    >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
    >>> df = sqlContext.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])
    >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features")
    >>> normalizer.transform(df).head().features
    DenseVector([0.6, -0.8])
    >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs
    SparseVector(4, {1: 0.8, 3: 0.6})
    >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"}
    >>> normalizer.transform(df, params).head().vector
    DenseVector([0.4286, -0.5714])
    """

    # a placeholder to make it appear in the generated doc
    p = Param(Params._dummy(), "p", "the p norm value.")

    @keyword_only
    def __init__(self, p=2.0, inputCol=None, outputCol=None):
        """
        __init__(self, p=2.0, inputCol=None, outputCol=None)
        """
        super(Normalizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid)
        self.p = Param(self, "p", "the p norm value.")
        self._setDefault(p=2.0)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, p=2.0, inputCol=None, outputCol=None):
        """
        setParams(self, p=2.0, inputCol=None, outputCol=None)
        Sets params for this Normalizer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setP(self, value):
        """
        Sets the value of :py:attr:`p`.
        """
        self._paramMap[self.p] = value
        return self

    def getP(self):
        """
        Gets the value of p or its default value.
        """
        return self.getOrDefault(self.p)


@inherit_doc
class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A one-hot encoder that maps a column of category indices to a
    column of binary vectors, with at most a single one-value per row
    that indicates the input category index.
    For example with 5 categories, an input value of 2.0 would map to
    an output vector of `[0.0, 0.0, 1.0, 0.0]`.
    The last category is not included by default (configurable via
    :py:attr:`dropLast`) because it makes the vector entries sum up to
    one, and hence linearly dependent.
    So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
    Note that this is different from scikit-learn's OneHotEncoder,
    which keeps all categories.
    The output vectors are sparse.

    .. seealso::

       :py:class:`StringIndexer` for converting categorical values into
       category indices

    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    >>> model = stringIndexer.fit(stringIndDf)
    >>> td = model.transform(stringIndDf)
    >>> encoder = OneHotEncoder(inputCol="indexed", outputCol="features")
    >>> encoder.transform(td).head().features
    SparseVector(2, {0: 1.0})
    >>> encoder.setParams(outputCol="freqs").transform(td).head().freqs
    SparseVector(2, {0: 1.0})
    >>> params = {encoder.dropLast: False, encoder.outputCol: "test"}
    >>> encoder.transform(td, params).head().test
    SparseVector(3, {0: 1.0})
    """

    # a placeholder to make it appear in the generated doc
    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category")

    @keyword_only
    def __init__(self, dropLast=True, inputCol=None, outputCol=None):
        """
        __init__(self, dropLast=True, inputCol=None, outputCol=None)
        """
        super(OneHotEncoder, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
        self.dropLast = Param(self, "dropLast", "whether to drop the last category")
        self._setDefault(dropLast=True)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, dropLast=True, inputCol=None, outputCol=None):
        """
        setParams(self, dropLast=True, inputCol=None, outputCol=None)
        Sets params for this OneHotEncoder.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setDropLast(self, value):
        """
        Sets the value of :py:attr:`dropLast`.
        """
        self._paramMap[self.dropLast] = value
        return self

    def getDropLast(self):
        """
        Gets the value of dropLast or its default value.
        """
        return self.getOrDefault(self.dropLast)


@inherit_doc
class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,
    which is available at `http://en.wikipedia.org/wiki/Polynomial_expansion`, "In mathematics, an
    expansion of a product of sums expresses it as a sum of products by using the fact that
    multiplication distributes over addition". Take a 2-variable feature vector as an example:
    `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
    >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
    >>> px.transform(df).head().expanded
    DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
    >>> px.setParams(outputCol="test").transform(df).head().test
    DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
    """

    # a placeholder to make it appear in the generated doc
    degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")

    @keyword_only
    def __init__(self, degree=2, inputCol=None, outputCol=None):
        """
        __init__(self, degree=2, inputCol=None, outputCol=None)
        """
        super(PolynomialExpansion, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.feature.PolynomialExpansion", self.uid)
        self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)")
        self._setDefault(degree=2)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, degree=2, inputCol=None, outputCol=None):
        """
        setParams(self, degree=2, inputCol=None, outputCol=None)
        Sets params for this PolynomialExpansion.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setDegree(self, value):
        """
        Sets the value of :py:attr:`degree`.
        """
        self._paramMap[self.degree] = value
        return self

    def getDegree(self):
        """
        Gets the value of degree or its default value.
        """
        return self.getOrDefault(self.degree)


@inherit_doc
@ignore_unicode_prefix
class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A regex based tokenizer that extracts tokens either by using the
    provided regex pattern (in Java dialect) to split the text
    (default) or repeatedly matching the regex (if gaps is false).
    Optional parameters also allow filtering tokens using a minimal
    length.
    It returns an array of strings that can be empty.

    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
    >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
    >>> reTokenizer.transform(df).head()
    Row(text=u'a b c', words=[u'a', u'b', u'c'])
    >>> # Change a parameter.
    >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
    >>> # Temporarily modify a parameter.
    >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
    Row(text=u'a b c', words=[u'a', u'b', u'c'])
    >>> reTokenizer.transform(df).head()
    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
    >>> # Must use keyword arguments to specify params.
    >>> reTokenizer.setParams("text")
    Traceback (most recent call last):
        ...
    TypeError: Method setParams forces keyword arguments.
    """

    # a placeholder to make it appear in the generated doc
    minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")

    @keyword_only
    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
        """
        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
        """
        super(RegexTokenizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
        self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
        self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
        self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
        """
        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
        Sets params for this RegexTokenizer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMinTokenLength(self, value):
        """
        Sets the value of :py:attr:`minTokenLength`.
        """
        self._paramMap[self.minTokenLength] = value
        return self

    def getMinTokenLength(self):
        """
        Gets the value of minTokenLength or its default value.
        """
        return self.getOrDefault(self.minTokenLength)

    def setGaps(self, value):
        """
        Sets the value of :py:attr:`gaps`.
        """
        self._paramMap[self.gaps] = value
        return self

    def getGaps(self):
        """
        Gets the value of gaps or its default value.
        """
        return self.getOrDefault(self.gaps)

    def setPattern(self, value):
        """
        Sets the value of :py:attr:`pattern`.
        """
        self._paramMap[self.pattern] = value
        return self

    def getPattern(self):
        """
        Gets the value of pattern or its default value.
        """
        return self.getOrDefault(self.pattern)


@inherit_doc
class SQLTransformer(JavaTransformer):
    """
    .. note:: Experimental

    Implements the transformations that are defined by a SQL statement.
    Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
    where '__THIS__' represents the underlying table of the input dataset.

    >>> df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
    >>> sqlTrans = SQLTransformer(
    ...     statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    >>> sqlTrans.transform(df).head()
    Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)
    """

    # a placeholder to make it appear in the generated doc
    statement = Param(Params._dummy(), "statement", "SQL statement")

    @keyword_only
    def __init__(self, statement=None):
        """
        __init__(self, statement=None)
        """
        super(SQLTransformer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
        self.statement = Param(self, "statement", "SQL statement")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, statement=None):
        """
        setParams(self, statement=None)
        Sets params for this SQLTransformer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setStatement(self, value):
        """
        Sets the value of :py:attr:`statement`.
        """
        self._paramMap[self.statement] = value
        return self

    def getStatement(self):
        """
        Gets the value of statement or its default value.
        """
        return self.getOrDefault(self.statement)


@inherit_doc
class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Standardizes features by removing the mean and scaling to unit variance using column summary
    statistics on the samples in the training set.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
    >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
    >>> model = standardScaler.fit(df)
    >>> model.mean
    DenseVector([1.0])
    >>> model.std
    DenseVector([1.4142])
    >>> model.transform(df).collect()[1].scaled
    DenseVector([1.4142])
    """

    # a placeholder to make it appear in the generated doc
    withMean = Param(Params._dummy(), "withMean", "Center data with mean")
    withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")

    @keyword_only
    def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None):
        """
        __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None)
        """
        super(StandardScaler, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid)
        self.withMean = Param(self, "withMean", "Center data with mean")
        self.withStd = Param(self, "withStd", "Scale to unit standard deviation")
        self._setDefault(withMean=False, withStd=True)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None):
        """
        setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None)
        Sets params for this StandardScaler.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setWithMean(self, value):
        """
        Sets the value of :py:attr:`withMean`.
        """
        self._paramMap[self.withMean] = value
        return self

    def getWithMean(self):
        """
        Gets the value of withMean or its default value.
        """
        return self.getOrDefault(self.withMean)

    def setWithStd(self, value):
        """
        Sets the value of :py:attr:`withStd`.
        """
        self._paramMap[self.withStd] = value
        return self

    def getWithStd(self):
        """
        Gets the value of withStd or its default value.
        """
        return self.getOrDefault(self.withStd)

    def _create_model(self, java_model):
        return StandardScalerModel(java_model)


class StandardScalerModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by StandardScaler.
    """

    @property
    def std(self):
        """
        Standard deviation of the StandardScalerModel.
        """
        return self._call_java("std")

    @property
    def mean(self):
        """
        Mean of the StandardScalerModel.
        """
        return self._call_java("mean")


@inherit_doc
class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid):
    """
    .. note:: Experimental

    A label indexer that maps a string column of labels to an ML column of label indices.
    If the input column is numeric, we cast it to string and index the string values.
    The indices are in [0, numLabels), ordered by label frequencies.
    So the most frequent label gets index 0.

    >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    >>> model = stringIndexer.fit(stringIndDf)
    >>> td = model.transform(stringIndDf)
    >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
    ...     key=lambda x: x[0])
    [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]
    >>> inverter = IndexToString(inputCol="indexed", outputCol="label2", labels=model.labels)
    >>> itd = inverter.transform(td)
    >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),
    ...     key=lambda x: x[0])
    [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')]
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"):
        """
        __init__(self, inputCol=None, outputCol=None, handleInvalid="error")
        """
        super(StringIndexer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
        self._setDefault(handleInvalid="error")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"):
        """
        setParams(self, inputCol=None, outputCol=None, handleInvalid="error")
        Sets params for this StringIndexer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def _create_model(self, java_model):
        return StringIndexerModel(java_model)


class StringIndexerModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by StringIndexer.
    """

    @property
    def labels(self):
        """
        Ordered list of labels, corresponding to indices to be assigned.
        """
        return self._call_java("labels")


@inherit_doc
class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A :py:class:`Transformer` that maps a column of indices back to a new column of
    corresponding string values.
    The index-string mapping is either from the ML attributes of the input column,
    or from user-supplied labels (which take precedence over ML attributes).
    See L{StringIndexer} for converting strings into indices.
    """
|
|
|
|
|
|
|
|
# a placeholder to make the labels show up in generated doc
|
|
|
|
labels = Param(Params._dummy(), "labels",
|
2015-09-11 11:55:35 -04:00
|
|
|
"Optional array of labels specifying index-string mapping." +
|
|
|
|
" If not provided or if empty, then metadata from inputCol is used instead.")
|
2015-09-09 01:13:05 -04:00
|
|
|
|
|
|
|
@keyword_only
|
|
|
|
def __init__(self, inputCol=None, outputCol=None, labels=None):
|
|
|
|
"""
|
|
|
|
__init__(self, inputCol=None, outputCol=None, labels=None)
|
|
|
|
"""
|
|
|
|
super(IndexToString, self).__init__()
|
|
|
|
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString",
|
|
|
|
self.uid)
|
|
|
|
self.labels = Param(self, "labels",
|
2015-09-11 11:55:35 -04:00
|
|
|
"Optional array of labels specifying index-string mapping. If not" +
|
|
|
|
" provided or if empty, then metadata from inputCol is used instead.")
|
2015-09-09 01:13:05 -04:00
|
|
|
kwargs = self.__init__._input_kwargs
|
|
|
|
self.setParams(**kwargs)
|
|
|
|
|
|
|
|
@keyword_only
|
|
|
|
def setParams(self, inputCol=None, outputCol=None, labels=None):
|
|
|
|
"""
|
|
|
|
setParams(self, inputCol=None, outputCol=None, labels=None)
|
|
|
|
Sets params for this IndexToString.
|
|
|
|
"""
|
|
|
|
kwargs = self.setParams._input_kwargs
|
|
|
|
return self._set(**kwargs)
|
|
|
|
|
|
|
|
def setLabels(self, value):
|
|
|
|
"""
|
|
|
|
Sets the value of :py:attr:`labels`.
|
|
|
|
"""
|
|
|
|
self._paramMap[self.labels] = value
|
|
|
|
return self
|
|
|
|
|
|
|
|
def getLabels(self):
|
|
|
|
"""
|
|
|
|
Gets the value of :py:attr:`labels` or its default value.
|
|
|
|
"""
|
|
|
|
return self.getOrDefault(self.labels)
|
2015-05-08 14:14:39 -04:00
|
|
|
|
|
|
|
|
2015-09-01 13:48:57 -04:00
|
|
|
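# Illustrative sketch (comments only, not a doctest): mapping indices produced by a fitted
# StringIndexerModel back to the original strings. `stringIndexerModel` and `df` stand for an
# already fitted model and its input DataFrame; the column names are arbitrary examples. The
# labels argument is optional because the ML attributes written by StringIndexer can be used
# instead.
#
#   indexed = stringIndexerModel.transform(df)
#   converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory",
#                             labels=stringIndexerModel.labels)
#   converter.transform(indexed).select("originalCategory")
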
class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A feature transformer that filters out stop words from input.
    Note: null values from the input array are preserved unless null is added
    to the stopWords explicitly.
    """
    # a placeholder to make the stopwords show up in generated doc
    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out")
    caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
                          "comparison over the stop words")

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopWords=None,
                 caseSensitive=False):
        """
        __init__(self, inputCol=None, outputCol=None, stopWords=None,\
                 caseSensitive=False)
        """
        super(StopWordsRemover, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover",
                                            self.uid)
        self.stopWords = Param(self, "stopWords", "The words to be filtered out")
        self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " +
                                   "sensitive comparison over the stop words")
        stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords
        defaultStopWords = stopWordsObj.English()
        self._setDefault(stopWords=defaultStopWords)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopWords=None,
                  caseSensitive=False):
        """
        setParams(self, inputCol=None, outputCol=None, stopWords=None,\
                  caseSensitive=False)
        Sets params for this StopWordsRemover.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setStopWords(self, value):
        """
        Specify the stop words to be filtered out.
        """
        self._paramMap[self.stopWords] = value
        return self

    def getStopWords(self):
        """
        Get the stop words.
        """
        return self.getOrDefault(self.stopWords)

    def setCaseSensitive(self, value):
        """
        Set whether to do a case sensitive comparison over the stop words.
        """
        self._paramMap[self.caseSensitive] = value
        return self

    def getCaseSensitive(self):
        """
        Get whether to do a case sensitive comparison over the stop words.
        """
        return self.getOrDefault(self.caseSensitive)

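# Illustrative sketch (comments only, not a doctest): removing stop words from an array column.
# The data and column names are made-up; with the default English stop word list loaded above,
# words such as "a" and "is" would be filtered out, and stopWords/caseSensitive can be
# overridden via setParams.
#
#   df = sqlContext.createDataFrame([(["a", "panda", "is", "hungry"],)], ["raw"])
#   remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
#   remover.transform(df).head().filtered        # e.g. ['panda', 'hungry']
#   remover.setParams(stopWords=["panda"], caseSensitive=True)
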
@inherit_doc
@ignore_unicode_prefix
class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A tokenizer that converts the input string to lowercase and then
    splits it by white spaces.

    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
    >>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
    >>> tokenizer.transform(df).head()
    Row(text=u'a b c', words=[u'a', u'b', u'c'])
    >>> # Change a parameter.
    >>> tokenizer.setParams(outputCol="tokens").transform(df).head()
    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
    >>> # Temporarily modify a parameter.
    >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()
    Row(text=u'a b c', words=[u'a', u'b', u'c'])
    >>> tokenizer.transform(df).head()
    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
    >>> # Must use keyword arguments to specify params.
    >>> tokenizer.setParams("text")
    Traceback (most recent call last):
        ...
    TypeError: Method setParams forces keyword arguments.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """
        __init__(self, inputCol=None, outputCol=None)
        """
        super(Tokenizer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Tokenizer", self.uid)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """
        setParams(self, inputCol=None, outputCol=None)
        Sets params for this Tokenizer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

@inherit_doc
class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol):
    """
    .. note:: Experimental

    A feature transformer that merges multiple columns into a vector column.

    >>> df = sqlContext.createDataFrame([(1, 0, 3)], ["a", "b", "c"])
    >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")
    >>> vecAssembler.transform(df).head().features
    DenseVector([1.0, 0.0, 3.0])
    >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs
    DenseVector([1.0, 0.0, 3.0])
    >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"}
    >>> vecAssembler.transform(df, params).head().vector
    DenseVector([0.0, 1.0])
    """

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        """
        __init__(self, inputCols=None, outputCol=None)
        """
        super(VectorAssembler, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorAssembler", self.uid)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        """
        setParams(self, inputCols=None, outputCol=None)
        Sets params for this VectorAssembler.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

@inherit_doc
class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Class for indexing categorical feature columns in a dataset of `Vector`.

    This has 2 usage modes:
      - Automatically identify categorical features (default behavior)
         - This helps process a dataset of unknown vectors into a dataset with some continuous
           features and some categorical features. The choice between continuous and categorical
           is based upon a maxCategories parameter.
         - Set maxCategories to the maximum number of categories any categorical feature
           should have.
         - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
           If maxCategories = 2, then feature 0 will be declared categorical and use indices
           {0, 1}, and feature 1 will be declared continuous.
      - Index all features, if all features are categorical
         - If maxCategories is set to be very large, then this will build an index of unique
           values for all features.
         - Warning: This can cause problems if features are continuous since this will collect
           ALL unique values to the driver.
         - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
           If maxCategories >= 3, then both features will be declared categorical.

    This returns a model which can transform categorical features to use 0-based indices.

    Index stability:
      - This is not guaranteed to choose the same category index across multiple runs.
      - If a categorical feature includes value 0, then this is guaranteed to map value 0 to
        index 0. This maintains vector sparsity.
      - More stability may be added in the future.

    TODO: Future extensions: The following functionality is planned for the future:
      - Preserve metadata in transform; if a feature's metadata is already present,
        do not recompute.
      - Specify certain features to not index, either via a parameter or via existing metadata.
      - Add warning if a categorical feature has only 1 category.
      - Add option for allowing unknown categories.

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
    ...     (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])
    >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
    >>> model = indexer.fit(df)
    >>> model.transform(df).head().indexed
    DenseVector([1.0, 0.0])
    >>> model.numFeatures
    2
    >>> model.categoryMaps
    {0: {0.0: 0, -1.0: 1}}
    >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test
    DenseVector([0.0, 1.0])
    >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"}
    >>> model2 = indexer.fit(df, params)
    >>> model2.transform(df).head().vector
    DenseVector([1.0, 0.0])
    """

    # a placeholder to make it appear in the generated doc
    maxCategories = Param(Params._dummy(), "maxCategories",
                          "Threshold for the number of values a categorical feature can take " +
                          "(>= 2). If a feature is found to have > maxCategories values, then " +
                          "it is declared continuous.")

    @keyword_only
    def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
        """
        __init__(self, maxCategories=20, inputCol=None, outputCol=None)
        """
        super(VectorIndexer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
        self.maxCategories = Param(self, "maxCategories",
                                   "Threshold for the number of values a categorical feature " +
                                   "can take (>= 2). If a feature is found to have " +
                                   "> maxCategories values, then it is declared continuous.")
        self._setDefault(maxCategories=20)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, maxCategories=20, inputCol=None, outputCol=None):
        """
        setParams(self, maxCategories=20, inputCol=None, outputCol=None)
        Sets params for this VectorIndexer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setMaxCategories(self, value):
        """
        Sets the value of :py:attr:`maxCategories`.
        """
        self._paramMap[self.maxCategories] = value
        return self

    def getMaxCategories(self):
        """
        Gets the value of maxCategories or its default value.
        """
        return self.getOrDefault(self.maxCategories)

    def _create_model(self, java_model):
        return VectorIndexerModel(java_model)

class VectorIndexerModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by VectorIndexer.
    """

    @property
    def numFeatures(self):
        """
        Number of features, i.e., length of Vectors which this transforms.
        """
        return self._call_java("numFeatures")

    @property
    def categoryMaps(self):
        """
        Feature value index. Keys are categorical feature indices (column indices).
        Values are maps from original feature values to 0-based category indices.
        If a feature is not in this map, it is treated as continuous.
        """
        return self._call_java("javaCategoryMaps")

@inherit_doc
class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    This class takes a feature vector and outputs a new feature vector with a subarray
    of the original features.

    The subset of features can be specified with either indices (`setIndices()`)
    or names (`setNames()`). At least one feature must be selected. Duplicate features
    are not allowed, so there can be no overlap between selected indices and names.

    The output vector will order features with the selected indices first (in the order given),
    followed by the selected names (in the order given).

    >>> from pyspark.mllib.linalg import Vectors
    >>> df = sqlContext.createDataFrame([
    ...     (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
    ...     (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
    ...     (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
    >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
    >>> vs.transform(df).head().sliced
    DenseVector([2.3, 1.0])
    """

    # a placeholder to make it appear in the generated doc
    indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
                    "a vector column. There can be no overlap with names.")
    names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
                  "a vector column. These names must be specified by ML " +
                  "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
                  "indices.")

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
        """
        __init__(self, inputCol=None, outputCol=None, indices=None, names=None)
        """
        super(VectorSlicer, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
        self.indices = Param(self, "indices", "An array of indices to select features from " +
                             "a vector column. There can be no overlap with names.")
        self.names = Param(self, "names", "An array of feature names to select features from " +
                           "a vector column. These names must be specified by ML " +
                           "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
                           "with indices.")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, indices=None, names=None):
        """
        setParams(self, inputCol=None, outputCol=None, indices=None, names=None)
        Sets params for this VectorSlicer.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setIndices(self, value):
        """
        Sets the value of :py:attr:`indices`.
        """
        self._paramMap[self.indices] = value
        return self

    def getIndices(self):
        """
        Gets the value of indices or its default value.
        """
        return self.getOrDefault(self.indices)

    def setNames(self, value):
        """
        Sets the value of :py:attr:`names`.
        """
        self._paramMap[self.names] = value
        return self

    def getNames(self):
        """
        Gets the value of names or its default value.
        """
        return self.getOrDefault(self.names)

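# Illustrative sketch (comments only, not a doctest): slicing by feature name instead of index.
# Selecting by names requires the input vector column to carry ML attribute names; the sketch
# assumes (as is typically the case) that VectorAssembler names the assembled features after its
# input columns. `people` and all column names are hypothetical examples.
#
#   assembled = VectorAssembler(inputCols=["age", "height", "weight"],
#                               outputCol="features").transform(people)
#   slicer = VectorSlicer(inputCol="features", outputCol="selected", names=["age", "weight"])
#   slicer.transform(assembled).select("selected")
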
@inherit_doc
@ignore_unicode_prefix
class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    Word2Vec trains a model of `Map(String, Vector)`, i.e. it transforms each word into a
    vector ("code") that can be used for further natural language processing or machine
    learning tasks.

    >>> sent = ("a b " * 100 + "a c " * 10).split(" ")
    >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"])
    >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc)
    >>> model.getVectors().show()
    +----+--------------------+
    |word|              vector|
    +----+--------------------+
    |   a|[-0.3511952459812...|
    |   b|[0.29077222943305...|
    |   c|[0.02315592765808...|
    +----+--------------------+
    ...
    >>> model.findSynonyms("a", 2).show()
    +----+-------------------+
    |word|         similarity|
    +----+-------------------+
    |   b|0.29255685145799626|
    |   c|-0.5414068302988307|
    +----+-------------------+
    ...
    >>> model.transform(doc).head().model
    DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276])
    """

    # a placeholder to make it appear in the generated doc
    vectorSize = Param(Params._dummy(), "vectorSize",
                       "the dimension of codes after transforming from words")
    numPartitions = Param(Params._dummy(), "numPartitions",
                          "number of partitions for sentences of words")
    minCount = Param(Params._dummy(), "minCount",
                     "the minimum number of times a token must appear to be included in the " +
                     "word2vec model's vocabulary")

    @keyword_only
    def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
                 seed=None, inputCol=None, outputCol=None):
        """
        __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
                 seed=None, inputCol=None, outputCol=None)
        """
        super(Word2Vec, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
        self.vectorSize = Param(self, "vectorSize",
                                "the dimension of codes after transforming from words")
        self.numPartitions = Param(self, "numPartitions",
                                   "number of partitions for sentences of words")
        self.minCount = Param(self, "minCount",
                              "the minimum number of times a token must appear to be included " +
                              "in the word2vec model's vocabulary")
        self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
                         seed=None)
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
                  seed=None, inputCol=None, outputCol=None):
        """
        setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
                  seed=None, inputCol=None, outputCol=None)
        Sets params for this Word2Vec.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setVectorSize(self, value):
        """
        Sets the value of :py:attr:`vectorSize`.
        """
        self._paramMap[self.vectorSize] = value
        return self

    def getVectorSize(self):
        """
        Gets the value of vectorSize or its default value.
        """
        return self.getOrDefault(self.vectorSize)

    def setNumPartitions(self, value):
        """
        Sets the value of :py:attr:`numPartitions`.
        """
        self._paramMap[self.numPartitions] = value
        return self

    def getNumPartitions(self):
        """
        Gets the value of numPartitions or its default value.
        """
        return self.getOrDefault(self.numPartitions)

    def setMinCount(self, value):
        """
        Sets the value of :py:attr:`minCount`.
        """
        self._paramMap[self.minCount] = value
        return self

    def getMinCount(self):
        """
        Gets the value of minCount or its default value.
        """
        return self.getOrDefault(self.minCount)

    def _create_model(self, java_model):
        return Word2VecModel(java_model)

class Word2VecModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by Word2Vec.
    """

    def getVectors(self):
        """
        Returns the vector representation of the words as a dataframe
        with two fields, word and vector.
        """
        return self._call_java("getVectors")

    def findSynonyms(self, word, num):
        """
        Find "num" words closest in similarity to "word". "word" can be a string
        or a vector representation. Returns a dataframe with two fields, word and
        similarity (which gives the cosine similarity).
        """
        if not isinstance(word, basestring):
            word = _convert_to_vector(word)
        return self._call_java("findSynonyms", word, num)

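# Illustrative sketch (comments only, not a doctest): findSynonyms also accepts a vector, since
# the isinstance check above routes anything that is not a string through _convert_to_vector.
# The fitted `model` and `doc` DataFrame refer to the doctest example above and are assumed to
# be available.
#
#   vec = model.transform(doc).head().model      # an existing document/word vector
#   model.findSynonyms(vec, 2).show()            # two nearest words by cosine similarity
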
@inherit_doc
class PCA(JavaEstimator, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    PCA trains a model to project vectors to a lower-dimensional space using
    principal component analysis.

    >>> from pyspark.mllib.linalg import Vectors
    >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    ...     (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    ...     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
    >>> df = sqlContext.createDataFrame(data, ["features"])
    >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    >>> model = pca.fit(df)
    >>> model.transform(df).collect()[0].pca_features
    DenseVector([1.648..., -4.013...])
    """

    # a placeholder to make it appear in the generated doc
    k = Param(Params._dummy(), "k", "the number of principal components")

    @keyword_only
    def __init__(self, k=None, inputCol=None, outputCol=None):
        """
        __init__(self, k=None, inputCol=None, outputCol=None)
        """
        super(PCA, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
        self.k = Param(self, "k", "the number of principal components")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, k=None, inputCol=None, outputCol=None):
        """
        setParams(self, k=None, inputCol=None, outputCol=None)
        Sets params for this PCA.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setK(self, value):
        """
        Sets the value of :py:attr:`k`.
        """
        self._paramMap[self.k] = value
        return self

    def getK(self):
        """
        Gets the value of k or its default value.
        """
        return self.getOrDefault(self.k)

    def _create_model(self, java_model):
        return PCAModel(java_model)


class PCAModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by PCA.
    """

@inherit_doc
class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
    """
    .. note:: Experimental

    Implements the transforms required for fitting a dataset against an
    R model formula. Currently we support a limited subset of the R
    operators, including '~', '.', ':', '+', and '-'. Also see the R formula
    docs:
    http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html

    >>> df = sqlContext.createDataFrame([
    ...     (1.0, 1.0, "a"),
    ...     (0.0, 2.0, "b"),
    ...     (0.0, 0.0, "a")
    ... ], ["y", "x", "s"])
    >>> rf = RFormula(formula="y ~ x + s")
    >>> rf.fit(df).transform(df).show()
    +---+---+---+---------+-----+
    |  y|  x|  s| features|label|
    +---+---+---+---------+-----+
    |1.0|1.0|  a|[1.0,1.0]|  1.0|
    |0.0|2.0|  b|[2.0,0.0]|  0.0|
    |0.0|0.0|  a|[0.0,1.0]|  0.0|
    +---+---+---+---------+-----+
    ...
    >>> rf.fit(df, {rf.formula: "y ~ . - s"}).transform(df).show()
    +---+---+---+--------+-----+
    |  y|  x|  s|features|label|
    +---+---+---+--------+-----+
    |1.0|1.0|  a|   [1.0]|  1.0|
    |0.0|2.0|  b|   [2.0]|  0.0|
    |0.0|0.0|  a|   [0.0]|  0.0|
    +---+---+---+--------+-----+
    ...
    """

    # a placeholder to make it appear in the generated doc
    formula = Param(Params._dummy(), "formula", "R model formula")

    @keyword_only
    def __init__(self, formula=None, featuresCol="features", labelCol="label"):
        """
        __init__(self, formula=None, featuresCol="features", labelCol="label")
        """
        super(RFormula, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid)
        self.formula = Param(self, "formula", "R model formula")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, formula=None, featuresCol="features", labelCol="label"):
        """
        setParams(self, formula=None, featuresCol="features", labelCol="label")
        Sets params for RFormula.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setFormula(self, value):
        """
        Sets the value of :py:attr:`formula`.
        """
        self._paramMap[self.formula] = value
        return self

    def getFormula(self):
        """
        Gets the value of :py:attr:`formula`.
        """
        return self.getOrDefault(self.formula)

    def _create_model(self, java_model):
        return RFormulaModel(java_model)


class RFormulaModel(JavaModel):
    """
    .. note:: Experimental

    Model fitted by :py:class:`RFormula`.
    """

if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.feature tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"),
                               Row(id=2, label="c"), Row(id=3, label="a"),
                               Row(id=4, label="a"), Row(id=5, label="c")], 2)
    globs['stringIndDf'] = sqlContext.createDataFrame(testData)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)