#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
|
|
Python package for feature in MLlib.
|
|
"""
|
|
from __future__ import absolute_import
|
|
|
|
import sys
|
|
import warnings
|
|
if sys.version >= '3':
|
|
basestring = str
|
|
unicode = str
|
|
|
|
from py4j.protocol import Py4JJavaError
|
|
|
|
from pyspark import since
|
|
from pyspark.rdd import RDD, ignore_unicode_prefix
|
|
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
|
|
from pyspark.mllib.linalg import (
|
|
Vector, Vectors, DenseVector, SparseVector, _convert_to_vector)
|
|
from pyspark.mllib.regression import LabeledPoint
|
|
from pyspark.mllib.util import JavaLoader, JavaSaveable
|
|
|
|
__all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler',
|
|
'HashingTF', 'IDFModel', 'IDF', 'Word2Vec', 'Word2VecModel',
|
|
'ChiSqSelector', 'ChiSqSelectorModel', 'ElementwiseProduct']
|
|
|
|
|
|
class VectorTransformer(object):
    """
    .. note:: DeveloperApi

    Base class for the transformation of a vector or an RDD of vectors.
    """
    def transform(self, vector):
        """
        Applies transformation on a vector.

        :param vector: vector to be transformed.
        """
        raise NotImplementedError


class Normalizer(VectorTransformer):
    r"""
    Normalizes samples individually to unit L\ :sup:`p`\ norm.

    For any 1 <= `p` < float('inf'), normalizes samples using
    sum(abs(vector) :sup:`p`) :sup:`(1/p)` as the norm.

    For `p` = float('inf'), max(abs(vector)) will be used as the norm
    for normalization.

    :param p: Normalization in L^p^ space, p = 2 by default.

    >>> v = Vectors.dense(range(3))
    >>> nor = Normalizer(1)
    >>> nor.transform(v)
    DenseVector([0.0, 0.3333, 0.6667])

    >>> rdd = sc.parallelize([v])
    >>> nor.transform(rdd).collect()
    [DenseVector([0.0, 0.3333, 0.6667])]

    >>> nor2 = Normalizer(float("inf"))
    >>> nor2.transform(v)
    DenseVector([0.0, 0.5, 1.0])

    .. versionadded:: 1.2.0
    """
    def __init__(self, p=2.0):
        assert p >= 1.0, "p should be greater than or equal to 1.0"
        self.p = float(p)

    @since('1.2.0')
    def transform(self, vector):
        """
        Applies unit length normalization on a vector.

        :param vector: vector or RDD of vector to be normalized.
        :return: normalized vector. If the norm of the input is zero, it
                 will return the input vector.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)
        else:
            vector = _convert_to_vector(vector)
        return callMLlibFunc("normalizeVector", self.p, vector)


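# For reference, a minimal pure-Python sketch of the scaling that Normalizer
# delegates to the JVM via "normalizeVector": divide by the L^p norm, and
# return the input unchanged when the norm is zero. Illustrative only; not
# part of the MLlib API.
def _lp_normalize_sketch(values, p=2.0):
    if p == float("inf"):
        norm = max(abs(v) for v in values)
    else:
        norm = sum(abs(v) ** p for v in values) ** (1.0 / p)
    # A zero norm would divide by zero; mirror transform() and return the input.
    return list(values) if norm == 0 else [v / norm for v in values]

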
class JavaVectorTransformer(JavaModelWrapper, VectorTransformer):
    """
    Wrapper for the model in the JVM.
    """

    def transform(self, vector):
        """
        Applies transformation on a vector or an RDD[Vector].

        .. note:: In Python, transform cannot currently be used within
            an RDD transformation or action.
            Call transform directly on the RDD instead.

        :param vector: Vector or RDD of Vector to be transformed.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)
        else:
            vector = _convert_to_vector(vector)
        return self.call("transform", vector)


class StandardScalerModel(JavaVectorTransformer):
    """
    Represents a StandardScaler model that can transform vectors.

    .. versionadded:: 1.2.0
    """

    @since('1.2.0')
    def transform(self, vector):
        """
        Applies standardization transformation on a vector.

        .. note:: In Python, transform cannot currently be used within
            an RDD transformation or action.
            Call transform directly on the RDD instead.

        :param vector: Vector or RDD of Vector to be standardized.
        :return: Standardized vector. If the variance of a column is
                 zero, it will return default `0.0` for the column with
                 zero variance.
        """
        return JavaVectorTransformer.transform(self, vector)

    @since('1.4.0')
    def setWithMean(self, withMean):
        """
        Sets whether the model centers the data with the mean before scaling.
        """
        self.call("setWithMean", withMean)
        return self

    @since('1.4.0')
    def setWithStd(self, withStd):
        """
        Sets whether the model scales the data to unit standard deviation.
        """
        self.call("setWithStd", withStd)
        return self

    @property
    @since('2.0.0')
    def withStd(self):
        """
        Returns whether the model scales the data to unit standard deviation.
        """
        return self.call("withStd")

    @property
    @since('2.0.0')
    def withMean(self):
        """
        Returns whether the model centers the data before scaling.
        """
        return self.call("withMean")

    @property
    @since('2.0.0')
    def std(self):
        """
        Return the column standard deviation values.
        """
        return self.call("std")

    @property
    @since('2.0.0')
    def mean(self):
        """
        Return the column mean values.
        """
        return self.call("mean")


class StandardScaler(object):
    """
    Standardizes features by removing the mean and scaling to unit
    variance using column summary statistics on the samples in the
    training set.

    :param withMean: False by default. Centers the data with the mean
                     before scaling. It will build a dense output, so take
                     care when applying to sparse input.
    :param withStd: True by default. Scales the data to unit
                    standard deviation.

    >>> vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
    >>> dataset = sc.parallelize(vs)
    >>> standardizer = StandardScaler(True, True)
    >>> model = standardizer.fit(dataset)
    >>> result = model.transform(dataset)
    >>> for r in result.collect(): r
    DenseVector([-0.7071, 0.7071, -0.7071])
    DenseVector([0.7071, -0.7071, 0.7071])
    >>> int(model.std[0])
    4
    >>> int(model.mean[0]*10)
    9
    >>> model.withStd
    True
    >>> model.withMean
    True

    .. versionadded:: 1.2.0
    """
    def __init__(self, withMean=False, withStd=True):
        if not (withMean or withStd):
            warnings.warn("Both withMean and withStd are false. The model does nothing.")
        self.withMean = withMean
        self.withStd = withStd

    @since('1.2.0')
    def fit(self, dataset):
        """
        Computes the mean and variance and stores them as a model to be
        used for later scaling.

        :param dataset: The data used to compute the mean and variance
                        to build the transformation model.
        :return: a StandardScalerModel
        """
        dataset = dataset.map(_convert_to_vector)
        jmodel = callMLlibFunc("fitStandardScaler", self.withMean, self.withStd, dataset)
        return StandardScalerModel(jmodel)


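# A minimal sketch (not part of the API) of the per-column standardization a
# fitted model applies when both withMean and withStd are set:
# (x - mean) / std, with zero-variance columns mapped to 0.0 as documented in
# StandardScalerModel.transform.
def _standardize_sketch(row, means, stds):
    return [0.0 if s == 0 else (x - m) / s for x, m, s in zip(row, means, stds)]

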
class ChiSqSelectorModel(JavaVectorTransformer):
    """
    Represents a Chi Squared selector model.

    .. versionadded:: 1.4.0
    """

    @since('1.4.0')
    def transform(self, vector):
        """
        Applies transformation on a vector.

        :param vector: Vector or RDD of Vector to be transformed.
        :return: transformed vector.
        """
        return JavaVectorTransformer.transform(self, vector)


class ChiSqSelector(object):
    """
    Creates a ChiSquared feature selector.
    The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`,
    `fdr`, `fwe`.

    * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.

    * `percentile` is similar but chooses a fraction of all features
      instead of a fixed number.

    * `fpr` chooses all features whose p-values are below a threshold,
      thus controlling the false positive rate of selection.

    * `fdr` uses the `Benjamini-Hochberg procedure <https://en.wikipedia.org/wiki/
      False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_
      to choose all features whose false discovery rate is below a threshold.

    * `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by
      1/numFeatures, thus controlling the family-wise error rate of selection.

    By default, the selection method is `numTopFeatures`, with the default number of top features
    set to 50.

    >>> data = sc.parallelize([
    ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
    ...     LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),
    ...     LabeledPoint(1.0, [0.0, 9.0, 8.0]),
    ...     LabeledPoint(2.0, [7.0, 9.0, 5.0]),
    ...     LabeledPoint(2.0, [8.0, 7.0, 3.0])
    ... ])
    >>> model = ChiSqSelector(numTopFeatures=1).fit(data)
    >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
    SparseVector(1, {})
    >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
    DenseVector([7.0])
    >>> model = ChiSqSelector(selectorType="fpr", fpr=0.2).fit(data)
    >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
    SparseVector(1, {})
    >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
    DenseVector([7.0])
    >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data)
    >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
    DenseVector([7.0])

    .. versionadded:: 1.4.0
    """
    def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05,
                 fdr=0.05, fwe=0.05):
        self.numTopFeatures = numTopFeatures
        self.selectorType = selectorType
        self.percentile = percentile
        self.fpr = fpr
        self.fdr = fdr
        self.fwe = fwe

    @since('2.1.0')
    def setNumTopFeatures(self, numTopFeatures):
        """
        Sets numTopFeatures for feature selection by number of top features.
        Only applicable when selectorType = "numTopFeatures".
        """
        self.numTopFeatures = int(numTopFeatures)
        return self

    @since('2.1.0')
    def setPercentile(self, percentile):
        """
        Sets the percentile [0.0, 1.0] for feature selection by percentile.
        Only applicable when selectorType = "percentile".
        """
        self.percentile = float(percentile)
        return self

    @since('2.1.0')
    def setFpr(self, fpr):
        """
        Sets the FPR [0.0, 1.0] for feature selection by FPR.
        Only applicable when selectorType = "fpr".
        """
        self.fpr = float(fpr)
        return self

    @since('2.2.0')
    def setFdr(self, fdr):
        """
        Sets the FDR [0.0, 1.0] for feature selection by FDR.
        Only applicable when selectorType = "fdr".
        """
        self.fdr = float(fdr)
        return self

    @since('2.2.0')
    def setFwe(self, fwe):
        """
        Sets the FWE [0.0, 1.0] for feature selection by FWE.
        Only applicable when selectorType = "fwe".
        """
        self.fwe = float(fwe)
        return self

    @since('2.1.0')
    def setSelectorType(self, selectorType):
        """
        Sets the selector type of the ChiSqSelector.
        Supported options: "numTopFeatures" (default), "percentile", "fpr", "fdr", "fwe".
        """
        self.selectorType = str(selectorType)
        return self

    @since('1.4.0')
    def fit(self, data):
        """
        Returns a ChiSquared feature selector.

        :param data: an `RDD[LabeledPoint]` containing the labeled dataset
                     with categorical features. Real-valued features will be
                     treated as categorical for each distinct value.
                     Apply a feature discretizer before using this function.
        """
        jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
                               self.percentile, self.fpr, self.fdr, self.fwe, data)
        return ChiSqSelectorModel(jmodel)


class PCAModel(JavaVectorTransformer):
    """
    Model fitted by `PCA` that can project vectors to a low-dimensional space using PCA.

    .. versionadded:: 1.5.0
    """


class PCA(object):
    """
    A feature transformer that projects vectors to a low-dimensional space using PCA.

    >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
    ...     Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
    >>> model = PCA(2).fit(sc.parallelize(data))
    >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
    >>> pcArray[0]
    1.648...
    >>> pcArray[1]
    -4.013...

    .. versionadded:: 1.5.0
    """
    def __init__(self, k):
        """
        :param k: number of principal components.
        """
        self.k = int(k)

    @since('1.5.0')
    def fit(self, data):
        """
        Computes a `PCAModel` that contains the principal components of the input vectors.

        :param data: source vectors
        """
        jmodel = callMLlibFunc("fitPCA", self.k, data)
        return PCAModel(jmodel)


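# A hedged sketch (not part of the API) of the projection PCAModel.transform
# performs: each output coordinate is the dot product of the input with one
# principal component, assuming `components` is a list of k component vectors.
# The real projection is computed by the JVM model.
def _pca_project_sketch(x, components):
    return [sum(c * v for c, v in zip(component, x)) for component in components]

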
class HashingTF(object):
    """
    Maps a sequence of terms to their term frequencies using the hashing
    trick.

    .. note:: The terms must be hashable (cannot be dict/set/list...).

    :param numFeatures: number of features (default: 2^20)

    >>> htf = HashingTF(100)
    >>> doc = "a a b b c d".split(" ")
    >>> htf.transform(doc)
    SparseVector(100, {...})

    .. versionadded:: 1.2.0
    """
    def __init__(self, numFeatures=1 << 20):
        self.numFeatures = numFeatures
        self.binary = False

    @since("2.0.0")
    def setBinary(self, value):
        """
        If True, the term frequency vector will be binary such that
        non-zero term counts will be set to 1
        (default: False)
        """
        self.binary = value
        return self

    @since('1.2.0')
    def indexOf(self, term):
        """ Returns the index of the input term. """
        return hash(term) % self.numFeatures

    @since('1.2.0')
    def transform(self, document):
        """
        Transforms the input document (list of terms) to a term frequency
        vector, or transforms an RDD of documents to an RDD of term
        frequency vectors.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        freq = {}
        for term in document:
            i = self.indexOf(term)
            freq[i] = 1.0 if self.binary else freq.get(i, 0) + 1.0
        return Vectors.sparse(self.numFeatures, freq.items())


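# Because indexOf() maps terms into a fixed-size hash space, distinct terms
# can collide and have their counts summed into a single component. A small
# illustrative helper (not part of the API; assumes a non-empty term list):
def _collision_rate_sketch(terms, numFeatures=100):
    distinct = set(terms)
    buckets = set(hash(t) % numFeatures for t in distinct)
    # Fraction of distinct terms that share a bucket with another term.
    return 1.0 - float(len(buckets)) / len(distinct)

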
class IDFModel(JavaVectorTransformer):
    """
    Represents an IDF model that can transform term frequency vectors.

    .. versionadded:: 1.2.0
    """
    @since('1.2.0')
    def transform(self, x):
        """
        Transforms term frequency (TF) vectors to TF-IDF vectors.

        If `minDocFreq` was set for the IDF calculation,
        the terms which occur in fewer than `minDocFreq`
        documents will have an entry of 0.

        .. note:: In Python, transform cannot currently be used within
            an RDD transformation or action.
            Call transform directly on the RDD instead.

        :param x: an RDD of term frequency vectors or a term frequency
                  vector
        :return: an RDD of TF-IDF vectors or a TF-IDF vector
        """
        return JavaVectorTransformer.transform(self, x)

    @since('1.4.0')
    def idf(self):
        """
        Returns the current IDF vector.
        """
        return self.call('idf')

    @since('3.0.0')
    def docFreq(self):
        """
        Returns the document frequency of each term. Terms that occur in
        fewer than `minDocFreq` documents are reported with a frequency of 0.
        """
        return self.call('docFreq')

    @since('3.0.0')
    def numDocs(self):
        """
        Returns the number of documents evaluated to compute the IDF.
        """
        return self.call('numDocs')


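# The docFreq() and numDocs() accessors added in 3.0.0 make it possible to
# recompute IDF weights outside the model. A hedged sketch (not part of the
# API) of the standard formulation, idf = log((1 + m) / (1 + df)), used by
# the IDF class below:
def _idf_from_doc_freq_sketch(df, m):
    import math
    return [math.log((1.0 + m) / (1.0 + d)) for d in df]

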
class IDF(object):
    """
    Inverse document frequency (IDF).

    The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`,
    where `m` is the total number of documents and `d(t)` is the number
    of documents that contain term `t`.

    This implementation supports filtering out terms which do not appear
    in a minimum number of documents (controlled by the variable
    `minDocFreq`). For terms that are not in at least `minDocFreq`
    documents, the IDF is set to 0, resulting in TF-IDFs of 0.

    :param minDocFreq: minimum number of documents in which a term
                       should appear for filtering

    >>> n = 4
    >>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)),
    ...          Vectors.dense([0.0, 1.0, 2.0, 3.0]),
    ...          Vectors.sparse(n, [1], [1.0])]
    >>> data = sc.parallelize(freqs)
    >>> idf = IDF()
    >>> model = idf.fit(data)
    >>> tfidf = model.transform(data)
    >>> for r in tfidf.collect(): r
    SparseVector(4, {1: 0.0, 3: 0.5754})
    DenseVector([0.0, 0.0, 1.3863, 0.863])
    SparseVector(4, {1: 0.0})
    >>> model.transform(Vectors.dense([0.0, 1.0, 2.0, 3.0]))
    DenseVector([0.0, 0.0, 1.3863, 0.863])
    >>> model.transform([0.0, 1.0, 2.0, 3.0])
    DenseVector([0.0, 0.0, 1.3863, 0.863])
    >>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0)))
    SparseVector(4, {1: 0.0, 3: 0.5754})

    .. versionadded:: 1.2.0
    """
    def __init__(self, minDocFreq=0):
        self.minDocFreq = minDocFreq

    @since('1.2.0')
    def fit(self, dataset):
        """
        Computes the inverse document frequency.

        :param dataset: an RDD of term frequency vectors
        """
        if not isinstance(dataset, RDD):
            raise TypeError("dataset should be an RDD of term frequency vectors")
        jmodel = callMLlibFunc("fitIDF", self.minDocFreq, dataset.map(_convert_to_vector))
        return IDFModel(jmodel)


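# A worked check of the doctest values above, assuming the standard
# formulation with m = 3 documents: term 3 occurs in 2 documents, so
# idf = log((3 + 1) / (2 + 1)) ~= 0.2877; a term frequency of 2.0 then gives
# 2.0 * 0.2877 ~= 0.5754 and 3.0 gives ~= 0.8630. Term 1 occurs in all 3
# documents, so idf = log(4 / 4) = 0.0, which is why its TF-IDF entries are 0.

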
class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
    """
    Class for the Word2Vec model.

    .. versionadded:: 1.2.0
    """
    @since('1.2.0')
    def transform(self, word):
        """
        Transforms a word to its vector representation.

        .. note:: Local use only

        :param word: a word
        :return: vector representation of word(s)
        """
        try:
            return self.call("transform", word)
        except Py4JJavaError:
            raise ValueError("%s not found" % word)

    @since('1.2.0')
    def findSynonyms(self, word, num):
        """
        Find synonyms of a word.

        :param word: a word or a vector representation of a word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        .. note:: Local use only
        """
        if not isinstance(word, basestring):
            word = _convert_to_vector(word)
        words, similarity = self.call("findSynonyms", word, num)
        return zip(words, similarity)

    @since('1.4.0')
    def getVectors(self):
        """
        Returns a map of words to their vector representations.
        """
        return self.call("getVectors")

    @classmethod
    @since('1.5.0')
    def load(cls, sc, path):
        """
        Load a model from the given path.
        """
        jmodel = sc._jvm.org.apache.spark.mllib.feature \
            .Word2VecModel.load(sc._jsc.sc(), path)
        model = sc._jvm.org.apache.spark.mllib.api.python.Word2VecModelWrapper(jmodel)
        return Word2VecModel(model)


@ignore_unicode_prefix
class Word2Vec(object):
    """Word2Vec creates vector representations of words in a text corpus.

    The algorithm first constructs a vocabulary from the corpus
    and then learns vector representations of the words in the vocabulary.
    The vector representations can be used as features in
    natural language processing and machine learning algorithms.

    We use the skip-gram model in our implementation and hierarchical
    softmax to train the model. The variable names in the
    implementation match the original C implementation.

    For the original C implementation,
    see https://code.google.com/p/word2vec/
    For the research papers, see
    Efficient Estimation of Word Representations in Vector Space
    and Distributed Representations of Words and Phrases and their
    Compositionality.

    >>> sentence = "a b " * 100 + "a c " * 10
    >>> localDoc = [sentence, sentence]
    >>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" "))
    >>> model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc)

    Querying for synonyms of a word will not return that word:

    >>> syms = model.findSynonyms("a", 2)
    >>> [s[0] for s in syms]
    [u'b', u'c']

    But querying for synonyms of a vector may return the word whose
    representation is that vector:

    >>> vec = model.transform("a")
    >>> syms = model.findSynonyms(vec, 2)
    >>> [s[0] for s in syms]
    [u'a', u'b']

    >>> import os, tempfile
    >>> path = tempfile.mkdtemp()
    >>> model.save(sc, path)
    >>> sameModel = Word2VecModel.load(sc, path)
    >>> model.transform("a") == sameModel.transform("a")
    True
    >>> syms = sameModel.findSynonyms("a", 2)
    >>> [s[0] for s in syms]
    [u'b', u'c']
    >>> from shutil import rmtree
    >>> try:
    ...     rmtree(path)
    ... except OSError:
    ...     pass

    .. versionadded:: 1.2.0
    """
    def __init__(self):
        """
        Construct a Word2Vec instance.
        """
        self.vectorSize = 100
        self.learningRate = 0.025
        self.numPartitions = 1
        self.numIterations = 1
        self.seed = None
        self.minCount = 5
        self.windowSize = 5

    @since('1.2.0')
    def setVectorSize(self, vectorSize):
        """
        Sets vector size (default: 100).
        """
        self.vectorSize = vectorSize
        return self

    @since('1.2.0')
    def setLearningRate(self, learningRate):
        """
        Sets initial learning rate (default: 0.025).
        """
        self.learningRate = learningRate
        return self

    @since('1.2.0')
    def setNumPartitions(self, numPartitions):
        """
        Sets number of partitions (default: 1). Use a small number for
        accuracy.
        """
        self.numPartitions = numPartitions
        return self

    @since('1.2.0')
    def setNumIterations(self, numIterations):
        """
        Sets number of iterations (default: 1), which should be smaller
        than or equal to the number of partitions.
        """
        self.numIterations = numIterations
        return self

    @since('1.2.0')
    def setSeed(self, seed):
        """
        Sets random seed.
        """
        self.seed = seed
        return self

    @since('1.4.0')
    def setMinCount(self, minCount):
        """
        Sets minCount, the minimum number of times a token must appear
        to be included in the word2vec model's vocabulary (default: 5).
        """
        self.minCount = minCount
        return self

    @since('2.0.0')
    def setWindowSize(self, windowSize):
        """
        Sets window size (default: 5).
        """
        self.windowSize = windowSize
        return self

    @since('1.2.0')
    def fit(self, data):
        """
        Computes the vector representation of each word in the vocabulary.

        :param data: training data. RDD of list of string
        :return: Word2VecModel instance
        """
        if not isinstance(data, RDD):
            raise TypeError("data should be an RDD of list of string")
        jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize),
                               float(self.learningRate), int(self.numPartitions),
                               int(self.numIterations), self.seed,
                               int(self.minCount), int(self.windowSize))
        return Word2VecModel(jmodel)


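# Word2VecModel.findSynonyms() above ranks candidate words by cosine
# similarity in the JVM; a hedged pure-Python sketch of that measure,
# for reference only:
def _cosine_similarity_sketch(u, v):
    import math
    dot = sum(a * b for a, b in zip(u, v))
    norms = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return 0.0 if norms == 0 else dot / norms

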
class ElementwiseProduct(VectorTransformer):
    """
    Scales each element of the vector by the corresponding element of a
    supplied weight vector, i.e. the elementwise (Hadamard) product.

    >>> weight = Vectors.dense([1.0, 2.0, 3.0])
    >>> eprod = ElementwiseProduct(weight)
    >>> a = Vectors.dense([2.0, 1.0, 3.0])
    >>> eprod.transform(a)
    DenseVector([2.0, 2.0, 9.0])
    >>> b = Vectors.dense([9.0, 3.0, 4.0])
    >>> rdd = sc.parallelize([a, b])
    >>> eprod.transform(rdd).collect()
    [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])]

    .. versionadded:: 1.5.0
    """
    def __init__(self, scalingVector):
        self.scalingVector = _convert_to_vector(scalingVector)

    @since('1.5.0')
    def transform(self, vector):
        """
        Computes the Hadamard product of the vector.
        """
        if isinstance(vector, RDD):
            vector = vector.map(_convert_to_vector)
        else:
            vector = _convert_to_vector(vector)
        return callMLlibFunc("elementwiseProductVector", self.scalingVector, vector)


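# The Hadamard product above is plain elementwise multiplication; a one-line
# sketch (illustrative only) of what the JVM call computes:
def _hadamard_sketch(weights, values):
    return [w * v for w, v in zip(weights, values)]

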
def _test():
    import doctest
    from pyspark.sql import SparkSession
    globs = globals().copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("mllib.feature tests")\
        .getOrCreate()
    globs['sc'] = spark.sparkContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    sys.path.pop(0)
    _test()