[SPARK-14238][ML][MLLIB][PYSPARK] Add binary toggle Param to PySpark HashingTF in ML & MLlib

## What changes were proposed in this pull request?

This change adds a `binary` toggle Param to PySpark's HashingTF in both ML and MLlib. When the toggle is set, all non-zero term counts are set to 1.

Note: This change (SPARK-14238) extends SPARK-13963, where the Scala implementation was done.

## How was this patch tested?

This change adds two tests to cover the code changes: one for HashingTF in PySpark's ML and one for HashingTF in PySpark's MLlib.

Author: Yong Tang <yong.tang.github@outlook.com>

Closes #12079 from yongtang/SPARK-14238.
This commit is contained in:
Yong Tang 2016-04-14 21:53:32 +02:00 committed by Nick Pentreath
parent bf65c87f70
commit bc748b7b8f
4 changed files with 69 additions and 3 deletions

View file

@ -536,14 +536,19 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
.. versionadded:: 1.3.0
"""
# Param controlling binary term frequencies: when True, every non-zero term
# count is emitted as 1 rather than the raw count. Values are converted with
# TypeConverters.toBoolean.
binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " +
"This is useful for discrete probabilistic models that model binary events " +
"rather than integer counts. Default False.",
typeConverter=TypeConverters.toBoolean)
@keyword_only
def __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
"""
__init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None)
"""
super(HashingTF, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
self._setDefault(numFeatures=1 << 18)
self._setDefault(numFeatures=1 << 18, binary=False)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@ -557,6 +562,21 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)
@since("2.0.0")
def setBinary(self, value):
    """
    Sets the value of :py:attr:`binary`.

    The value is stored through ``_set`` -- the same path ``setParams``
    uses -- instead of writing to ``_paramMap`` directly, so that any
    handling ``_set`` performs (presumably applying the Param's
    ``typeConverter``; confirm against ``Params._set``) also applies here.

    :param value: If True, all non-zero term counts are set to 1.
    :return: this instance, to allow call chaining.
    """
    self._set(binary=value)
    return self
@since("2.0.0")
def getBinary(self):
    """
    Returns the current setting of :py:attr:`binary`, falling back to its
    default when no value has been set explicitly.
    """
    binary_param = self.binary
    return self.getOrDefault(binary_param)
@inherit_doc
class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):

View file

@ -847,6 +847,25 @@ class TrainingSummaryTest(PySparkTestCase):
self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
class HashingTFTest(PySparkTestCase):
    """Exercises the ``binary`` toggle of the ML HashingTF transformer."""

    def test_apply_binary_term_freqs(self):
        """
        With binary=True, a document containing repeated terms must produce
        1.0 at each term's slot instead of the term's count.
        """
        sqlContext = SQLContext(self.sc)
        rows = [(0, ["a", "a", "b", "c", "c", "c"])]
        df = sqlContext.createDataFrame(rows, ["id", "words"])
        n = 100
        hashingTF = HashingTF()
        (hashingTF.setInputCol("words")
            .setOutputCol("features")
            .setNumFeatures(n)
            .setBinary(True))
        output = hashingTF.transform(df)
        first_row = output.select("features").first()
        features = first_row.features.toArray()
        # The test expects each single-character term to land at ord(term) % n.
        slots = {}
        for term in ("a", "b", "c"):
            slots[ord(term) % n] = 1.0
        expected = Vectors.sparse(n, slots).toArray()
        for idx in range(n):
            actual = features[idx]
            wanted = expected[idx]
            message = "Error at " + str(idx) + ": expected " + str(wanted) + ", got " + str(actual)
            self.assertAlmostEqual(actual, wanted, 14, message)
if __name__ == "__main__":
from pyspark.ml.tests import *
if xmlrunner:

View file

@ -379,6 +379,17 @@ class HashingTF(object):
"""
def __init__(self, numFeatures=1 << 20):
# Size of the term-frequency vectors built by this instance.
self.numFeatures = numFeatures
# Binary mode starts disabled; setBinary() changes it.
self.binary = False
@since("2.0.0")
def setBinary(self, value):
"""
Enable or disable binary term frequencies.

If True, term frequency vector will be binary such that non-zero
term counts will be set to 1
(default: False)

Stores `value` on this instance and returns the instance itself so
that calls can be chained.
"""
self.binary = value
return self
@since('1.2.0')
def indexOf(self, term):
@ -398,7 +409,7 @@ class HashingTF(object):
freq = {}
for term in document:
i = self.indexOf(term)
freq[i] = freq.get(i, 0) + 1.0
freq[i] = 1.0 if self.binary else freq.get(i, 0) + 1.0
return Vectors.sparse(self.numFeatures, freq.items())

View file

@ -58,6 +58,7 @@ from pyspark.mllib.recommendation import Rating
from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.stat import Statistics
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import StandardScaler, ElementwiseProduct
@ -1583,6 +1584,21 @@ class ALSTests(MLlibTestCase):
self.assertRaises(Py4JJavaError, self.sc._jvm.SerDe.loads, bytearray(ser.dumps(r)))
class HashingTFTest(MLlibTestCase):
    """Exercises the ``binary`` toggle of the MLlib HashingTF."""

    def test_binary_term_freqs(self):
        """
        With binary mode on, every term present in the document must map to
        1.0 regardless of how many times it occurs.
        """
        hashingTF = HashingTF(100).setBinary(True)
        doc = ["a", "a", "b", "c", "c", "c"]
        n = hashingTF.numFeatures
        output = hashingTF.transform(doc).toArray()
        # One entry per distinct term, located wherever the hasher places it.
        slots = {}
        for term in ("a", "b", "c"):
            slots[hashingTF.indexOf(term)] = 1.0
        expected = Vectors.sparse(n, slots).toArray()
        for idx in range(n):
            actual = output[idx]
            wanted = expected[idx]
            message = "Error at " + str(idx) + ": expected " + str(wanted) + ", got " + str(actual)
            self.assertAlmostEqual(actual, wanted, 14, message)
if __name__ == "__main__":
from pyspark.mllib.tests import *
if not _have_scipy: