[SPARK-23615][ML][PYSPARK] Add maxDF Parameter to Python CountVectorizer

## What changes were proposed in this pull request?

The maxDF parameter is for filtering out frequently occurring terms. This param was recently added to the Scala CountVectorizer and needs to be added to Python also.

## How was this patch tested?

add test

Author: Huaxin Gao <huaxing@us.ibm.com>

Closes #20777 from huaxingao/spark-23615.
This commit is contained in:
Huaxin Gao 2018-03-23 15:58:48 -07:00 committed by Bryan Cutler
parent 95c03cbd27
commit a33655348c
3 changed files with 67 additions and 18 deletions

View file

@ -70,19 +70,21 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
def getMinDF: Double = $(minDF)
/**
* Specifies the maximum number of different documents a term must appear in to be included
* in the vocabulary.
* If this is an integer greater than or equal to 1, this specifies the number of documents
* the term must appear in; if this is a double in [0,1), then this specifies the fraction of
* documents.
* Specifies the maximum number of different documents a term could appear in to be included
* in the vocabulary. A term that appears more than the threshold will be ignored. If this is an
* integer greater than or equal to 1, this specifies the maximum number of documents the term
* could appear in; if this is a double in [0,1), then this specifies the maximum fraction of
* documents the term could appear in.
*
* Default: (2^64^) - 1
* Default: (2^63^) - 1
* @group param
*/
val maxDF: DoubleParam = new DoubleParam(this, "maxDF", "Specifies the maximum number of" +
" different documents a term must appear in to be included in the vocabulary." +
" If this is an integer >= 1, this specifies the number of documents the term must" +
" appear in; if this is a double in [0,1), then this specifies the fraction of documents.",
" different documents a term could appear in to be included in the vocabulary." +
" A term that appears more than the threshold will be ignored. If this is an integer >= 1," +
" this specifies the maximum number of documents the term could appear in;" +
" if this is a double in [0,1), then this specifies the maximum fraction of" +
" documents the term could appear in.",
ParamValidators.gtEq(0.0))
/** @group getParam */

View file

@ -422,6 +422,14 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
" If this is an integer >= 1, this specifies the number of documents the term must" +
" appear in; if this is a double in [0,1), then this specifies the fraction of documents." +
" Default 1.0", typeConverter=TypeConverters.toFloat)
# Upper document-frequency bound, mirroring the Scala CountVectorizerParams.maxDF
# param. Values >= 1 are an absolute document count; values in [0, 1) are a
# fraction of the corpus. Default (2^63) - 1 effectively disables the filter.
maxDF = Param(
Params._dummy(), "maxDF", "Specifies the maximum number of" +
" different documents a term could appear in to be included in the vocabulary." +
" A term that appears more than the threshold will be ignored. If this is an" +
" integer >= 1, this specifies the maximum number of documents the term could appear in;" +
" if this is a double in [0,1), then this specifies the maximum" +
" fraction of documents the term could appear in." +
" Default (2^63) - 1", typeConverter=TypeConverters.toFloat)
vocabSize = Param(
Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.",
typeConverter=TypeConverters.toInt)
@ -433,7 +441,7 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
def __init__(self, *args):
super(_CountVectorizerParams, self).__init__(*args)
self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False)
self._setDefault(minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False)
@since("1.6.0")
def getMinTF(self):
@ -449,6 +457,13 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
"""
return self.getOrDefault(self.minDF)
@since("2.4.0")
def getMaxDF(self):
"""
Gets the value of maxDF or its default value ((2^63) - 1 when unset,
i.e. no upper document-frequency filtering).
"""
return self.getOrDefault(self.maxDF)
@since("1.6.0")
def getVocabSize(self):
"""
@ -513,11 +528,11 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav
"""
@keyword_only
def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,
outputCol=None):
def __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,
inputCol=None, outputCol=None):
"""
__init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,\
outputCol=None)
__init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,\
inputCol=None, outputCol=None)
"""
super(CountVectorizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
@ -527,11 +542,11 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav
@keyword_only
@since("1.6.0")
def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,
outputCol=None):
def setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,
inputCol=None, outputCol=None):
"""
setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,\
outputCol=None)
setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,\
inputCol=None, outputCol=None)
Set the params for the CountVectorizer
"""
kwargs = self._input_kwargs
@ -551,6 +566,13 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav
"""
return self._set(minDF=value)
@since("2.4.0")
def setMaxDF(self, value):
"""
Sets the value of :py:attr:`maxDF`. A value >= 1 is treated as an
absolute document count; a value in [0, 1) as a fraction of documents.
"""
return self._set(maxDF=value)
@since("1.6.0")
def setVocabSize(self, value):
"""

View file

@ -697,6 +697,31 @@ class FeatureTests(SparkSessionTestCase):
feature, expected = r
self.assertEqual(feature, expected)
def test_count_vectorizer_with_maxDF(self):
# Document frequencies in this corpus: 'a' -> 4 docs, 'b' -> 3, 'c' -> 2, 'd' -> 1.
dataset = self.spark.createDataFrame([
(0, "a b c d".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
(1, "a b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
(2, "a b".split(' '), SparseVector(3, {0: 1.0}),),
(3, "a".split(' '), SparseVector(3, {}),)], ["id", "words", "expected"])
cv = CountVectorizer(inputCol="words", outputCol="features")
# Integer maxDF: terms in more than 3 documents ('a') are excluded.
model1 = cv.setMaxDF(3).fit(dataset)
self.assertEqual(model1.vocabulary, ['b', 'c', 'd'])
transformedList1 = model1.transform(dataset).select("features", "expected").collect()
for r in transformedList1:
feature, expected = r
self.assertEqual(feature, expected)
# Fractional maxDF: 0.75 of 4 documents = 3, so the same term 'a' is excluded.
model2 = cv.setMaxDF(0.75).fit(dataset)
self.assertEqual(model2.vocabulary, ['b', 'c', 'd'])
transformedList2 = model2.transform(dataset).select("features", "expected").collect()
for r in transformedList2:
feature, expected = r
self.assertEqual(feature, expected)
def test_count_vectorizer_from_vocab(self):
model = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words",
outputCol="features", minTF=2)