+
+
+
+[`NGram`](api/scala/index.html#org.apache.spark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default).
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.NGram
+
+val wordDataFrame = sqlContext.createDataFrame(Seq(
+ (0, Array("Hi", "I", "heard", "about", "Spark")),
+ (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
+ (2, Array("Logistic", "regression", "models", "are", "neat"))
+)).toDF("label", "words")
+
+val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+val ngramDataFrame = ngram.transform(wordDataFrame)
+ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
+{% endhighlight %}
+
+
+
+
+[`NGram`](api/java/org/apache/spark/ml/feature/NGram.html) takes an input column name, an output column name, and an optional length parameter n (n=2 by default).
+
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.NGram;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD jrdd = jsc.parallelize(Lists.newArrayList(
+ RowFactory.create(0D, Lists.newArrayList("Hi", "I", "heard", "about", "Spark")),
+ RowFactory.create(1D, Lists.newArrayList("I", "wish", "Java", "could", "use", "case", "classes")),
+ RowFactory.create(2D, Lists.newArrayList("Logistic", "regression", "models", "are", "neat"))
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+});
+DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
+for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
+ java.util.List ngrams = r.getList(0);
+ for (String ngram : ngrams) System.out.print(ngram + " --- ");
+ System.out.println();
+}
+{% endhighlight %}
+
+
+
+
+[`NGram`](api/python/pyspark.ml.html#pyspark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default).
+
+{% highlight python %}
+from pyspark.ml.feature import NGram
+
+wordDataFrame = sqlContext.createDataFrame([
+ (0, ["Hi", "I", "heard", "about", "Spark"]),
+ (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
+ (2, ["Logistic", "regression", "models", "are", "neat"])
+], ["label", "words"])
+ngram = NGram(inputCol="words", outputCol="ngrams")
+ngramDataFrame = ngram.transform(wordDataFrame)
+for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
+ print(ngrams_label)
+{% endhighlight %}
+
+
+
+
## Binarizer
Binarization is the process of thresholding numerical features to binary features. As some probabilistic estimators make assumption that the input data is distributed according to [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution), a binarizer is useful for pre-processing the input data with continuous numerical features.